add leaderboard tag to increase visibility

#4
by mrfakename - opened
.gitattributes CHANGED
@@ -25,6 +25,7 @@
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -32,4 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
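For context on the `.gitattributes` change: the pre-existing `*.tar.*` rule only matches tarballs with a further suffix (such as `.tar.gz`), so bare `.tar` files were not routed through LFS until the `*.tar` rule was added. A quick sanity check, using `fnmatch` as a stand-in for git's glob matching (the filenames are made up):

```python
# Sketch: which LFS patterns from the hunk above match a given filename.
# fnmatch approximates git's glob semantics closely enough for these patterns.
from fnmatch import fnmatch

lfs_patterns = ["*.safetensors", "*.tar.*", "*.tar", "*.tflite", "*.tgz", "*.wasm"]

for name in ["model.tar", "model.tar.gz", "weights.safetensors"]:
    hits = [p for p in lfs_patterns if fnmatch(name, p)]
    print(name, "->", hits)
# "model.tar" is caught only by the new "*.tar" rule; "*.tar.*" requires
# a trailing suffix such as ".gz".
```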
.gitignore CHANGED
@@ -1,22 +1,3 @@
-auto_evals/
-venv/
-__pycache__/
-.env
-.ipynb_checkpoints
-*ipynb
-.vscode/
-
-eval-queue/
-eval-results/
-eval-queue-bk/
-eval-results-bk/
-logs/
 evals/
-
-.gradio/
-.evals/
 __pycache__/*
-*.pyc
-
-# saved data automatically
-leaderboard/current-rbv2-data.csv
+*.pyc
Makefile DELETED
@@ -1,13 +0,0 @@
-.PHONY: style format
-
-
-style:
-	python -m black --line-length 119 .
-	python -m isort .
-	ruff check --fix .
-
-
-quality:
-	python -m black --check --line-length 119 .
-	python -m isort --check-only .
-	ruff check .
README.md CHANGED
@@ -4,10 +4,19 @@ emoji: 📐
 colorFrom: pink
 colorTo: blue
 sdk: gradio
-sdk_version: 4.36.0 # for colors + sorting, newer cannot
+sdk_version: 4.12.0
 app_file: app.py
 pinned: true
 license: apache-2.0
 tags:
-- leaderboard
+- leaderboard
 ---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+To develop this app, it can be run with:
+```
+gradio app.py
+```
+
+Paper: https://arxiv.org/abs/2403.13787
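As an aside, the same tag change can also be made programmatically; a minimal sketch with `huggingface_hub` (this PR simply edits the README front matter instead, and the exact merge behavior of `metadata_update` for existing keys is an assumption worth checking):

```python
# Sketch: add the "leaderboard" tag to a Space's metadata via the Hub API.
from huggingface_hub import metadata_update

metadata_update(
    "allenai/reward-bench",    # the Space this PR targets
    {"tags": ["leaderboard"]},
    repo_type="space",         # Spaces, not models/datasets
)
```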
app.py CHANGED
@@ -1,65 +1,37 @@
-import os
-from pathlib import Path
-
 import gradio as gr
-import numpy as np
-import pandas as pd
-from datasets import load_dataset
+import os
 from huggingface_hub import HfApi, snapshot_download
+from apscheduler.schedulers.background import BackgroundScheduler
+from datasets import load_dataset
+from src.utils import load_all_data
+from src.md import ABOUT_TEXT, TOP_TEXT
+from src.plt import plot_avg_correlation
+from src.constants import subset_mapping, length_categories, example_counts
+from src.css import custom_css
+import numpy as np
 
-from leaderboard.constants import example_counts, subset_mapping
-from leaderboard.css import custom_css
-from leaderboard.md import *
-from leaderboard.utils import load_all_data
-
-#######################################################
-# Setup #
-#######################################################
 api = HfApi()
 
 COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
-evals_repo = "allenai/reward-bench-2-results"
-
-eval_set_repo = "allenai/reward-bench-2"
-eval_set_repo_v1 = "allenai/reward-bench"
+evals_repo = "allenai/reward-bench-results"
 
+eval_set_repo = "allenai/reward-bench"
 repo_dir_rewardbench = "./evals/rewardbench/"
 
+def restart_space():
+    api.restart_space(repo_id="allenai/reward-bench", token=COLLAB_TOKEN)
+
 print("Pulling evaluation results")
 repo = snapshot_download(
     local_dir=repo_dir_rewardbench,
-    ignore_patterns=["pref-sets-scores/*", "eval-set-scores/*", "eval-set/allenai/open_instruct_dev*"],
+    ignore_patterns=["pref-sets-scores/*", "eval-set-scores/*"],
     repo_id=evals_repo,
     use_auth_token=COLLAB_TOKEN,
-    tqdm_class=None,
+    tqdm_class=None,
     etag_timeout=30,
     repo_type="dataset",
 )
 
-###########################################
-# Load Data #
-###########################################
-
-
-def avg_over_rewardbench_v2(dataframe_core):
-    domain_cols = ["Factuality", "Precise IF", "Math", "Safety", "Focus", "Ties"]
-    domain_weights = [1, 1, 1, 1, 1, 1]
-    new_df = dataframe_core.copy()
-
-    # for main subsets, keys in subset_mapping, take the weighted avg by example_counts and store for the models
-    # Get the domain data and handle missing values
-    domain_data = new_df[domain_cols].values
-    masked_data = np.ma.masked_array(domain_data, np.isnan(domain_data))
-
-    # Calculate weighted average
-    average = np.ma.average(masked_data, axis=1, weights=domain_weights)
-    new_df["average"] = average.filled(np.nan)
-
-    # Rearrange columns for consistent output
-    keep_columns = ["model", "model_type", "average"] + domain_cols
-    new_df = new_df[keep_columns]
-    return new_df
-
 
 def avg_over_rewardbench(dataframe_core, dataframe_prefs):
     """
@@ -78,19 +50,13 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
     # for main subsets, keys in subset_mapping, take the weighted avg by example_counts and store for the models
     for subset, sub_subsets in subset_mapping.items():
         subset_cols = [col for col in new_df.columns if col in sub_subsets]
-        sub_data = new_df[subset_cols].values  # take the relevant column values
-        sub_counts = [example_counts[s] for s in subset_cols]  # take the example counts
-        new_df[subset] = np.average(sub_data, axis=1, weights=sub_counts)  # take the weighted average
+        sub_data = new_df[subset_cols].values # take the relevant column values
+        sub_counts = [example_counts[s] for s in subset_cols] # take the example counts
+        new_df[subset] = np.average(sub_data, axis=1, weights=sub_counts) # take the weighted average
         # new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)
 
     data_cols = list(subset_mapping.keys())
-    keep_columns = (
-        [
-            "model",
-        ]
-        + ["model_type"]
-        + data_cols
-    )
+    keep_columns = ["model",] + ["model_type"] + data_cols
     # keep_columns = ["model", "average"] + subsets
     new_df = new_df[keep_columns]
 
@@ -112,7 +78,7 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
             # new_df.at[i, "Prior Sets (0.5 weight)"] = dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets (0.5 weight)"].values[0]
         else:
             values.append(np.nan)
-
+
     new_df["Prior Sets (0.5 weight)"] = values
 
     # add total average
@@ -129,90 +95,83 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
     new_df = new_df[keep_columns]
     return new_df
 
+def expand_subsets(dataframe):
+    # TODO need to modify data/ script to do this
+    pass
 
-def prep_df(df):
-    # add column to 0th entry with count (column name itself empty)
-    df.insert(0, "", range(1, 1 + len(df)))
-
-    # replace "model" with "Model" and "model_type" with "Model Type" and "average" with "Average"
-    df = df.rename(columns={"model": "Model", "model_type": "Model Type", "average": "Average"})
-
-    # if "Model Type" in columns
-    if "Model Type" in df.columns:
-        # get model_types that have generative in them
-        mask = df["Model Type"].str.contains("generative", case=False, na=False)
-
-        # set these values to "Generative"
-        df.loc[mask, "Model Type"] = "Generative"
-
-    return df
-
-
-# get v1 data
-orig_data_path = "leaderboard/final-rbv1-data.csv"
-rb_orig_snapshot = pd.read_csv(orig_data_path)
-# rename column "Unnamed: 0" to ""
-rb_orig_snapshot = rb_orig_snapshot.rename(columns={"Unnamed: 0": ""})
-# rb_orig_snapshot = rb_orig_snapshot.drop(columns=["Unnamed: 0", ''])
-rb_orig_snapshot.reset_index(drop=True, inplace=True)
-
-rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by="average", ascending=False)
-rewardbench_data_avg_intermediate = avg_over_rewardbench_v2(rewardbench_data.copy())
-
-# Prepare RBv1 scores for merging
-rb_v1_scores_to_merge = rb_orig_snapshot[["Model", "Score"]].copy()
-
-# if " ⚠️" in rb_v1_scores_to_merge["Model"].values, shorten the model name without it
-rb_v1_scores_to_merge["Model"] = rb_v1_scores_to_merge["Model"].str.replace(" ⚠️", "", regex=False)
-
-rb_v1_scores_to_merge.rename(columns={"Score": "RBv1"}, inplace=True)
-# rename rb_v1 "Model" to "model"
-rb_v1_scores_to_merge.rename(columns={"Model": "model"}, inplace=True)
-
-# Merge RBv1 scores into the v2 data
-rewardbench_data_avg = pd.merge(rewardbench_data_avg_intermediate, rb_v1_scores_to_merge, on="model", how="left")
-
-# Drop any models with only RBv1 scores and no v2 scores
-rewardbench_data_avg = rewardbench_data_avg.dropna(subset=["average"])
-
-# Sort by the v2 average
-rewardbench_data_avg = rewardbench_data_avg.sort_values(by="average", ascending=False)
-
+def length_bias_check(dataframe):
+    """
+    Takes the raw rewardbench dataframe and splits the data into new buckets according to length_categories.
+    Then, take the average of the three buckets as "average"
+    """
+    new_df = dataframe.copy()
+    existing_subsets = new_df.columns[3:] # model, model_type, average
+    final_subsets = ["Length Bias", "Neutral", "Terse Bias"]
+    # new data is empty list dict for each final subset
+    new_data = {s: [] for s in final_subsets}
+
+    # now, subsets correspond to those with True, Nuetral, and False length bias
+    # check if length_categories[subset] == "True" or "False" or "Neutral"
+    for subset in existing_subsets:
+        subset_data = new_df[subset].values
+        subset_length = length_categories[subset]
+        # route to the correct bucket
+        if subset_length == "True":
+            new_data["Length Bias"].append(subset_data)
+        elif subset_length == "Neutral":
+            new_data["Neutral"].append(subset_data)
+        elif subset_length == "False":
+            new_data["Terse Bias"].append(subset_data)
+
+    # take average of new_data and add to new_df (removing other columns than model)
+    for subset in final_subsets:
+        new_df[subset] = np.nanmean(new_data[subset], axis=0)
+    keep_columns = ["model"] + final_subsets
+    new_df = new_df[keep_columns]
+    # recompute average
+    # new_df["average"] = np.round(np.nanmean(new_df[final_subsets].values, axis=1), 2)
+
+    return new_df
+
+
+rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by='average', ascending=False)
+rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by='Terse Bias', ascending=False)
+prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by='average', ascending=False)
+# prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
+
+rewardbench_data_avg = avg_over_rewardbench(rewardbench_data, prefs_data).sort_values(by='average', ascending=False)
+
+def prep_df(df):
+    # add column to 0th entry with count (column name itself empty)
+    df.insert(0, '', range(1, 1 + len(df)))
+
+    # replace "model" with "Model" and "model_type" with "Model Type" and "average" with "Average"
+    df = df.rename(columns={"model": "Model", "model_type": "Model Type", "average": "Average"})
+    return df
 
 # add count column to all dataframes
 rewardbench_data = prep_df(rewardbench_data)
 rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
+# adjust weight of this average to 50% for Prior Sets (0.5 weight), 1 for others
 
-# Ensure RBv1 is the last column if it's not already (merge usually places it at the end of non-key columns)
-# If 'RBv1' is present and not last, move it to be the last column.
-if "RBv1" in rewardbench_data_avg.columns:
-    rbv1_col = rewardbench_data_avg.pop("RBv1")
-    rewardbench_data_avg["RBv1"] = rbv1_col
+rewardbench_data_length = prep_df(rewardbench_data_length)
+prefs_data = prep_df(prefs_data)
 
-# save rewardbench_data_avg as csv to src/current-rbv2-data.csv
-v2_data_path = "leaderboard/current-rbv2-data.csv"
-rewardbench_data_avg.to_csv(v2_data_path, index=False)
-
-col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
-col_types_rewardbench_v1 = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rb_orig_snapshot.columns) - 1)
-
-# import ipdb; ipdb.set_trace()
-
-###########################################
-# Leaderboard Helpers & Setting #
-###########################################
+col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
+col_types_rewardbench_avg = ["number"] + ["markdown"]+ ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
+cols_rewardbench_data_length = ["markdown"] + ["number"] * (len(rewardbench_data_length.columns) - 1)
+col_types_prefs = ["number"] + ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
+# col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)
 
 # for showing random samples
-eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="test")
-eval_set_v1 = load_dataset(eval_set_repo_v1, use_auth_token=COLLAB_TOKEN, split="filtered")
-subsets = eval_set.unique("subset")
-subsets_v1 = eval_set_v1.unique("subset")
-
+eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")
 def random_sample(r: gr.Request, subset):
     if subset is None or subset == []:
         sample_index = np.random.randint(0, len(eval_set) - 1)
         sample = eval_set[sample_index]
-    else:  # filter by subsets (can be list)
+    else: # filter by subsets (can be list)
         if isinstance(subset, str):
             subset = [subset]
         # filter down dataset to only include the subset(s)
@@ -220,82 +179,24 @@ def random_sample(r: gr.Request, subset):
         sample_index = np.random.randint(0, len(eval_set_filtered) - 1)
         sample = eval_set_filtered[sample_index]
 
-    markdown_text = "\n\n".join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
+    markdown_text = '\n\n'.join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
     return markdown_text
 
+subsets = eval_set.unique("subset")
 
-# Duplicating because they use global variables with gradio setup
-def random_sample_v1(r: gr.Request, subset):
-    if subset is None or subset == []:
-        sample_index = np.random.randint(0, len(eval_set) - 1)
-        sample = eval_set[sample_index]
-    else:  # filter by subsets (can be list)
-        if isinstance(subset, str):
-            subset = [subset]
-        # filter down dataset to only include the subset(s)
-        eval_set_filtered = eval_set.filter(lambda x: x["subset"] in subset)
-        sample_index = np.random.randint(0, len(eval_set_filtered) - 1)
-        sample = eval_set_filtered[sample_index]
-
-    markdown_text = "\n\n".join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
-    return markdown_text
-
-
-color_map = {
-    "Generative": "#7497db",
-    "Custom Classifier": "#E8ECF2",
-    "Seq. Classifier": "#ffcd75",
-    "DPO": "#75809c",
-}
-
-
-def color_model_type_column(df, color_map):
-    """
-    Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
-
-    Parameters:
-    df (pd.DataFrame): The DataFrame containing the 'Model Type' column.
-    color_map (dict): A dictionary mapping model types to colors.
-
-    Returns:
-    pd.Styler: The styled DataFrame.
-    """
-
-    # Function to apply color based on the model type
-    def apply_color(val):
-        color = color_map.get(val, "default")  # Default color if not specified in color_map
-        return f"background-color: {color}"
-
-    # Format for different columns
-    format_dict = {col: "{:.1f}" for col in df.columns if col not in ["Average", "Model", "Model Type"]}
-    format_dict["Average"] = "{:.2f}"
-    format_dict[""] = "{:d}"
-
-    return df.style.applymap(apply_color, subset=["Model Type"]).format(format_dict, na_rep="")
-
-
-def regex_table(dataframe, regex, filter_button, style=True):
+def regex_table(dataframe, regex, filter_button):
     """
     Takes a model name as a regex, then returns only the rows that has that in it.
     """
     # Split regex statement by comma and trim whitespace around regexes
     regex_list = [x.strip() for x in regex.split(",")]
     # Join the list into a single regex pattern with '|' acting as OR
-    combined_regex = "|".join(regex_list)
-
-    # remove internal ai2 data
-    dataframe = dataframe[~dataframe["Model"].str.contains("ai2", case=False, na=False)]
+    combined_regex = '|'.join(regex_list)
 
     # if filter_button, remove all rows with "ai2" in the model name
-    update_scores = False
     if isinstance(filter_button, list) or isinstance(filter_button, str):
-        if "Prior Sets" not in filter_button and "Prior Sets (0.5 weight)" in dataframe.columns:
-            update_scores = True
-            # remove the column "Prior Sets (0.5 weight)" from the outputted table
-            dataframe = dataframe.drop(columns=["Prior Sets (0.5 weight)"])
-        if "RBv1" not in filter_button and "RBv1" in dataframe.columns:
-            # remove the column "Prior Sets (0.5 weight)" from the outputted table
-            dataframe = dataframe.drop(columns=["RBv1"])
+        if "AI2 Experiments" not in filter_button and ("ai2" not in regex):
+            dataframe = dataframe[~dataframe["Model"].str.contains("ai2", case=False, na=False)]
         if "Seq. Classifiers" not in filter_button:
            dataframe = dataframe[~dataframe["Model Type"].str.contains("Seq. Classifier", case=False, na=False)]
         if "DPO" not in filter_button:
@@ -307,18 +208,8 @@ def regex_table(dataframe, regex, filter_button, style=True):
     # Filter the dataframe such that 'model' contains any of the regex patterns
     data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
 
-    # if update the score to not use prior sets, do so
-    if update_scores:
-        data["Score"] = (data["Chat"] + data["Chat Hard"] + data["Safety"] + data["Reasoning"]) / 4
-        # if "Prior Sets (0.5 weight)" in data.columns:
-        #     data["Prior Sets (0.5 weight)"] = np.nan
-        # sort array by Score column
-        data = data.sort_values(by="Score", ascending=False)
-
-    data.reset_index(drop=True, inplace=True)
-
     # replace column '' with count/rank
-    data[""] = np.arange(1, 1 + len(data))
+    data[''] = np.arange(1, 1 + len(data))
 
     # if Score exists, round to 2 decimals
     if "Score" in data.columns:
@@ -328,206 +219,161 @@ def regex_table(dataframe, regex, filter_button, style=True):
     # round all others to 1 decimal
     for col in data.columns:
         if col not in ["", "Model", "Model Type", "Score", "Average"]:
-            # replace any data[col].values == '' with np.nan
-            data[col] = data[col].replace("", np.nan)
+            # replace any data[col].values == '' with np.NaN
+            data[col] = data[col].replace('', np.NaN)
             data[col] = np.round(np.array(data[col].values).astype(float), 1)
-    if style:
-        # apply color
-        data = color_model_type_column(data, color_map)
-
     return data
 
-
 # import ipdb; ipdb.set_trace()
 
-total_models = len(
-    regex_table(
-        rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"], style=False
-    ).values
-)
-total_models_v1 = len(
-    regex_table(
-        rb_orig_snapshot.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"], style=False
-    ).values
-)
-assets = Path("leaderboard").resolve()  # absolute dir with the image
-
-# Using a string for a predefined color
-theme = gr.themes.Default(primary_hue="blue")
+total_models = len(regex_table(rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"]).values)
 
-#############################################
-# Gradio App #
-#############################################
-
-with gr.Blocks(theme=theme, css=custom_css) as app:
+with gr.Blocks(css=custom_css) as app:
     # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
     with gr.Row():
         with gr.Column(scale=6):
-            gr.Markdown(TOP_TEXT)
-        # with gr.Column(scale=4):
-        #     # search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
-        #     # filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
-        #     # img = gr.Image(value="https://private-user-images.githubusercontent.com/10695622/310698241-24ed272a-0844-451f-b414-fde57478703e.png", width=500)
-        #     gr.Markdown("""
-        #                 ![](/gradio_api/file=leaderboard/logo.png)
-        #                 """)
-
-    with gr.Tabs(elem_id="outer-tabs", elem_classes="tabs-big") as tabs_big:
-        with gr.TabItem("🏆 RewardBench 2"):
+            gr.Markdown(TOP_TEXT.format(str(total_models)))
+        with gr.Column(scale=4):
+            # search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
+            # filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
+            # img = gr.Image(value="https://private-user-images.githubusercontent.com/10695622/310698241-24ed272a-0844-451f-b414-fde57478703e.png", width=500)
+            gr.Markdown("""
+            ![](file/src/logo.png)
+            """)
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏆 RewardBench Leaderboard"):
+            with gr.Row():
+                search_1 = gr.Textbox(label="Model Search (delimit with , )",
+                    placeholder="Model Search (delimit with , )",
+                    show_label=False)
+                model_types_1 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "AI2 Experiments"],
+                    value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
+                    label="Model Types",
+                    show_label=False,
+                    # info="Which model types to include.",
+                )
+            with gr.Row():
+                # reference data
+                rewardbench_table_hidden = gr.Dataframe(
+                    rewardbench_data_avg.values,
+                    datatype=col_types_rewardbench_avg,
+                    headers=rewardbench_data_avg.columns.tolist(),
+                    visible=False,
+                )
+                rewardbench_table = gr.Dataframe(
+                    regex_table(rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]).values,
+                    datatype=col_types_rewardbench_avg,
+                    headers=rewardbench_data_avg.columns.tolist(),
+                    elem_id="rewardbench_dataframe_avg",
+                    height=1000,
+                )
+
+        with gr.TabItem("🔍 RewardBench - Detailed"):
             with gr.Row():
-                with gr.Column(scale=7):
-                    gr.Markdown(CAPTION_V2.format(str(total_models)))
-                with gr.Column(scale=3):
-                    # search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
-                    # filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
-                    # img = gr.Image(value="https://private-user-images.githubusercontent.com/10695622/310698241-24ed272a-0844-451f-b414-fde57478703e.png", width=500)
-                    gr.Markdown(
-                        """
-                        ![](/gradio_api/file=leaderboard/logo.png)
-                        """
-                    )
-            with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
-                with gr.TabItem("Leaderboard"):
-                    with gr.Row():
-                        search_1 = gr.Textbox(
-                            label="Model Search (delimit with , )",
-                            placeholder="Model Search (delimit with , )",
-                            show_label=False,
-                            scale=8,
-                        )
-                        model_types_1 = gr.CheckboxGroup(
-                            ["Seq. Classifiers", "Custom Classifiers", "Generative", "RBv1"],
-                            value=["Seq. Classifiers", "Custom Classifiers", "Generative"],
-                            show_label=False,
-                            scale=8,
-                        )
-                        # narrow, non-expanding download button
-                        gr.DownloadButton(
-                            label="Download CSV",
-                            value=v2_data_path,
-                            size="sm",  # shorter height / padding
-                            scale=0,  # ← **width stays just big enough for the text**
-                            min_width=140,  # (optional) guarantee it doesn’t collapse
-                        )
-                    with gr.Row():
-                        # reference data
-                        rewardbench_table_hidden = gr.Dataframe(
-                            rewardbench_data_avg.values,
-                            datatype=col_types_rewardbench_v1,
-                            headers=rewardbench_data_avg.columns.tolist(),
-                            visible=False,
-                        )
-                        rewardbench_table = gr.Dataframe(
-                            regex_table(
-                                rewardbench_data_avg.copy(),
-                                "",
-                                ["Seq. Classifiers", "Custom Classifiers", "Generative"],
-                            ),
-                            datatype=col_types_rewardbench_v1,
-                            headers=rewardbench_data_avg.columns.tolist(),
-                            elem_id="rewardbench_dataframe_avg",
-                            height=800,  # 800 px ≈ ~25 rows on default row-height
-                        )
-
-                with gr.TabItem("About"):
-                    with gr.Row():
-                        gr.Markdown(ABOUT_TEXT_V2)
-
-                with gr.TabItem("Dataset Viewer"):
-                    with gr.Row():
-                        # loads one sample
-                        gr.Markdown("""## Random Dataset Sample Viewer""")
-                        subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
-                        button_data = gr.Button("Show Random Sample")
-
-                    with gr.Row():
-                        sample_display = gr.Markdown("{sampled data loads here}")
-
-                    button_data.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])
-        with gr.TabItem("RewardBench"):
+                search_2 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
+                model_types_2 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "AI2 Experiments"],
+                    value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
+                    label="Model Types",
+                    show_label=False,
+                    # info="Which model types to include."
+                )
             with gr.Row():
-                gr.Markdown(CAPTION_V1.format(str(total_models_v1)))
-            with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
-                with gr.TabItem("Leaderboard"):
-                    with gr.Row():
-                        search_1_v1 = gr.Textbox(
-                            label="Model Search (delimit with , )",
-                            placeholder="Model Search (delimit with , )",
-                            show_label=False,
-                        )
-                        model_types_1_v1 = gr.CheckboxGroup(
-                            ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
-                            value=["Seq. Classifiers", "Custom Classifiers", "Generative"],
-                            label="Model Types",
-                            show_label=False,
-                            # info="Which model types to include.",
-                        )
-                        # narrow, non-expanding download button
-                        gr.DownloadButton(
-                            label="Download CSV",
-                            value=orig_data_path,
-                            size="sm",  # shorter height / padding
-                            scale=0,  # ← **width stays just big enough for the text**
-                            min_width=140,  # (optional) guarantee it doesn’t collapse
-                        )
-                    with gr.Row():
-                        # reference data
-                        rewardbench_table_hidden_v1 = gr.Dataframe(
-                            rb_orig_snapshot.values,
-                            datatype=col_types_rewardbench,
-                            headers=rb_orig_snapshot.columns.tolist(),
-                            visible=False,
-                        )
-                        rewardbench_table_v1 = gr.Dataframe(
-                            regex_table(
-                                rb_orig_snapshot.copy(),
-                                "",
-                                ["Seq. Classifiers", "Custom Classifiers", "Generative"],
-                            ),
-                            datatype=col_types_rewardbench,
-                            headers=rb_orig_snapshot.columns.tolist(),
-                            elem_id="rewardbench_dataframe_avg_v1",
-                            height=800,  # 800 px ~25 rows on default row-height
-                        )
-                with gr.TabItem("About"):
-                    with gr.Row():
-                        gr.Markdown(ABOUT_TEXT_V1)
-
-                with gr.TabItem("Dataset Viewer"):
-                    with gr.Row():
-                        # loads one sample
-                        gr.Markdown("""## Random Dataset Sample Viewer""")
-                        subset_selector_v1 = gr.Dropdown(subsets_v1, label="Subset", value=None, multiselect=True)
-                        button_data_v1 = gr.Button("Show Random Sample")
-
-                    with gr.Row():
-                        sample_display_v1 = gr.Markdown("{sampled data loads here}")
-
-                    button_data_v1.click(fn=random_sample_v1, inputs=[subset_selector_v1], outputs=[sample_display_v1])
+                # ref data
+                rewardbench_table_detailed_hidden = gr.Dataframe(
+                    rewardbench_data.values,
+                    datatype=col_types_rewardbench,
+                    headers=rewardbench_data.columns.tolist(),
+                    visible=False,
+                )
+                rewardbench_table_detailed = gr.Dataframe(
+                    regex_table(rewardbench_data.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]).values,
+                    datatype=col_types_rewardbench,
+                    headers=rewardbench_data.columns.tolist(),
+                    elem_id="rewardbench_dataframe",
+                    height=1000,
+                )
+        # with gr.TabItem("rewardbench Eval Set - Length Bias"):
+        #     with gr.Row():
+        #         # backup
+        #         rewardbench_table_len_hidden = gr.Dataframe(
+        #             rewardbench_data_length.values,
+        #             datatype=cols_rewardbench_data_length,
+        #             headers=rewardbench_data_length.columns.tolist(),
+        #             visible=False,
+        #         )
+        #         rewardbench_table_len = gr.Dataframe(
+        #             regex_table(rewardbench_data_length.copy(), "", False).values,
+        #             datatype=cols_rewardbench_data_length,
+        #             headers=rewardbench_data_length.columns.tolist(),
+        #             elem_id="rewardbench_dataframe_length",
+        #             height=1000,
+        #         )
+        with gr.TabItem("Prior Test Sets"):
+            with gr.Row():
+                search_3 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
+                model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "AI2 Experiments"],
+                    value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
+                    label="Model Types",
+                    show_label=False,
+                    # info="Which model types to include.",
+                )
+            with gr.Row():
+                PREF_SET_TEXT = """
+For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets). Only the subsets Anthropic Helpful, Anthropic HHH, Stanford SHP, and OpenAI's Summarize data are used in the leaderboard ranking.
+"""
+                gr.Markdown(PREF_SET_TEXT)
+            with gr.Row():
+                # backup
+                pref_sets_table_hidden = gr.Dataframe(
+                    prefs_data.values,
+                    datatype=col_types_prefs,
+                    headers=prefs_data.columns.tolist(),
+                    visible=False,
+                )
+                pref_sets_table = gr.Dataframe(
+                    regex_table(prefs_data.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]).values,
+                    datatype=col_types_prefs,
+                    headers=prefs_data.columns.tolist(),
+                    elem_id="prefs_dataframe",
+                    height=1000,
+                )
+
+
+        with gr.TabItem("About"):
+            with gr.Row():
+                gr.Markdown(ABOUT_TEXT)
+
+        with gr.TabItem("Dataset Viewer"):
+            with gr.Row():
+                # loads one sample
+                gr.Markdown("""## Random Dataset Sample Viewer
+Warning, refusals, XSTest, and donotanswer datasets have sensitive content.""")
+                subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
+                button = gr.Button("Show Random Sample")
+
+            with gr.Row():
+                sample_display = gr.Markdown("{sampled data loads here}")
+
+            button.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])
+    # removed plot because not pretty enough
+    # with gr.TabItem("Model Correlation"):
+    #     with gr.Row():
+    #         plot = plot_avg_correlation(rewardbench_data_avg, prefs_data)
+    #         gr.Plot(plot)
 
-    search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
-    search_1_v1.change(
-        regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
-    )
-
-    model_types_1.change(
-        regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table
-    )
-    model_types_1_v1.change(
-        regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
-    )
+    search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
+    search_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
+    # search.change(regex_table, inputs=[rewardbench_table_len_hidden, search, filter_button], outputs=rewardbench_table_len)
+    search_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
+
+    model_types_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
+    model_types_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
+    model_types_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
 
     with gr.Row():
         with gr.Accordion("📚 Citation", open=False):
             citation_button = gr.Textbox(
-                value=r"""@misc{RewardBench2,
-    title={RewardBench 2: Advancing Reward Model Evaluation},
-    author={Malik, Saumya and Pyatkin, Valentina and Land, Sander and Morrison, Jacob and Smith, Noah A. and Hajishirzi, Hannaneh and Lambert, Nathan},
-    year={2025},
-    howpublished={\url{https://huggingface.co/spaces/allenai/reward-bench}},
-}
-
-@misc{RewardBench,
+                value=r"""@misc{RewardBench,
     title={RewardBench: Evaluating Reward Models for Language Modeling},
     author={Lambert, Nathan and Pyatkin, Valentina and Morrison, Jacob and Miranda, LJ and Lin, Bill Yuchen and Chandu, Khyathi and Dziri, Nouha and Kumar, Sachin and Zick, Tom and Choi, Yejin and Smith, Noah A. and Hajishirzi, Hannaneh},
     year={2024},
@@ -538,5 +384,18 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
                 elem_id="citation-button",
                 show_copy_button=True,
             )
+    # Load data when app starts, TODO make this used somewhere...
+    # def load_data_on_start():
+    #     data_rewardbench = load_all_data(repo_dir_rewardbench)
+    #     rewardbench_table.update(data_rewardbench)
+
+    #     data_rewardbench_avg = avg_over_rewardbench(repo_dir_rewardbench)
+    #     rewardbench_table.update(data_rewardbench_avg)
+
+    #     data_prefs = load_all_data(repo_dir_prefs)
+    #     pref_sets_table.update(data_prefs)
 
-app.launch(allowed_paths=[str(assets)])  # had .queue() before launch before... not sure if that's necessary
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=10800) # restarted every 3h
+scheduler.start()
+app.launch(allowed_paths=['src/']) # had .queue() before launch before... not sure if that's necessary
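The scoring logic this PR restores in `avg_over_rewardbench` is a count-weighted mean over subset columns. A self-contained sketch of that computation (the subset names and numbers are illustrative, not real leaderboard data):

```python
# Sketch of the weighted averaging in avg_over_rewardbench: each section
# score is the mean of its subsets' accuracies, weighted by example counts.
import numpy as np

example_counts = {"alpacaeval-easy": 100, "alpacaeval-hard": 95}  # hypothetical counts
subset_cols = ["alpacaeval-easy", "alpacaeval-hard"]

sub_data = np.array([
    [0.90, 0.80],  # model A: per-subset accuracy
    [0.70, 0.95],  # model B
])
sub_counts = [example_counts[s] for s in subset_cols]

section_score = np.average(sub_data, axis=1, weights=sub_counts)
print(section_score)  # [0.85128205 0.82179487]
```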
leaderboard/final-rbv1-data.csv DELETED
@@ -1,193 +0,0 @@
-,Model,Model Type,Score,Chat,Chat Hard,Safety,Reasoning,Prior Sets (0.5 weight)
-1,"<a target=""_blank"" href=""https://huggingface.co/infly/INF-ORM-Llama3.1-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">infly/INF-ORM-Llama3.1-70B</a>",Seq. Classifier,95.10529562974679,96.64804469273743,91.00877192982456,93.64864864864865,99.1157172477765,
-2,"<a target=""_blank"" href=""https://huggingface.co/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1</a>",Seq. Classifier,94.99413134933042,96.36871508379889,90.78947368421052,93.78378378378379,99.03455284552845,
-3,"<a target=""_blank"" href=""https://huggingface.co/nicolinho/QRM-Gemma-2-27B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">nicolinho/QRM-Gemma-2-27B</a>",Seq. Classifier,94.43611331484493,96.64804469273743,90.13157894736842,92.70270270270271,98.26212691657118,
-4,"<a target=""_blank"" href=""https://huggingface.co/Skywork/Skywork-Reward-Gemma-2-27B-v0.2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Skywork/Skywork-Reward-Gemma-2-27B-v0.2</a>",Seq. Classifier,94.26093621016115,96.08938547486034,89.91228070175438,92.97297297297297,98.0691056910569,
-5,"<a target=""_blank"" href=""https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">nvidia/Llama-3.1-Nemotron-70B-Reward</a> *",Custom Classifier,94.10897209520822,97.48603351955308,85.74561403508773,95.13513513513513,98.0691056910569,
-6,"<a target=""_blank"" href=""https://huggingface.co/Skywork/Skywork-Reward-Gemma-2-27B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Skywork/Skywork-Reward-Gemma-2-27B</a> ⚠️",Seq. Classifier,93.80116450605776,95.81005586592178,91.44736842105263,91.89189189189189,96.05534184536477,
-7,"<a target=""_blank"" href=""https://huggingface.co/SF-Foundation/TextEval-Llama3.1-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">SF-Foundation/TextEval-Llama3.1-70B</a> * ⚠️",Generative,93.48032435319458,94.1340782122905,90.13157894736842,93.24324324324324,96.41239700987613,
-8,"<a target=""_blank"" href=""https://huggingface.co/meta-metrics/MetaMetrics-RM-v1.0"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-metrics/MetaMetrics-RM-v1.0</a>",Custom Classifier,93.42462545063005,98.32402234636872,86.40350877192982,90.8108108108108,98.16015987341082,
-9,"<a target=""_blank"" href=""https://huggingface.co/Skywork/Skywork-Critic-Llama-3.1-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Skywork/Skywork-Critic-Llama-3.1-70B</a> ⚠️",Generative,93.30801781900792,96.64804469273743,87.93859649122807,93.10810810810811,95.5373219839581,
-10,"<a target=""_blank"" href=""https://huggingface.co/nicolinho/QRM-Llama3.1-8B-v2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">nicolinho/QRM-Llama3.1-8B-v2</a>",Seq. Classifier,93.13653373860271,96.36871508379889,86.84210526315789,92.56756756756756,96.76774703988652,
-11,"<a target=""_blank"" href=""https://huggingface.co/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Skywork/Skywork-Reward-Llama-3.1-8B-v0.2</a>",Seq. Classifier,93.12997963530022,94.6927374301676,88.37719298245614,92.70270270270271,96.7472854258744,
-12,"<a target=""_blank"" href=""https://huggingface.co/nicolinho/QRM-Llama3.1-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">nicolinho/QRM-Llama3.1-8B</a> ⚠️",Seq. Classifier,93.05891420009982,94.41340782122904,89.69298245614036,92.29729729729729,95.83196922573254,
-13,"<a target=""_blank"" href=""https://huggingface.co/LxzGordon/URM-LLaMa-3.1-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">LxzGordon/URM-LLaMa-3.1-8B</a> ⚠️",Seq. Classifier,92.93773298857982,95.53072625698324,88.15789473684211,91.08108108108108,96.98122987941288,
-14,"<a target=""_blank"" href=""https://huggingface.co/Salesforce/SFR-LLaMa-3.1-70B-Judge-r"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Salesforce/SFR-LLaMa-3.1-70B-Judge-r</a> *",Generative,92.71833683150776,96.92737430167598,84.75877192982456,91.62162162162163,97.56557947290882,
-15,"<a target=""_blank"" href=""https://huggingface.co/R-I-S-E/RISE-Judge-Qwen2.5-32B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">R-I-S-E/RISE-Judge-Qwen2.5-32B</a>",Generative,92.66088172895866,96.64804469273743,83.33333333333333,91.89189189189189,98.77025699787198,
-16,"<a target=""_blank"" href=""https://huggingface.co/Skywork/Skywork-Reward-Llama-3.1-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Skywork/Skywork-Reward-Llama-3.1-8B</a> ⚠️",Seq. Classifier,92.52495013691698,95.81005586592178,87.28070175438596,90.8108108108108,96.19823211654936,
-17,"<a target=""_blank"" href=""https://huggingface.co/AtlaAI/Selene-1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">AtlaAI/Selene-1</a>",Generative,92.41086740661206,97.76536312849161,83.99122807017544,92.16216216216216,95.72471626561904,
-18,"<a target=""_blank"" href=""https://huggingface.co/general-preference/GPM-Llama-3.1-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">general-preference/GPM-Llama-3.1-8B</a> ⚠️",Custom Classifier,92.23713029788581,93.29608938547486,88.59649122807018,91.08108108108108,95.97485949691712,
-19,"<a target=""_blank"" href=""https://huggingface.co/nvidia/Nemotron-4-340B-Reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">nvidia/Nemotron-4-340B-Reward</a> *",Custom Classifier,91.9958677606516,95.81005586592178,87.06140350877193,91.48648648648648,93.6255251814263,
-20,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/GRM-Llama3-8B-rewardmodel-ft"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/GRM-Llama3-8B-rewardmodel-ft</a> ⚠️",Seq. Classifier,91.53526049213252,95.53072625698324,86.1842105263158,90.8108108108108,93.61529437442026,
-21,"<a target=""_blank"" href=""https://huggingface.co/nicolinho/QRM-Llama3-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">nicolinho/QRM-Llama3-8B</a> ⚠️",Seq. Classifier,91.0990919512119,95.81005586592178,81.14035087719299,89.86486486486487,97.581096196868,
-22,"<a target=""_blank"" href=""https://huggingface.co/SF-Foundation/TextEval-OffsetBias-12B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">SF-Foundation/TextEval-OffsetBias-12B</a> *",Generative,91.04924182882311,91.89944134078212,86.62280701754386,92.02702702702703,93.64769192993944,
-23,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/GRM-llama3.2-3B-rewardmodel-ft"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/GRM-llama3.2-3B-rewardmodel-ft</a>",Seq. Classifier,90.92295892363056,91.62011173184358,84.86842105263158,92.70270270270271,94.50060020734435,
-24,"<a target=""_blank"" href=""https://huggingface.co/Salesforce/SFR-nemo-12B-Judge-r"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Salesforce/SFR-nemo-12B-Judge-r</a> *",Generative,90.26551100385808,97.20670391061452,82.23684210526316,86.48648648648648,95.13201151306815,
-25,"<a target=""_blank"" href=""https://huggingface.co/internlm/internlm2-20b-reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">internlm/internlm2-20b-reward</a>",Seq. Classifier,90.15948083664846,98.88268156424581,76.53508771929825,89.45945945945945,95.76069460359032,
-26,"<a target=""_blank"" href=""https://huggingface.co/Skywork/Skywork-VL-Reward-7B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Skywork/Skywork-VL-Reward-7B</a>",Seq. Classifier,90.07022246172819,89.94413407821229,87.5,91.08108108108108,91.75567468761938,
-27,"<a target=""_blank"" href=""https://huggingface.co/facebook/Self-taught-evaluator-llama3.1-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">facebook/Self-taught-evaluator-llama3.1-70B</a> *",Generative,90.01358317701886,96.92737430167598,85.08771929824562,89.5945945945946,88.44464451355923,
-28,"<a target=""_blank"" href=""https://huggingface.co/LxzGordon/URM-LLaMa-3-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">LxzGordon/URM-LLaMa-3-8B</a>",Seq. Classifier,89.90981543420907,96.92737430167598,78.7280701754386,88.24324324324324,95.74057401647842,
-29,"<a target=""_blank"" href=""https://huggingface.co/NCSOFT/Llama-3-OffsetBias-RM-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">NCSOFT/Llama-3-OffsetBias-RM-8B</a>",Seq. Classifier,89.41975692993036,97.20670391061452,81.79824561403508,86.75675675675676,91.91732143831506,
-30,"<a target=""_blank"" href=""https://huggingface.co/AtlaAI/Selene-1-Mini-Llama-3.1-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">AtlaAI/Selene-1-Mini-Llama-3.1-8B</a>",Generative,89.12784912886812,93.57541899441341,79.3859649122807,89.25675675675676,94.29325585202162,
-31,"<a target=""_blank"" href=""https://huggingface.co/Skywork/Skywork-Critic-Llama-3.1-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Skywork/Skywork-Critic-Llama-3.1-8B</a>",Generative,88.95511699074142,93.57541899441341,81.35964912280701,91.08108108108108,89.80431876466416,
-32,"<a target=""_blank"" href=""https://huggingface.co/nvidia/Llama3-70B-SteerLM-RM"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">nvidia/Llama3-70B-SteerLM-RM</a> *",Custom Classifier,88.76963582088416,91.34078212290503,80.26315789473684,92.83783783783784,90.63676542805698,
-33,"<a target=""_blank"" href=""https://huggingface.co/Salesforce/SFR-LLaMa-3.1-8B-Judge-r"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Salesforce/SFR-LLaMa-3.1-8B-Judge-r</a> *",Generative,88.65372403487248,95.53072625698324,77.74122807017544,86.21621621621621,95.12672559611501,
-34,"<a target=""_blank"" href=""https://huggingface.co/facebook/Self-taught-Llama-3-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">facebook/Self-taught-Llama-3-70B</a> *",Generative,88.62795600264494,96.92737430167598,83.99122807017544,91.08108108108108,82.5121405576472,
-35,"<a target=""_blank"" href=""https://huggingface.co/RLHFlow/ArmoRM-Llama3-8B-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">RLHFlow/ArmoRM-Llama3-8B-v0.1</a>",Custom Classifier,88.60367185781917,96.92737430167598,76.75438596491227,90.54054054054055,97.34715174332952,74.29414161945574
-36,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/GRM-gemma2-2B-rewardmodel-ft"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/GRM-gemma2-2B-rewardmodel-ft</a>",Seq. Classifier,88.39250002515702,93.01675977653632,77.19298245614036,92.16216216216216,91.19809570578929,
-37,"<a target=""_blank"" href=""https://huggingface.co/google/gemini-1.5-pro-0514"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">google/gemini-1.5-pro-0514</a> *",Generative,88.20069001791948,92.31843575418995,80.59210526315789,87.9054054054054,91.98681364892467,
-38,"<a target=""_blank"" href=""https://huggingface.co/R-I-S-E/RISE-Judge-Qwen2.5-7B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">R-I-S-E/RISE-Judge-Qwen2.5-7B</a>",Generative,88.19099980224239,92.17877094972067,76.53508771929825,87.97297297297297,96.07716756697768,
-39,"<a target=""_blank"" href=""https://huggingface.co/Cohere May 2024"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Cohere May 2024</a> *",Custom Classifier,88.16038708182192,96.36871508379889,71.2719298245614,92.29729729729729,97.68272221312816,78.20215489882585
-40,"<a target=""_blank"" href=""https://huggingface.co/google/flame-1.0-24B-july-2024"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">google/flame-1.0-24B-july-2024</a> *",Generative,87.80801832232187,92.17877094972067,75.65789473684211,89.5945945945946,93.80081300813008,
-41,"<a target=""_blank"" href=""https://huggingface.co/internlm/internlm2-7b-reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">internlm/internlm2-7b-reward</a>",Seq. Classifier,87.59316719911449,99.16201117318435,69.51754385964912,87.16216216216216,94.53095160146232,
-42,"<a target=""_blank"" href=""https://huggingface.co/ZiyiYe/Con-J-Qwen2-7B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ZiyiYe/Con-J-Qwen2-7B</a> ⚠️",Generative,87.12028871485069,91.89944134078212,80.26315789473684,88.24324324324324,88.0753123806406,
-43,"<a target=""_blank"" href=""https://huggingface.co/google/gemini-1.5-pro-0924"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">google/gemini-1.5-pro-0924</a>",Generative,86.78430992050927,94.1340782122905,76.97368421052632,85.8108108108108,90.21866644840945,
-44,"<a target=""_blank"" href=""https://huggingface.co/openai/gpt-4o-2024-08-06"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openai/gpt-4o-2024-08-06</a>",Generative,86.72554986675267,96.08938547486034,76.09649122807018,88.10810810810811,86.60821465597208,
-45,"<a target=""_blank"" href=""https://huggingface.co/RLHFlow/pair-preference-model-LLaMA3-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">RLHFlow/pair-preference-model-LLaMA3-8B</a>",Custom Classifier,85.74792972712865,98.32402234636872,65.78947368421052,89.72972972972973,94.73420363398264,74.57650875557454
-46,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/GRM-llama3-8B-sftreg"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/GRM-llama3-8B-sftreg</a>",Seq. Classifier,85.42084389305319,98.60335195530726,67.76315789473684,89.1891891891892,92.29347410923774,73.08924874053665
-47,"<a target=""_blank"" href=""https://huggingface.co/opencompass/CompassJudger-1-32B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">opencompass/CompassJudger-1-32B-Instruct</a>",Generative,85.22047081369766,98.04469273743017,65.13157894736842,85.27027027027027,92.43534129972173,
-48,"<a target=""_blank"" href=""https://huggingface.co/Cohere March 2024"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Cohere March 2024</a> *",Custom Classifier,85.10802881361649,94.6927374301676,65.13157894736842,87.70270270270271,98.17073170731707,74.57675774743672
-49,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/GRM-llama3-8B-distill"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/GRM-llama3-8B-distill</a>",Seq. Classifier,84.63918882385776,98.32402234636872,68.42105263157895,86.75675675675676,91.3273449009658,72.09434614337957
-50,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/GRM-Gemma-2B-rewardmodel-ft"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/GRM-Gemma-2B-rewardmodel-ft</a> ⚠️",Seq. Classifier,84.46827345209587,89.3854748603352,75.21929824561404,84.45945945945945,88.80886124297484,
-51,"<a target=""_blank"" href=""https://huggingface.co/openai/gpt-4-0125-preview"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openai/gpt-4-0125-preview</a>",Generative,84.33564801010327,95.25139664804469,74.34210526315789,87.56756756756756,86.9236645386588,70.85136405607162
-52,"<a target=""_blank"" href=""https://huggingface.co/mattshumer/Reflection-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mattshumer/Reflection-70B</a>",Generative,84.22327632009588,97.48603351955308,70.6140350877193,83.17567567567568,85.61736099743548,
-53,"<a target=""_blank"" href=""https://huggingface.co/Anthropic/claude-3-5-sonnet-20240620"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Anthropic/claude-3-5-sonnet-20240620</a>",Generative,84.17242041164789,96.36871508379889,74.01315789473684,81.62162162162163,84.68618704643423,
-54,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo</a>",Generative,84.12067803631126,97.20670391061452,74.56140350877193,77.56756756756756,87.14703715829104,
-55,"<a target=""_blank"" href=""https://huggingface.co/opencompass/CompassJudger-1-14B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">opencompass/CompassJudger-1-14B-Instruct</a>",Generative,84.09022697921793,97.48603351955308,62.280701754385966,83.91891891891892,92.67525372401374,
-56,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama/Meta-Llama-3.1-70B-Instruct</a>",Generative,84.05217990917473,97.20670391061452,70.17543859649123,82.83783783783784,85.98873929175534,
-57,"<a target=""_blank"" href=""https://huggingface.co/NCSOFT/Llama-3-OffsetBias-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">NCSOFT/Llama-3-OffsetBias-8B</a>",Generative,83.96777752436938,92.45810055865921,80.26315789473684,86.75675675675676,76.39309488732471,
-58,"<a target=""_blank"" href=""https://huggingface.co/openai/gpt-4-turbo-2024-04-09"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openai/gpt-4-turbo-2024-04-09</a>",Generative,83.95011678629895,95.25139664804469,75.43859649122807,87.56756756756756,82.70345664866045,73.629016365689
-59,"<a target=""_blank"" href=""https://huggingface.co/sfairXC/FsfairX-LLaMA3-RM-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">sfairXC/FsfairX-LLaMA3-RM-v0.1</a>",Seq. Classifier,83.38339965331156,99.44134078212291,65.13157894736842,86.75675675675676,86.43633709827031,74.91856971076719
53
- 52,"<a target=""_blank"" href=""https://huggingface.co/mattshumer/Reflection-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mattshumer/Reflection-70B</a>",Generative,84.22327632009588,97.48603351955308,70.6140350877193,83.17567567567568,85.61736099743548,
54
- 53,"<a target=""_blank"" href=""https://huggingface.co/Anthropic/claude-3-5-sonnet-20240620"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Anthropic/claude-3-5-sonnet-20240620</a>",Generative,84.17242041164789,96.36871508379889,74.01315789473684,81.62162162162163,84.68618704643423,
55
- 54,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo</a>",Generative,84.12067803631126,97.20670391061452,74.56140350877193,77.56756756756756,87.14703715829104,
56
- 55,"<a target=""_blank"" href=""https://huggingface.co/opencompass/CompassJudger-1-14B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">opencompass/CompassJudger-1-14B-Instruct</a>",Generative,84.09022697921793,97.48603351955308,62.280701754385966,83.91891891891892,92.67525372401374,
57
- 56,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama/Meta-Llama-3.1-70B-Instruct</a>",Generative,84.05217990917473,97.20670391061452,70.17543859649123,82.83783783783784,85.98873929175534,
58
- 57,"<a target=""_blank"" href=""https://huggingface.co/NCSOFT/Llama-3-OffsetBias-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">NCSOFT/Llama-3-OffsetBias-8B</a>",Generative,83.96777752436938,92.45810055865921,80.26315789473684,86.75675675675676,76.39309488732471,
59
- 58,"<a target=""_blank"" href=""https://huggingface.co/openai/gpt-4-turbo-2024-04-09"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openai/gpt-4-turbo-2024-04-09</a>",Generative,83.95011678629895,95.25139664804469,75.43859649122807,87.56756756756756,82.70345664866045,73.629016365689
60
- 59,"<a target=""_blank"" href=""https://huggingface.co/sfairXC/FsfairX-LLaMA3-RM-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">sfairXC/FsfairX-LLaMA3-RM-v0.1</a>",Seq. Classifier,83.38339965331156,99.44134078212291,65.13157894736842,86.75675675675676,86.43633709827031,74.91856971076719
61
- 60,"<a target=""_blank"" href=""https://huggingface.co/openai/gpt-4o-2024-05-13"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openai/gpt-4o-2024-05-13</a>",Generative,83.2681071132992,96.64804469273743,70.39473684210526,86.48648648648648,84.86965951874285,72.61510893954863
62
- 61,"<a target=""_blank"" href=""https://huggingface.co/opencompass/CompassJudger-1-7B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">opencompass/CompassJudger-1-7B-Instruct</a>",Generative,83.16709323590604,97.76536312849161,60.96491228070175,84.45945945945945,89.47863807497134,
63
- 62,"<a target=""_blank"" href=""https://huggingface.co/internlm/internlm2-1_8b-reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">internlm/internlm2-1_8b-reward</a>",Seq. Classifier,82.16733515408055,93.57541899441341,66.2280701754386,81.62162162162163,87.24422982484859,
64
- 63,"<a target=""_blank"" href=""https://huggingface.co/CIR-AMS/BTRM_Qwen2_7b_0613"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">CIR-AMS/BTRM_Qwen2_7b_0613</a>",Seq. Classifier,81.72269085246006,97.48603351955308,57.23684210526316,90.13513513513513,87.74894963714738,70.2902968779431
65
- 64,"<a target=""_blank"" href=""https://huggingface.co/openbmb/Eurus-RM-7b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openbmb/Eurus-RM-7b</a>",Seq. Classifier,81.58895090730017,98.04469273743017,65.5701754385965,81.35135135135135,86.3251623288045,71.71779445333651
66
- 65,"<a target=""_blank"" href=""https://huggingface.co/Nexusflow/Starling-RM-34B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Nexusflow/Starling-RM-34B</a>",Seq. Classifier,81.33351263768401,96.92737430167598,57.23684210526316,87.70270270270271,88.45078299776287,71.36620952434669
67
- 66,"<a target=""_blank"" href=""https://huggingface.co/google/gemma-2-27b-it"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">google/gemma-2-27b-it</a>",Generative,80.89669003773389,94.83240223463687,59.10087719298246,86.35135135135135,83.30212937196487,
68
- 67,"<a target=""_blank"" href=""https://huggingface.co/google/gemini-1.5-flash-001"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">google/gemini-1.5-flash-001</a>",Generative,80.5391103484727,92.17877094972067,63.48684210526316,86.95945945945945,85.1162219675888,69.36940417219024
69
- 68,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/Gemma-2B-rewardmodel-ft"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/Gemma-2B-rewardmodel-ft</a> ⚠️",Seq. Classifier,80.47843057507436,77.93296089385476,74.78070175438596,85.27027027027027,83.92978938178643,
70
- 69,"<a target=""_blank"" href=""https://huggingface.co/allenai/tulu-v2.5-13b-preference-mix-rm"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/tulu-v2.5-13b-preference-mix-rm</a>",Seq. Classifier,80.26558812003782,93.57541899441341,68.20175438596492,77.29729729729729,88.50261908659355,67.23611355180205
71
- 70,"<a target=""_blank"" href=""https://huggingface.co/Anthropic/claude-3-opus-20240229"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Anthropic/claude-3-opus-20240229</a>",Generative,80.0759036376447,94.6927374301676,60.30701754385965,86.62162162162163,78.68223795492989,
72
- 71,"<a target=""_blank"" href=""https://huggingface.co/openai/gpt-4o-mini-2024-07-18"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openai/gpt-4o-mini-2024-07-18</a>",Generative,80.06759386119498,94.97206703910615,60.74561403508772,80.8108108108108,83.7418835597752,
73
- 72,"<a target=""_blank"" href=""https://huggingface.co/weqweasdas/RM-Mistral-7B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">weqweasdas/RM-Mistral-7B</a>",Seq. Classifier,79.8233742639417,96.64804469273743,60.526315789473685,87.02702702702703,77.35615485349484,75.29528365000934
74
- 73,"<a target=""_blank"" href=""https://huggingface.co/NousResearch/Hermes-3-Llama-3.1-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">NousResearch/Hermes-3-Llama-3.1-70B</a>",Generative,78.47084260833167,96.22905027932961,56.68859649122807,82.29729729729729,78.6684263654717,
75
- 74,"<a target=""_blank"" href=""https://huggingface.co/hendrydong/Mistral-RM-for-RAFT-GSHF-v0"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">hendrydong/Mistral-RM-for-RAFT-GSHF-v0</a>",Seq. Classifier,78.46503174091394,98.32402234636872,57.89473684210526,85.0,74.33602062530693,75.07572604066365
76
- 75,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo</a>",Generative,78.08002309698713,87.56983240223464,66.8859649122807,75.06756756756756,82.79672750586566,
77
- 76,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/reward-model-Mistral-7B-instruct-Unified-Feedback"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/reward-model-Mistral-7B-instruct-Unifie...</a>",Seq. Classifier,76.61192139206588,97.76536312849161,50.6578947368421,85.27027027027027,73.88893435914224,74.3423675391006
78
- 77,"<a target=""_blank"" href=""https://huggingface.co/Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3</a>",DPO,76.52088102568138,97.20670391061452,63.37719298245614,76.35135135135135,72.84129972172205,69.13483329884433
79
- 78,"<a target=""_blank"" href=""https://huggingface.co/stabilityai/stablelm-2-12b-chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">stabilityai/stablelm-2-12b-chat</a>",DPO,76.41872322421631,96.64804469273743,55.48245614035088,78.10810810810811,89.44862770775359,48.39403572004667
80
- 79,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama/Meta-Llama-3-70B-Instruct</a>",Generative,76.26515082171642,97.62569832402235,58.88157894736842,72.97297297297297,78.53644895509358,70.3529589965331
81
- 80,"<a target=""_blank"" href=""https://huggingface.co/allenai/tulu-2-dpo-70b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/tulu-2-dpo-70b</a>",DPO,76.20735542607979,97.48603351955308,60.526315789473685,84.45945945945945,74.07206580455066,52.778449688644265
82
- 81,"<a target=""_blank"" href=""https://huggingface.co/gemini-1.5-flash-8b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">gemini-1.5-flash-8b</a>",Generative,76.00524043227317,94.41340782122904,59.86842105263158,73.98648648648648,75.75264636874557,
83
- 82,"<a target=""_blank"" href=""https://huggingface.co/Ahjeong/MMPO_Gemma_7b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ahjeong/MMPO_Gemma_7b</a>",DPO,75.8660587247668,96.92737430167598,61.40350877192982,71.35135135135135,77.55872483221475,68.31261000855747
84
- 83,"<a target=""_blank"" href=""https://huggingface.co/PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-20240229_meta-llama/Llama-3-70b-chat-hf"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022...</a>",Generative,75.77705517745792,95.25139664804469,54.05701754385965,80.33783783783784,73.46196868008948,
85
- 84,"<a target=""_blank"" href=""https://huggingface.co/allenai/llama-3-tulu-2-dpo-70b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/llama-3-tulu-2-dpo-70b</a>",DPO,74.9612075859509,96.36871508379889,57.45614035087719,74.86486486486487,80.2023653625798,56.86669694931664
86
- 85,"<a target=""_blank"" href=""https://huggingface.co/NousResearch/Nous-Hermes-2-Mistral-7B-DPO"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">NousResearch/Nous-Hermes-2-Mistral-7B-DPO</a>",DPO,74.80880493527766,92.17877094972067,60.526315789473685,82.43243243243244,73.75184154526109,55.500522983723165
87
- 86,"<a target=""_blank"" href=""https://huggingface.co/Anthropic/claude-3-sonnet-20240229"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Anthropic/claude-3-sonnet-20240229</a>",Generative,74.57545943180953,93.43575418994413,56.578947368421055,81.6891891891892,69.07005374583947,69.63124589949818
88
- 87,"<a target=""_blank"" href=""https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mistralai/Mixtral-8x7B-Instruct-v0.1</a>",DPO,74.54632435829336,94.97206703910615,64.03508771929825,72.56756756756756,78.71855731980139,50.330359933093675
89
- 88,"<a target=""_blank"" href=""https://huggingface.co/prometheus-eval/prometheus-8x7b-v2.0"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">prometheus-eval/prometheus-8x7b-v2.0</a>",Generative,74.5095375782243,93.01675977653632,47.14912280701754,80.47297297297297,77.39929475637038,
90
- 89,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/GRM-Gemma-2B-sftreg"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/GRM-Gemma-2B-sftreg</a>",Seq. Classifier,74.50927082674883,95.53072625698324,48.68421052631579,79.32432432432432,76.83949909968898,69.82591702611495
91
- 90,"<a target=""_blank"" href=""https://huggingface.co/general-preference/GPM-Gemma-2B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">general-preference/GPM-Gemma-2B</a>",Custom Classifier,74.49128373533642,71.50837988826815,69.73684210526316,81.21621621621621,75.50369673159818,
92
- 91,"<a target=""_blank"" href=""https://huggingface.co/0-hero/Matter-0.1-7B-boost-DPO-preview"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">0-hero/Matter-0.1-7B-boost-DPO-preview</a>",DPO,74.47914014376505,91.06145251396649,60.96491228070175,71.35135135135135,83.94718175369673,55.6624654944527
93
- 92,"<a target=""_blank"" href=""https://huggingface.co/allenai/tulu-v2.5-70b-uf-rm"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/tulu-v2.5-70b-uf-rm</a>",Seq. Classifier,73.98314832639727,86.59217877094972,71.71052631578948,70.13513513513513,75.70046925301467,57.571715987797305
94
- 93,"<a target=""_blank"" href=""https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">HuggingFaceH4/zephyr-7b-alpha</a>",DPO,73.92192687696839,91.62011173184358,62.5,76.62162162162163,75.13982102908277,53.534233127619544
95
- 94,"<a target=""_blank"" href=""https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">upstage/SOLAR-10.7B-Instruct-v1.0</a>",DPO,73.91132026830088,81.56424581005587,68.64035087719299,85.13513513513513,72.51596005892944,49.49049865208112
96
- 95,"<a target=""_blank"" href=""https://huggingface.co/allenai/tulu-2-dpo-13b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/tulu-2-dpo-13b</a>",DPO,73.68126195691116,95.81005586592178,58.333333333333336,79.45945945945945,73.22972936105201,49.46620157266727
97
- 96,"<a target=""_blank"" href=""https://huggingface.co/opencompass/CompassJudger-1-1.5B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">opencompass/CompassJudger-1-1.5B-Instruct</a>",Generative,73.44238723104029,96.36871508379889,49.23245614035088,78.17567567567568,69.99270202433568,
98
- 97,"<a target=""_blank"" href=""https://huggingface.co/allenai/llama-3-tulu-2-8b-uf-mean-rm"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/llama-3-tulu-2-8b-uf-mean-rm</a>",Seq. Classifier,73.41574916848018,95.25139664804469,59.21052631578947,61.62162162162162,82.1155262727124,64.3436007999852
99
- 98,"<a target=""_blank"" href=""https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">HuggingFaceH4/starchat2-15b-v0.1</a>",DPO,73.22060109644468,93.85474860335195,55.48245614035088,70.94594594594595,81.58522944289845,55.248649602907626
100
- 99,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/Gemma-2B-rewardmodel-baseline"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/Gemma-2B-rewardmodel-baseline</a>",Seq. Classifier,72.89758740021966,94.1340782122905,46.92982456140351,78.64864864864865,73.84050853931359,68.97216667866445
101
- 100,"<a target=""_blank"" href=""https://huggingface.co/Anthropic/claude-3-haiku-20240307"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Anthropic/claude-3-haiku-20240307</a>",Generative,72.89194286431167,92.73743016759776,51.973684210526315,79.52702702702703,70.60194658154636,66.34730980541012
102
- 101,"<a target=""_blank"" href=""https://huggingface.co/HuggingFaceH4/zephyr-7b-beta"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">HuggingFaceH4/zephyr-7b-beta</a>",DPO,72.80507814531524,95.25139664804469,62.719298245614034,65.67567567567568,77.89497735581382,52.16300745754066
103
- 102,"<a target=""_blank"" href=""https://huggingface.co/allenai/llama-3-tulu-2-dpo-8b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/llama-3-tulu-2-dpo-8b</a>",DPO,72.74751270450155,95.25139664804469,53.50877192982456,66.48648648648648,86.63038140448519,50.973541402832126
104
- 103,"<a target=""_blank"" href=""https://huggingface.co/0-hero/Matter-0.1-7B-DPO-preview"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">0-hero/Matter-0.1-7B-DPO-preview</a>",DPO,72.47264404067178,89.3854748603352,57.675438596491226,63.78378378378378,88.54320128771758,53.477999309390405
105
- 104,"<a target=""_blank"" href=""https://huggingface.co/jondurbin/bagel-dpo-34b-v0.5"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">jondurbin/bagel-dpo-34b-v0.5</a>",DPO,72.15167952196515,93.85474860335195,55.04385964912281,64.45945945945945,88.8907076990233,44.867564875771365
106
- 105,"<a target=""_blank"" href=""https://huggingface.co/allenai/tulu-2-dpo-7b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/tulu-2-dpo-7b</a>",DPO,72.11611434356087,97.48603351955308,56.14035087719298,75.27027027027027,71.75717520598025,47.737369346054734
107
- 106,"<a target=""_blank"" href=""https://huggingface.co/prometheus-eval/prometheus-7b-v2.0"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">prometheus-eval/prometheus-7b-v2.0</a>",Generative,72.04295178846496,85.47486033519553,49.12280701754386,77.0945945945946,76.4795452065259,
108
- 107,"<a target=""_blank"" href=""https://huggingface.co/stabilityai/stablelm-zephyr-3b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">stabilityai/stablelm-zephyr-3b</a>",DPO,71.45809212918405,86.31284916201118,60.08771929824562,74.05405405405405,75.73184372783325,50.74989667836822
109
- 108,"<a target=""_blank"" href=""https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO</a>",DPO,71.38329552978793,91.62011173184358,60.526315789473685,81.48648648648648,61.26104927156654,52.66173320935087
110
- 109,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json</a>",Seq. Classifier,71.27478404602779,93.57541899441341,40.78947368421053,79.45945945945945,,
111
- 110,"<a target=""_blank"" href=""https://huggingface.co/berkeley-nest/Starling-RM-7B-alpha"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">berkeley-nest/Starling-RM-7B-alpha</a>",Seq. Classifier,71.13020256724107,98.04469273743017,45.6140350877193,84.45945945945945,57.998444917335085,67.93855870128164
112
- 111,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-380k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,70.58403596186601,95.25139664804469,39.473684210526315,77.02702702702703,,
113
- 112,"<a target=""_blank"" href=""https://huggingface.co/CohereForAI/c4ai-command-r-plus"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">CohereForAI/c4ai-command-r-plus</a>",Generative,70.56998248762835,95.11173184357541,57.56578947368421,59.86486486486486,70.40312789872866,69.23881422694875
114
- 113,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-2660k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,70.19339171573809,94.97206703910615,37.5,78.10810810810811,,
115
- 114,"<a target=""_blank"" href=""https://huggingface.co/allenai/llama-3-tulu-2-70b-uf-mean-rm"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/llama-3-tulu-2-70b-uf-mean-rm</a>",Seq. Classifier,70.19307792664753,86.31284916201118,56.14035087719298,60.945945945945944,82.68367708844875,59.57205519263016
116
- 115,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-3420k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,70.07936854820123,93.85474860335195,38.81578947368421,77.56756756756756,,
117
- 116,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-3.8m.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,70.03734328271229,94.1340782122905,38.81578947368421,77.16216216216216,,
118
- 117,"<a target=""_blank"" href=""https://huggingface.co/weqweasdas/RM-Gemma-7B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">weqweasdas/RM-Gemma-7B</a>",Seq. Classifier,69.66957334431098,96.92737430167598,49.780701754385966,57.83783783783784,73.62395645768537,70.68641939562845
119
- 118,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-3040k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,69.44952818151877,93.85474860335195,37.06140350877193,77.43243243243244,,
120
- 119,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-1900k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,69.2421964746281,94.41340782122904,35.74561403508772,77.56756756756756,,
121
- 120,"<a target=""_blank"" href=""https://huggingface.co/weqweasdas/RM-Gemma-7B-4096"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">weqweasdas/RM-Gemma-7B-4096</a>",Seq. Classifier,69.22303170109127,94.97206703910615,50.219298245614034,56.08108108108108,75.10912860806461,70.24413536208964
122
- 121,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-760k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,69.04502561956252,94.41340782122904,35.96491228070175,76.75675675675676,,
123
- 122,"<a target=""_blank"" href=""https://huggingface.co/openbmb/UltraRM-13b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openbmb/UltraRM-13b</a>",Seq. Classifier,69.02867919901104,96.36871508379889,55.48245614035088,59.86486486486486,62.44270748076608,72.94062565153789
124
- 123,"<a target=""_blank"" href=""https://huggingface.co/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5</a>",Seq. Classifier,69.00517292135855,88.54748603351955,48.68421052631579,63.108108108108105,77.51882468489114,65.32929758655776
125
- 124,"<a target=""_blank"" href=""https://huggingface.co/openbmb/Eurus-7b-kto"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openbmb/Eurus-7b-kto</a>",DPO,68.99912142883106,95.25139664804469,53.728070175438596,60.54054054054054,74.67261417580619,52.606849779819356
126
- 125,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-2280k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,68.95403268602327,93.85474860335195,37.06140350877193,75.94594594594595,,
127
- 126,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen1.5-14B-Chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen/Qwen1.5-14B-Chat</a>",DPO,68.64045386840729,57.262569832402235,70.17543859649123,71.21621621621621,89.61129753914987,41.23304044714641
128
- 127,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-1140k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,68.08398077583611,93.01675977653632,35.96491228070175,75.27027027027027,,
129
- 128,"<a target=""_blank"" href=""https://huggingface.co/RLHFlow/LLaMA3-iterative-DPO-final"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">RLHFlow/LLaMA3-iterative-DPO-final</a>",DPO,67.82774529803461,83.79888268156425,59.21052631578947,78.64864864864865,61.60650952147105,43.920573347364794
130
- 129,"<a target=""_blank"" href=""https://huggingface.co/HuggingFaceH4/zephyr-7b-gemma-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">HuggingFaceH4/zephyr-7b-gemma-v0.1</a>",DPO,67.57835885153328,95.81005586592178,49.56140350877193,58.24324324324324,74.63476018988378,51.70630404815817
131
- 130,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized.json</a>",Seq. Classifier,67.55772237983352,91.34078212290503,39.03508771929825,72.29729729729729,,
132
- 131,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen1.5-7B-Chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen/Qwen1.5-7B-Chat</a>",DPO,67.50138253417825,53.63128491620112,69.07894736842105,69.1891891891892,90.41475691602555,42.884086027930344
133
- 132,"<a target=""_blank"" href=""https://huggingface.co/openbmb/MiniCPM-2B-dpo-fp32"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openbmb/MiniCPM-2B-dpo-fp32</a>",DPO,67.304776500488,89.10614525139665,49.3421052631579,57.2972972972973,82.33378348884159,49.58432590300511
134
- 133,"<a target=""_blank"" href=""https://huggingface.co/mightbe/Better-PairRM"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mightbe/Better-PairRM</a>",Custom Classifier,67.29754324103595,95.53072625698324,39.25438596491228,82.02702702702703,49.826076280897034,72.40145810968448
135
- 134,"<a target=""_blank"" href=""https://huggingface.co/allenai/OLMo-7B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/OLMo-7B-Instruct</a>",DPO,67.27282652187517,89.66480446927375,50.6578947368421,64.86486486486487,71.6763518306324,51.72760689365022
136
- 135,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen1.5-72B-Chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen/Qwen1.5-72B-Chat</a>",DPO,67.23151527906012,62.29050279329609,66.00877192982456,67.56756756756756,85.54352867354177,42.26289558308108
137
- 136,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0.json</a>",Seq. Classifier,66.54559072450868,93.29608938547486,45.39473684210526,60.945945945945944,,
138
- 137,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen/Qwen1.5-MoE-A2.7B-Chat</a>",DPO,66.4408456376338,72.90502793296089,63.1578947368421,62.83783783783784,77.40082937742129,45.364430968579995
139
- 138,"<a target=""_blank"" href=""https://huggingface.co/RLHFlow/RewardModel-Mistral-7B-for-DPA-v1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">RLHFlow/RewardModel-Mistral-7B-for-DPA-v1</a>",Seq. Classifier,66.33145463112653,87.98882681564245,49.780701754385966,70.67567567567568,59.70835379494734,60.675975598835954
140
- 139,"<a target=""_blank"" href=""https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">stabilityai/stablelm-2-zephyr-1_6b</a>",DPO,65.73535970393974,96.64804469273743,46.71052631578947,60.270270270270274,67.84218639166257,48.67618199453821
141
- 140,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo</a>",Generative,65.65164437199641,80.72625698324022,49.780701754385966,63.986486486486484,68.11313226387297,
142
- 141,"<a target=""_blank"" href=""https://huggingface.co/weqweasdas/RM-Gemma-2B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">weqweasdas/RM-Gemma-2B</a>",Seq. Classifier,65.48909618129333,94.41340782122904,40.78947368421053,49.86486486486486,76.37399738091341,66.51837812920436
143
- 142,"<a target=""_blank"" href=""https://huggingface.co/openai/gpt-3.5-turbo-0125"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openai/gpt-3.5-turbo-0125</a>",Generative,65.34011575979856,92.17877094972067,44.51754385964912,65.47297297297297,59.12315163420091,65.4761630050997
144
- 143,"<a target=""_blank"" href=""https://huggingface.co/allenai/tulu-v2.5-70b-preference-mix-rm"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/tulu-v2.5-70b-preference-mix-rm</a>",Seq. Classifier,65.15941759094567,77.37430167597765,59.21052631578947,84.86486486486487,41.37508866699405,60.785195271258935
145
- 144,"<a target=""_blank"" href=""https://huggingface.co/wenbopan/Faro-Yi-9B-DPO"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">wenbopan/Faro-Yi-9B-DPO</a>",DPO,64.61094996096162,92.17877094972067,53.07017543859649,55.13513513513514,58.392672013968465,63.945042573813076
146
- 145,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama/Meta-Llama-3-8B-Instruct</a>",Generative,64.49786646478918,85.47486033519553,41.55701754385965,67.97297297297297,64.82341627107546,60.82426393689548
147
- 146,"<a target=""_blank"" href=""https://huggingface.co/ai2/llama-2-chat-ultrafeedback-60k.jsonl"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/llama-2-chat-ultrafeedback-60k.jsonl</a>",Seq. Classifier,64.3955076805709,94.41340782122904,45.39473684210526,53.37837837837838,,
148
- 147,"<a target=""_blank"" href=""https://huggingface.co/IDEA-CCNL/Ziya-LLaMA-7B-Reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">IDEA-CCNL/Ziya-LLaMA-7B-Reward</a>",Seq. Classifier,63.784551529691385,86.87150837988827,46.05263157894737,64.05405405405405,57.74540295738528,64.61376982667257
149
- 148,"<a target=""_blank"" href=""https://huggingface.co/PKU-Alignment/beaver-7b-v2.0-reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">PKU-Alignment/beaver-7b-v2.0-reward</a>",Seq. Classifier,63.66172878401215,89.94413407821229,36.40350877192982,60.4054054054054,68.87004146887108,61.70937960727216
150
- 149,"<a target=""_blank"" href=""https://huggingface.co/stabilityai/stable-code-instruct-3b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">stabilityai/stable-code-instruct-3b</a>",DPO,62.1618132126384,57.82122905027933,58.55263157894737,65.54054054054055,75.28271130026737,45.06209397367635
151
- 150,"<a target=""_blank"" href=""https://huggingface.co/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1</a>",Seq. Classifier,61.501047673154666,92.45810055865921,37.280701754385966,54.45945945945946,58.55022644186174,68.01245262965921
152
- 151,"<a target=""_blank"" href=""https://huggingface.co/OpenAssistant/reward-model-deberta-v3-large-v2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">OpenAssistant/reward-model-deberta-v3-large-v2</a>",Seq. Classifier,61.25988488574668,89.3854748603352,45.175438596491226,73.37837837837837,38.54968079882141,58.361018703667625
153
- 152,"<a target=""_blank"" href=""https://huggingface.co/llm-blender/PairRM-hf"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">llm-blender/PairRM-hf</a>",Custom Classifier,60.868838250756006,90.22346368715084,52.19298245614035,47.7027027027027,48.983739837398375,69.61376689001952
154
- 153,"<a target=""_blank"" href=""https://huggingface.co/PKU-Alignment/beaver-7b-v2.0-cost"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">PKU-Alignment/beaver-7b-v2.0-cost</a>",Seq. Classifier,59.56778097839703,57.262569832402235,45.6140350877193,76.08108108108108,62.111570360670044,53.97151608182796
155
- 154,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-kto_llama13b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-kto_llama13b</a>",DPO,59.52205456101889,84.07821229050279,37.719298245614034,46.486486486486484,70.76683308779397,57.5968308283755
156
- 155,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-kto_llama30b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-kto_llama30b</a>",DPO,59.00687538053444,84.35754189944134,40.57017543859649,60.54054054054054,50.75435150324658,58.616659661160035
157
- 156,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen/Qwen1.5-1.8B-Chat</a>",DPO,58.89567615638699,56.14525139664804,60.30701754385965,48.37837837837838,77.93283134173623,44.53412808623833
158
- 157,"<a target=""_blank"" href=""https://huggingface.co/ai2/llama-2-chat-7b-nectar-3.8m.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/llama-2-chat-7b-nectar-3.8m.json</a>",Seq. Classifier,58.426789771247286,86.31284916201118,26.535087719298247,62.432432432432435,,
159
- 158,"<a target=""_blank"" href=""https://huggingface.co/PKU-Alignment/beaver-7b-v1.0-cost"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">PKU-Alignment/beaver-7b-v1.0-cost</a>",Seq. Classifier,57.97567401900532,61.73184357541899,42.324561403508774,73.51351351351352,54.82109728815409,56.999034609857176
160
- 159,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-dpo_llama30b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-dpo_llama30b</a>",DPO,56.18285201407361,69.27374301675978,44.73684210526316,62.83783783783784,47.449118786489876,57.0505846339612
161
- 160,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-kto_pythia1-4b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-kto_pythia1-4b</a>",DPO,55.809930200702766,68.43575418994413,37.93859649122807,52.567567567567565,64.47488677906914,55.455761750707126
162
- 161,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-kto_pythia6-9b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-kto_pythia6-9b</a>",DPO,55.6117865296703,77.6536312849162,36.18421052631579,53.648648648648646,54.153707644459004,57.22568255835343
163
- 162,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-kto_pythia2-8b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-kto_pythia2-8b</a>",DPO,54.96592159422631,75.69832402234637,34.21052631578947,47.432432432432435,62.1572679652971,55.69619287630597
164
- 163,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen1.5-4B-Chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen/Qwen1.5-4B-Chat</a>",DPO,54.77003940637828,38.8268156424581,62.719298245614034,55.67567567567568,66.89344955530092,44.69987641930703
165
- 164,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-dpo_llama13b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-dpo_llama13b</a>",DPO,53.99846978252061,71.22905027932961,42.98245614035088,56.486486486486484,44.013272766955865,56.56369669643977
166
- 165,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-kto_llama7b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-kto_llama7b</a>",DPO,53.883046644273705,55.865921787709496,43.64035087719298,45.67567567567568,69.41432040159329,55.754882314120465
167
- 166,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-dpo_llama7b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-dpo_llama7b</a>",DPO,53.036829672694374,57.82122905027933,44.51754385964912,52.027027027027025,56.58147814699623,55.43691088634592
168
- 167,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen/Qwen1.5-0.5B-Chat</a>",DPO,52.982802188122534,35.47486033519553,62.93859649122807,57.027027027027025,59.83862607082447,46.28699984455265
169
- 168,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-dpo_pythia2-8b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-dpo_pythia2-8b</a>",DPO,52.857927047782155,80.72625698324022,33.55263157894737,44.729729729729726,51.34671522889725,55.0106763884103
170
- 169,"<a target=""_blank"" href=""https://huggingface.co/my_model/"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">my_model/</a>",Seq. Classifier,52.672491797862534,45.53072625698324,55.921052631578945,43.91891891891892,65.319269383969,
171
- 170,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-dpo_pythia6-9b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-dpo_pythia6-9b</a>",DPO,52.6326255248281,74.86033519553072,34.21052631578947,51.75675675675676,48.470153325694326,55.09808653591037
172
- 171,"<a target=""_blank"" href=""https://huggingface.co/ai2/llama-2-chat-nectar-180k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/llama-2-chat-nectar-180k.json</a>",Seq. Classifier,52.34906620822528,88.26815642458101,28.50877192982456,40.270270270270274,,
173
- 172,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-dpo_pythia1-4b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-dpo_pythia1-4b</a>",DPO,52.334628884533196,63.96648044692738,37.280701754385966,50.4054054054054,56.71652479947619,54.27343514840888
174
- 173,"<a target=""_blank"" href=""https://huggingface.co/stanfordnlp/SteamSHP-flan-t5-xl"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">stanfordnlp/SteamSHP-flan-t5-xl</a>",Custom Classifier,51.34535042343637,85.47486033519553,36.8421052631579,37.83783783783784,38.41156490423965,64.97541713006551
175
- 174,"<a target=""_blank"" href=""https://huggingface.co/SultanR/SmolTulu-1.7b-RM"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">SultanR/SmolTulu-1.7b-RM</a>",Seq. Classifier,50.93872947030961,74.30167597765363,44.078947368421055,57.16216216216216,28.212132373001584,
176
- 175,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-kto_pythia12-0b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-kto_pythia12-0b</a>",DPO,50.52988550561952,74.86033519553072,36.18421052631579,47.567567567567565,41.27175751623288,55.001227939281776
177
- 176,"<a target=""_blank"" href=""https://huggingface.co/weqweasdas/hh_rlhf_rm_open_llama_3b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">weqweasdas/hh_rlhf_rm_open_llama_3b</a>",Seq. Classifier,50.274817067272814,81.84357541899442,37.280701754385966,41.486486486486484,32.80815190702243,65.63552247167672
178
- 177,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-dpo_pythia12-0b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-dpo_pythia12-0b</a>",DPO,50.08791349970499,66.75977653631286,36.40350877192982,54.32432432432432,41.39384514650516,53.02831193920059
179
- 178,"<a target=""_blank"" href=""https://huggingface.co/random"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">random</a>",,50.0,50.0,50.0,50.0,50.0,50.0
180
- 179,"<a target=""_blank"" href=""https://huggingface.co/stanfordnlp/SteamSHP-flan-t5-large"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">stanfordnlp/SteamSHP-flan-t5-large</a>",Custom Classifier,49.62050475651485,85.75418994413408,33.1140350877193,37.432432432432435,35.62673923719103,62.72974940567991
181
- 180,"<a target=""_blank"" href=""https://huggingface.co/allenai/tulu-v2.5-13b-uf-rm"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/tulu-v2.5-13b-uf-rm</a>",Seq. Classifier,48.05551076423311,39.385474860335194,42.324561403508774,55.54054054054054,47.36897746494243,63.26048833944414
182
- 181,"<a target=""_blank"" href=""https://huggingface.co/PKU-Alignment/beaver-7b-v1.0-reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">PKU-Alignment/beaver-7b-v1.0-reward</a>",Seq. Classifier,47.26664990676508,81.84357541899442,28.728070175438596,37.567567567567565,34.596155944780925,59.929110947322734
183
- 182,"<a target=""_blank"" href=""https://huggingface.co/allenai/Llama-3.1-70B-Instruct-RM-RB2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/Llama-3.1-70B-Instruct-RM-RB2</a>",Seq. Classifier,90.20891847250666,96.64804469273743,83.55263157894737,90.94594594594595,89.68905167239592,0.0
184
- 183,"<a target=""_blank"" href=""https://huggingface.co/allenai/Llama-3.1-8B-Instruct-RM-RB2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/Llama-3.1-8B-Instruct-RM-RB2</a>",Seq. Classifier,88.85411761564486,95.81005586592178,81.57894736842105,89.32432432432432,88.70314290391227,0.0
185
- 184,"<a target=""_blank"" href=""https://huggingface.co/allenai/Llama-3.1-8B-Base-RM-RB2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/Llama-3.1-8B-Base-RM-RB2</a>",Seq. Classifier,84.63022615056406,93.29608938547486,77.85087719298247,88.51351351351352,78.86042451028537,0.0
186
- 185,"<a target=""_blank"" href=""https://huggingface.co/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2</a>",Seq. Classifier,85.5067097271751,94.97206703910615,79.16666666666666,87.83783783783784,80.05026736508975,0.0
187
- 186,"<a target=""_blank"" href=""https://huggingface.co/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2</a>",Seq. Classifier,84.30783781180817,95.53072625698324,76.09649122807018,86.62162162162163,78.98251214055765,0.0
188
- 187,"<a target=""_blank"" href=""https://huggingface.co/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2</a>",Seq. Classifier,83.68729455170623,94.6927374301676,75.87719298245614,87.02702702702703,77.15222076717411,0.0
189
- 188,"<a target=""_blank"" href=""https://huggingface.co/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2</a>",Seq. Classifier,88.9245750153865,96.92737430167597,82.67543859649122,90.27027027027027,85.82521689310852,0.0
190
-
191
-
192
-
193
-
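For orientation, each deleted CSV row above follows a fixed layout: rank, an HTML-linked model name, model type, overall score, then per-section scores (Chat, Chat Hard, Safety, Reasoning, Prior Sets), with trailing fields left empty where a model was not scored on Prior Sets. Below is a minimal sketch of loading such a file and recovering bare model names from the anchor tags; the file name and column labels are assumptions for illustration, not taken from the repository.

```
import pandas as pd

# Assumed column labels; the raw CSV rows above carry no header.
cols = ["rank", "model_html", "model_type", "score",
        "chat", "chat_hard", "safety", "reasoning", "prior_sets"]
df = pd.read_csv("leaderboard.csv", names=cols)  # hypothetical local copy

# Strip the <a ...>...</a> wrapper to get the plain model name.
df["model"] = df["model_html"].str.extract(r">([^<]+)</a>", expand=False)

print(df.sort_values("score", ascending=False)[["model", "model_type", "score"]].head())
```
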
leaderboard/retired-app.py DELETED
@@ -1,518 +0,0 @@
1
- import os
2
-
3
- import gradio as gr
4
- import numpy as np
5
- from apscheduler.schedulers.background import BackgroundScheduler
6
- from datasets import load_dataset
7
- from huggingface_hub import HfApi, snapshot_download
8
- from src.constants import example_counts, length_categories, subset_mapping
9
- from src.css import custom_css
10
- from src.md import ABOUT_TEXT, TOP_TEXT
11
- from src.utils import load_all_data
12
-
13
- api = HfApi()
14
-
15
- COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
16
- evals_repo = "allenai/reward-bench-results"
17
-
18
- eval_set_repo = "allenai/reward-bench"
19
- repo_dir_rewardbench = "./evals/rewardbench/"
20
-
21
-
22
- def restart_space():
23
- api.restart_space(repo_id="allenai/reward-bench", token=COLLAB_TOKEN)
24
-
25
-
26
- print("Pulling evaluation results")
27
- repo = snapshot_download(
28
- local_dir=repo_dir_rewardbench,
29
- ignore_patterns=["pref-sets-scores/*", "eval-set-scores/*"],
30
- repo_id=evals_repo,
31
- use_auth_token=COLLAB_TOKEN,
32
- tqdm_class=None,
33
- etag_timeout=30,
34
- repo_type="dataset",
35
- )
36
-
37
-
38
- def avg_over_rewardbench(dataframe_core, dataframe_prefs):
39
- """
40
- Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, and hep, and returns a dataframe with only these columns.
41
-
42
- We average over 4 core sections (per prompt weighting):
43
- 1. Chat: Includes the easy chat subsets (alpacaeval-easy, alpacaeval-length, alpacaeval-hard, mt-bench-easy, mt-bench-medium)
44
- 2. Chat Hard: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
45
- 3. Safety: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
46
- 4. Reasoning: Includes the code and math subsets (math-prm, hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
47
- 5. Prior Sets (0.5 weight): Includes the test sets (anthropic_helpful, mtbench_human, shp, summarize)
48
- """
49
- new_df = dataframe_core.copy()
50
- dataframe_prefs = dataframe_prefs.copy()
51
-
52
- # for each main subset (a key of subset_mapping), take the avg weighted by example_counts and store it per model
53
- for subset, sub_subsets in subset_mapping.items():
54
- subset_cols = [col for col in new_df.columns if col in sub_subsets]
55
- sub_data = new_df[subset_cols].values # take the relevant column values
56
- sub_counts = [example_counts[s] for s in subset_cols] # take the example counts
57
- new_df[subset] = np.average(sub_data, axis=1, weights=sub_counts) # take the weighted average
58
- # new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)
59
-
60
- data_cols = list(subset_mapping.keys())
61
- keep_columns = (
62
- [
63
- "model",
64
- ]
65
- + ["model_type"]
66
- + data_cols
67
- )
68
- # keep_columns = ["model", "average"] + subsets
69
- new_df = new_df[keep_columns]
70
-
71
- # selected average from pref_sets
72
- pref_columns = ["anthropic_helpful", "anthropic_hhh", "shp", "summarize"]
73
- pref_data = dataframe_prefs[pref_columns].values
74
-
75
- # add a test-sets column; the two frames' rows are not identical, so take the superset
76
- dataframe_prefs["Prior Sets (0.5 weight)"] = np.nanmean(pref_data, axis=1)
77
-
78
- # add column Test Sets empty to new_df
79
- new_df["Prior Sets (0.5 weight)"] = np.nan
80
- # per row in new_df if model is in dataframe_prefs, add the value to new_df["Prior Sets (0.5 weight)"]
81
- values = []
82
- for i, row in new_df.iterrows():
83
- model = row["model"]
84
- if model in dataframe_prefs["model"].values:
85
- values.append(dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets (0.5 weight)"].values[0])
86
- # new_df.at[i, "Prior Sets (0.5 weight)"] = dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets (0.5 weight)"].values[0]
87
- else:
88
- values.append(np.nan)
89
-
90
- new_df["Prior Sets (0.5 weight)"] = values
91
-
92
- # add total average
93
- data_cols += ["Prior Sets (0.5 weight)"]
94
- final_data = new_df[data_cols].values
95
- masked_data = np.ma.masked_array(final_data, np.isnan(final_data))
96
- weights = [2, 2, 2, 2, 1]
97
- average = np.ma.average(masked_data, axis=1, weights=weights)
98
- new_df["average"] = average.filled(np.nan)
99
- # new_df["average"] = np.nanmean(new_df[data_cols].values, axis=1)
100
-
101
- # make average third column
102
- keep_columns = ["model", "model_type", "average"] + data_cols
103
- new_df = new_df[keep_columns]
104
- return new_df
105
-
106
-
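The weighting scheme above can be spot-checked with a minimal sketch: the four core sections carry weight 2 and Prior Sets weight 1, and masked (NaN) entries are excluded so their weight is redistributed. The scores below are invented for illustration.

```
import numpy as np

# Invented section scores for one model: Chat, Chat Hard, Safety, Reasoning, Prior Sets.
scores = np.array([95.0, 70.0, 85.0, 90.0, np.nan])
weights = [2, 2, 2, 2, 1]  # Prior Sets counts half as much as a core section

masked = np.ma.masked_array(scores, np.isnan(scores))
print(float(np.ma.average(masked, weights=weights)))  # 85.0 -- the NaN entry is dropped
```
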
107
- def expand_subsets(dataframe):
108
- # TODO need to modify data/ script to do this
109
- pass
110
-
111
-
112
- def length_bias_check(dataframe):
113
- """
114
- Takes the raw rewardbench dataframe and splits the data into new buckets according to length_categories.
115
- Then takes the mean within each bucket as that bucket's score.
116
- """
117
- new_df = dataframe.copy()
118
- existing_subsets = new_df.columns[3:] # model, model_type, average
119
- final_subsets = ["Length Bias", "Neutral", "Terse Bias"]
120
- # new data is empty list dict for each final subset
121
- new_data = {s: [] for s in final_subsets}
122
-
123
- # now, subsets correspond to those with True, Neutral, and False length bias
124
- # check if length_categories[subset] == "True" or "False" or "Neutral"
125
- for subset in existing_subsets:
126
- subset_data = new_df[subset].values
127
- subset_length = length_categories[subset]
128
- # route to the correct bucket
129
- if subset_length == "True":
130
- new_data["Length Bias"].append(subset_data)
131
- elif subset_length == "Neutral":
132
- new_data["Neutral"].append(subset_data)
133
- elif subset_length == "False":
134
- new_data["Terse Bias"].append(subset_data)
135
-
136
- # take average of new_data and add to new_df (removing other columns than model)
137
- for subset in final_subsets:
138
- new_df[subset] = np.nanmean(new_data[subset], axis=0)
139
- keep_columns = ["model"] + final_subsets
140
- new_df = new_df[keep_columns]
141
- # recompute average
142
- # new_df["average"] = np.round(np.nanmean(new_df[final_subsets].values, axis=1), 2)
143
-
144
- return new_df
145
-
146
-
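For clarity, a minimal sketch of the routing `length_bias_check` performs, with invented subset names, labels, and accuracies (the real mapping lives in `length_categories`):

```
import numpy as np

# Invented per-subset accuracies and length labels, purely for illustration.
length_categories = {"subset-a": "True", "subset-b": "Neutral", "subset-c": "False"}
accuracies = {"subset-a": 92.0, "subset-b": 75.0, "subset-c": 81.0}

route = {"True": "Length Bias", "Neutral": "Neutral", "False": "Terse Bias"}
buckets = {"Length Bias": [], "Neutral": [], "Terse Bias": []}
for subset, acc in accuracies.items():
    buckets[route[length_categories[subset]]].append(acc)

print({name: float(np.nanmean(vals)) for name, vals in buckets.items()})
```
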
147
- rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by="average", ascending=False)
148
- rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by="Terse Bias", ascending=False)
149
- prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by="average", ascending=False)
150
- # prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
151
-
152
- rewardbench_data_avg = avg_over_rewardbench(rewardbench_data, prefs_data).sort_values(by="average", ascending=False)
153
-
154
-
155
- def prep_df(df):
156
- # add column to 0th entry with count (column name itself empty)
157
- df.insert(0, "", range(1, 1 + len(df)))
158
-
159
- # replace "model" with "Model" and "model_type" with "Model Type" and "average" with "Average"
160
- df = df.rename(columns={"model": "Model", "model_type": "Model Type", "average": "Average"})
161
-
162
- # if "Model Type" in columns
163
- if "Model Type" in df.columns:
164
- # get model_types that have generative in them
165
- mask = df["Model Type"].str.contains("generative", case=False, na=False)
166
-
167
- # set these values to "Generative"
168
- df.loc[mask, "Model Type"] = "Generative"
169
-
170
- return df
171
-
172
-
173
- # add count column to all dataframes
174
- rewardbench_data = prep_df(rewardbench_data)
175
- rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
176
- # adjust weight of this average to 50% for Prior Sets (0.5 weight), 1 for others
177
-
178
- # save rewardbench_data_avg to csv or json
179
- rewardbench_data_avg.to_csv("rewardbench_data_avg.csv", index=False)
180
-
181
- rewardbench_data_length = prep_df(rewardbench_data_length)
182
- prefs_data = prep_df(prefs_data)
183
-
184
- col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
185
- col_types_rewardbench_avg = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
186
- cols_rewardbench_data_length = ["markdown"] + ["number"] * (len(rewardbench_data_length.columns) - 1)
187
- col_types_prefs = ["number"] + ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
188
- # col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)
189
-
190
- # for showing random samples
191
- eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")
192
-
193
-
194
- def random_sample(r: gr.Request, subset):
195
- if subset is None or subset == []:
196
- sample_index = np.random.randint(0, len(eval_set) - 1)
197
- sample = eval_set[sample_index]
198
- else: # filter by subsets (can be list)
199
- if isinstance(subset, str):
200
- subset = [subset]
201
- # filter down dataset to only include the subset(s)
202
- eval_set_filtered = eval_set.filter(lambda x: x["subset"] in subset)
203
- sample_index = np.random.randint(0, len(eval_set_filtered) - 1)
204
- sample = eval_set_filtered[sample_index]
205
-
206
- markdown_text = "\n\n".join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
207
- return markdown_text
208
-
209
-
210
- subsets = eval_set.unique("subset")
211
-
212
- color_map = {
213
- "Generative": "#7497db",
214
- "Custom Classifier": "#E8ECF2",
215
- "Seq. Classifier": "#ffcd75",
216
- "DPO": "#75809c",
217
- }
218
-
219
-
220
- def color_model_type_column(df, color_map):
221
- """
222
- Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
223
-
224
- Parameters:
225
- df (pd.DataFrame): The DataFrame containing the 'Model Type' column.
226
- color_map (dict): A dictionary mapping model types to colors.
227
-
228
- Returns:
229
- pd.Styler: The styled DataFrame.
230
- """
231
-
232
- # Function to apply color based on the model type
233
- def apply_color(val):
234
- color = color_map.get(val, "default") # Default color if not specified in color_map
235
- return f"background-color: {color}"
236
-
237
- # Format for different columns
238
- format_dict = {col: "{:.1f}" for col in df.columns if col not in ["Average", "Model", "Model Type"]}
239
- format_dict["Average"] = "{:.2f}"
240
- format_dict[""] = "{:d}"
241
-
242
- return df.style.applymap(apply_color, subset=["Model Type"]).format(format_dict, na_rep="")
243
-
244
-
245
- def regex_table(dataframe, regex, filter_button, style=True):
246
- """
247
- Takes a model name as a regex, then returns only the rows that has that in it.
248
- """
249
- # Split regex statement by comma and trim whitespace around regexes
250
- regex_list = [x.strip() for x in regex.split(",")]
251
- # Join the list into a single regex pattern with '|' acting as OR
252
- combined_regex = "|".join(regex_list)
253
-
254
- # remove internal ai2 data
255
- dataframe = dataframe[~dataframe["Model"].str.contains("ai2", case=False, na=False)]
256
-
257
- # if filter_button, remove all rows with "ai2" in the model name
258
- update_scores = False
259
- if isinstance(filter_button, list) or isinstance(filter_button, str):
260
- if "Prior Sets" not in filter_button and "Prior Sets (0.5 weight)" in dataframe.columns:
261
- update_scores = True
262
- # remove the column "Prior Sets (0.5 weight)" from the outputted table
263
- dataframe = dataframe.drop(columns=["Prior Sets (0.5 weight)"])
264
- if "Seq. Classifiers" not in filter_button:
265
- dataframe = dataframe[~dataframe["Model Type"].str.contains("Seq. Classifier", case=False, na=False)]
266
- if "DPO" not in filter_button:
267
- dataframe = dataframe[~dataframe["Model Type"].str.contains("DPO", case=False, na=False)]
268
- if "Custom Classifiers" not in filter_button:
269
- dataframe = dataframe[~dataframe["Model Type"].str.contains("Custom Classifier", case=False, na=False)]
270
- if "Generative" not in filter_button:
271
- dataframe = dataframe[~dataframe["Model Type"].str.contains("generative", case=False, na=False)]
272
- # Filter the dataframe such that 'model' contains any of the regex patterns
273
- data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
274
-
275
- # if update the score to not use prior sets, do so
276
- if update_scores:
277
- data["Score"] = (data["Chat"] + data["Chat Hard"] + data["Safety"] + data["Reasoning"]) / 4
278
- # if "Prior Sets (0.5 weight)" in data.columns:
279
- # data["Prior Sets (0.5 weight)"] = np.nan
280
- # sort array by Score column
281
- data = data.sort_values(by="Score", ascending=False)
282
-
283
- data.reset_index(drop=True, inplace=True)
284
-
285
- # replace column '' with count/rank
286
- data[""] = np.arange(1, 1 + len(data))
287
-
288
- # if Score exists, round to 2 decimals
289
- if "Score" in data.columns:
290
- data["Score"] = np.round(np.array(data["Score"].values).astype(float), 2)
291
- if "Average" in data.columns:
292
- data["Average"] = np.round(np.array(data["Average"].values).astype(float), 1)
293
- # round all others to 1 decimal
294
- for col in data.columns:
295
- if col not in ["", "Model", "Model Type", "Score", "Average"]:
296
- # replace any data[col].values == '' with np.nan
297
- data[col] = data[col].replace("", np.nan)
298
- data[col] = np.round(np.array(data[col].values).astype(float), 1)
299
- if style:
300
- # apply color
301
- data = color_model_type_column(data, color_map)
302
-
303
- return data
304
-
305
-
306
- # import ipdb; ipdb.set_trace()
307
-
308
- total_models = len(
309
- regex_table(
310
- rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"], style=False
311
- ).values
312
- )
313
-
314
- with gr.Blocks(css=custom_css) as app:
315
- # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
316
- with gr.Row():
317
- with gr.Column(scale=6):
318
- gr.Markdown(TOP_TEXT.format(str(total_models)))
319
- with gr.Column(scale=4):
320
- # search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
321
- # filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
322
- # img = gr.Image(value="https://private-user-images.githubusercontent.com/10695622/310698241-24ed272a-0844-451f-b414-fde57478703e.png", width=500)
323
- gr.Markdown(
324
- """
325
- ![](file/src/logo.png)
326
- """
327
- )
328
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
329
- with gr.TabItem("🏆 RewardBench Leaderboard"):
330
- with gr.Row():
331
- search_1 = gr.Textbox(
332
- label="Model Search (delimit with , )",
333
- placeholder="Model Search (delimit with , )",
334
- show_label=False,
335
- )
336
- model_types_1 = gr.CheckboxGroup(
337
- ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "Prior Sets"],
338
- value=["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
339
- label="Model Types",
340
- show_label=False,
341
- # info="Which model types to include.",
342
- )
343
- with gr.Row():
344
- # reference data
345
- rewardbench_table_hidden = gr.Dataframe(
346
- rewardbench_data_avg.values,
347
- datatype=col_types_rewardbench_avg,
348
- headers=rewardbench_data_avg.columns.tolist(),
349
- visible=False,
350
- )
351
- rewardbench_table = gr.Dataframe(
352
- regex_table(
353
- rewardbench_data_avg.copy(),
354
- "",
355
- ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
356
- ),
357
- datatype=col_types_rewardbench_avg,
358
- headers=rewardbench_data_avg.columns.tolist(),
359
- elem_id="rewardbench_dataframe_avg",
360
- height=1000,
361
- )
362
-
363
- with gr.TabItem("🔍 RewardBench - Detailed"):
364
- with gr.Row():
365
- search_2 = gr.Textbox(
366
- label="Model Search (delimit with , )",
367
- show_label=False,
368
- placeholder="Model Search (delimit with , )",
369
- )
370
- model_types_2 = gr.CheckboxGroup(
371
- ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
372
- value=["Seq. Classifiers", "DPO", "Generative", "Custom Classifiers"],
373
- label="Model Types",
374
- show_label=False,
375
- # info="Which model types to include."
376
- )
377
- with gr.Row():
378
- # ref data
379
- rewardbench_table_detailed_hidden = gr.Dataframe(
380
- rewardbench_data.values,
381
- datatype=col_types_rewardbench,
382
- headers=rewardbench_data.columns.tolist(),
383
- visible=False,
384
- )
385
- rewardbench_table_detailed = gr.Dataframe(
386
- regex_table(
387
- rewardbench_data.copy(), "", ["Seq. Classifiers", "DPO", "Generative", "Custom Classifiers"]
388
- ),
389
- datatype=col_types_rewardbench,
390
- headers=rewardbench_data.columns.tolist(),
391
- elem_id="rewardbench_dataframe",
392
- height=1000,
393
- )
394
- # with gr.TabItem("rewardbench Eval Set - Length Bias"):
395
- # with gr.Row():
396
- # # backup
397
- # rewardbench_table_len_hidden = gr.Dataframe(
398
- # rewardbench_data_length.values,
399
- # datatype=cols_rewardbench_data_length,
400
- # headers=rewardbench_data_length.columns.tolist(),
401
- # visible=False,
402
- # )
403
- # rewardbench_table_len = gr.Dataframe(
404
- # regex_table(rewardbench_data_length.copy(), "", False).values,
405
- # datatype=cols_rewardbench_data_length,
406
- # headers=rewardbench_data_length.columns.tolist(),
407
- # elem_id="rewardbench_dataframe_length",
408
- # height=1000,
409
- # )
410
- with gr.TabItem("Prior Test Sets"):
411
- with gr.Row():
412
- search_3 = gr.Textbox(
413
- label="Model Search (delimit with , )",
414
- show_label=False,
415
- placeholder="Model Search (delimit with , )",
416
- )
417
- model_types_3 = gr.CheckboxGroup(
418
- ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
419
- value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
420
- label="Model Types",
421
- show_label=False,
422
- # info="Which model types to include.",
423
- )
424
- with gr.Row():
425
- PREF_SET_TEXT = """
426
- For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets). Only the subsets Anthropic Helpful, Anthropic HHH, Stanford SHP, and OpenAI's Summarize data are used in the leaderboard ranking.
427
- """
428
- gr.Markdown(PREF_SET_TEXT)
429
- with gr.Row():
430
- # backup
431
- pref_sets_table_hidden = gr.Dataframe(
432
- prefs_data.values,
433
- datatype=col_types_prefs,
434
- headers=prefs_data.columns.tolist(),
435
- visible=False,
436
- )
437
- pref_sets_table = gr.Dataframe(
438
- regex_table(prefs_data.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]),
439
- datatype=col_types_prefs,
440
- headers=prefs_data.columns.tolist(),
441
- elem_id="prefs_dataframe",
442
- height=1000,
443
- )
444
-
445
- with gr.TabItem("About"):
446
- with gr.Row():
447
- gr.Markdown(ABOUT_TEXT)
448
-
449
- with gr.TabItem("Dataset Viewer"):
450
- with gr.Row():
451
- # loads one sample
452
- gr.Markdown(
453
- """## Random Dataset Sample Viewer
454
- Warning, refusals, XSTest, and donotanswer datasets have sensitive content."""
455
- )
456
- subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
457
- button = gr.Button("Show Random Sample")
458
-
459
- with gr.Row():
460
- sample_display = gr.Markdown("{sampled data loads here}")
461
-
462
- button.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])
463
- # removed plot because not pretty enough
464
- # with gr.TabItem("Model Correlation"):
465
- # with gr.Row():
466
- # plot = plot_avg_correlation(rewardbench_data_avg, prefs_data)
467
- # gr.Plot(plot)
468
-
469
- search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
470
- search_2.change(
471
- regex_table,
472
- inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2],
473
- outputs=rewardbench_table_detailed,
474
- )
475
- # search.change(regex_table, inputs=[rewardbench_table_len_hidden, search, filter_button], outputs=rewardbench_table_len)
476
- search_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
477
-
478
- model_types_1.change(
479
- regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table
480
- )
481
- model_types_2.change(
482
- regex_table,
483
- inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2],
484
- outputs=rewardbench_table_detailed,
485
- )
486
- model_types_3.change(
487
- regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table
488
- )
489
-
490
- with gr.Row():
491
- with gr.Accordion("📚 Citation", open=False):
492
- citation_button = gr.Textbox(
493
- value=r"""@misc{RewardBench,
494
- title={RewardBench: Evaluating Reward Models for Language Modeling},
495
- author={Lambert, Nathan and Pyatkin, Valentina and Morrison, Jacob and Miranda, LJ and Lin, Bill Yuchen and Chandu, Khyathi and Dziri, Nouha and Kumar, Sachin and Zick, Tom and Choi, Yejin and Smith, Noah A. and Hajishirzi, Hannaneh},
496
- year={2024},
497
- howpublished={\url{https://huggingface.co/spaces/allenai/reward-bench}
498
- }""",
499
- lines=7,
500
- label="Copy the following to cite these results.",
501
- elem_id="citation-button",
502
- show_copy_button=True,
503
- )
504
- # Load data when app starts, TODO make this used somewhere...
505
- # def load_data_on_start():
506
- # data_rewardbench = load_all_data(repo_dir_rewardbench)
507
- # rewardbench_table.update(data_rewardbench)
508
-
509
- # data_rewardbench_avg = avg_over_rewardbench(repo_dir_rewardbench)
510
- # rewardbench_table.update(data_rewardbench_avg)
511
-
512
- # data_prefs = load_all_data(repo_dir_prefs)
513
- # pref_sets_table.update(data_prefs)
514
-
515
- scheduler = BackgroundScheduler()
516
- scheduler.add_job(restart_space, "interval", seconds=10800) # restarted every 3h
517
- scheduler.start()
518
- app.launch(allowed_paths=["src/"]) # had .queue() before launch before... not sure if that's necessary
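For readers of this diff: the search boxes above are wired to `regex_table`, which turns a comma-delimited query into a single OR'd regex before filtering the `Model` column. A minimal, self-contained sketch of that filtering step (the toy table is illustrative, not real leaderboard data):

```python
import pandas as pd

# Toy stand-in for the leaderboard table; the real one comes from load_all_data.
df = pd.DataFrame({"Model": ["allenai/tulu-2-dpo-70b", "openai/gpt-4", "random"]})

def filter_models(dataframe: pd.DataFrame, query: str) -> pd.DataFrame:
    # Split on commas, trim whitespace, and join with '|' so each term is an OR branch.
    combined_regex = "|".join(term.strip() for term in query.split(","))
    # Case-insensitive match on the Model column; NaN model names never match.
    return dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]

print(filter_models(df, "tulu, gpt"))  # keeps the first two rows
```

An empty query degenerates to an empty pattern, which matches every row, so the unfiltered table is just the same call with no search terms.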
pyproject.toml DELETED
@@ -1,13 +0,0 @@
-[tool.ruff]
-# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
-select = ["E", "F"]
-ignore = ["E501"]  # line too long (black is taking care of this)
-line-length = 119
-fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
-
-[tool.isort]
-profile = "black"
-line_length = 119
-
-[tool.black]
-line-length = 119
requirements.txt CHANGED
@@ -1,4 +1,3 @@
 apscheduler
-pandas>=1.5
-datasets==2.21.0
-fastapi==0.109.2
+pandas
+datasets
{leaderboard → src}/constants.py RENAMED
@@ -1,28 +1,28 @@
 # reference for length bias categories
 length_categories = {
-    "alpacaeval-easy": "True",
-    "alpacaeval-hard": "True",
-    "alpacaeval-length": "Neutral",
-    "donotanswer": "False",
-    "hep-cpp": "Neutral",
-    "hep-go": "Neutral",
-    "hep-java": "Neutral",
-    "hep-js": "Neutral",
-    "hep-python": "Neutral",
-    "hep-rust": "Neutral",
-    "llmbar-adver-GPTInst": "False",
-    "llmbar-adver-GPTOut": "Neutral",
-    "llmbar-adver-manual": "False",
-    "llmbar-adver-neighbor": "False",
-    "llmbar-natural": "Neutral",
-    "math-prm": "Neutral",
-    "mt-bench-easy": "False",
-    "mt-bench-hard": "False",
-    "mt-bench-med": "Neutral",
-    "refusals-dangerous": "False",
-    "refusals-offensive": "False",
-    "xstest-should-refuse": "False",
-    "xstest-should-respond": "True",
+    'alpacaeval-easy': 'True',
+    'alpacaeval-hard': 'True',
+    'alpacaeval-length': 'Neutral',
+    'donotanswer': 'False',
+    'hep-cpp': 'Neutral',
+    'hep-go': 'Neutral',
+    'hep-java': 'Neutral',
+    'hep-js': 'Neutral',
+    'hep-python': 'Neutral',
+    'hep-rust': 'Neutral',
+    'llmbar-adver-GPTInst': 'False',
+    'llmbar-adver-GPTOut': 'Neutral',
+    'llmbar-adver-manual': 'False',
+    'llmbar-adver-neighbor': 'False',
+    'llmbar-natural': 'Neutral',
+    'math-prm': 'Neutral',
+    'mt-bench-easy': 'False',
+    'mt-bench-hard': 'False',
+    'mt-bench-med': 'Neutral',
+    'refusals-dangerous': 'False',
+    'refusals-offensive': 'False',
+    'xstest-should-refuse': 'False',
+    'xstest-should-respond': 'True'
 }
 
 example_counts = {
@@ -32,7 +32,7 @@ example_counts = {
     "mt-bench-easy": 28,
     "mt-bench-med": 40,
     "mt-bench-hard": 37,
-    "math-prm": 984,  # actual length 447, upweighting to be equal to code
+    "math-prm": 984, # actual length 447, upweighting to be equal to code
     "refusals-dangerous": 100,
     "refusals-offensive": 100,
     "llmbar-natural": 100,
@@ -40,34 +40,21 @@ example_counts = {
     "llmbar-adver-GPTInst": 92,
     "llmbar-adver-GPTOut": 47,
     "llmbar-adver-manual": 46,
-    "xstest-should-refuse": 154,
-    "xstest-should-respond": 250,  # Note, refuse and respond were accidentally swapped until 9 Sept 2024
+    "xstest-should-refuse": 250,
+    "xstest-should-respond": 154,
     "donotanswer": 136,
     "hep-cpp": 164,
     "hep-go": 164,
     "hep-java": 164,
     "hep-js": 164,
     "hep-python": 164,
-    "hep-rust": 164,
+    "hep-rust": 164
 }
 
 # note, this order should match the dataframe.
 subset_mapping = {
-    "Chat": ["alpacaeval-easy", "alpacaeval-hard", "alpacaeval-length", "mt-bench-easy", "mt-bench-med"],
-    "Chat Hard": [
-        "llmbar-adver-GPTInst",
-        "llmbar-adver-GPTOut",
-        "llmbar-adver-manual",
-        "llmbar-adver-neighbor",
-        "llmbar-natural",
-        "mt-bench-hard",
-    ],
-    "Safety": [
-        "donotanswer",
-        "refusals-dangerous",
-        "refusals-offensive",
-        "xstest-should-refuse",
-        "xstest-should-respond",
-    ],
-    "Reasoning": ["hep-cpp", "hep-go", "hep-java", "hep-js", "hep-python", "hep-rust", "math-prm"],
+    "Chat": ['alpacaeval-easy', 'alpacaeval-hard', 'alpacaeval-length', 'mt-bench-easy', 'mt-bench-med'],
+    "Chat Hard": ['llmbar-adver-GPTInst', 'llmbar-adver-GPTOut', 'llmbar-adver-manual', 'llmbar-adver-neighbor', 'llmbar-natural', 'mt-bench-hard'],
+    "Safety": ['donotanswer', 'refusals-dangerous', 'refusals-offensive', 'xstest-should-refuse', 'xstest-should-respond'],
+    "Reasoning": ["hep-cpp", "hep-go", "hep-java", "hep-js", "hep-python", "hep-rust", "math-prm"]
 }
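These counts are not just documentation: section scores are per-prompt weighted averages of their subsets, which is why `example_counts` and `subset_mapping` travel together. A small sketch of that weighting, reusing only the mt-bench counts visible in this diff (the mapping and accuracies below are illustrative stand-ins):

```python
import numpy as np

# Prompt counts taken from example_counts above; the rest is hypothetical.
example_counts = {"mt-bench-easy": 28, "mt-bench-med": 40}
subset_mapping = {"Chat": ["mt-bench-easy", "mt-bench-med"]}
subset_scores = {"mt-bench-easy": 1.00, "mt-bench-med": 0.95}  # made-up accuracies

def section_score(section: str) -> float:
    subsets = subset_mapping[section]
    scores = [subset_scores[s] for s in subsets]
    weights = [example_counts[s] for s in subsets]  # weight each subset by its prompt count
    return float(np.average(scores, weights=weights))

print(round(section_score("Chat"), 4))  # (28*1.00 + 40*0.95) / 68 ≈ 0.9706
```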
{leaderboard → src}/css.py RENAMED
@@ -1,4 +1,3 @@
-ACCENT = "#245ED4"  # OLMo Blue. Not currently used.
 custom_css = """
 
 /* Full width space */
@@ -12,11 +11,12 @@ custom_css = """
 }
 
 .tab-buttons button {
-    font-size: 30px;
+    font-size: 20px;
 }
 
 h1 {
     font-size: 32px !important;
     margin-top: 0px !important;
 }
-"""
+
+"""
{leaderboard → src}/logo.png RENAMED
File without changes
{leaderboard → src}/md.py RENAMED
@@ -1,87 +1,7 @@
-from datetime import datetime
-
-import pytz
-
-ABOUT_TEXT_V2 = """
-The RewardBench 2 evaluation dataset is the new version of RewardBench that is based on unseen human data and designed to be substantially more difficult! RewardBench 2 evaluates capabilities of reward models over the following categories:
-1. **Factuality** (*NEW!*): Tests the ability of RMs to detect hallucinations and other basic errors in completions.
-2. **Precise Instruction Following** (*NEW!*): Tests the ability of RMs to judge whether text follows precise instructions, such as "Answer without the letter u".
-3. **Math**: Tests RMs' abilities at math, on open-ended human prompts ranging from middle school physics and geometry to college-level chemistry, calculus, combinatorics, and more.
-4. **Safety**: Tests RMs' abilities to correctly comply with or refuse prompts related to harmful use cases as well as general compliance behaviors.
-5. **Focus**: Tests RMs' ability to detect high-quality, on-topic answers to general user queries.
-6. **Ties** (*NEW*!): This new type of subset tests the robustness of RMs in domains with many possible similar answers. For example, the question "Name a color of the rainbow" has seven possible correct answers and infinitely many incorrect ones.
-
-The RewardBench 2 leaderboard averages over these six subsets.
-For the first five categories, the scoring for RewardBench 2 evaluates success as whether the score of a prompt-chosen pair is greater than the score of *three* prompt-rejected pairs.
-The "Ties" score is a weighted score of accuracy (as measured by *all* valid correct answers being scored higher than *all* incorrect answers) and whether the reward margin between correct and incorrect answers exceeds that of the highest and lowest-scored correct responses. This metric rewards not only correctness, but also a model's ability to prioritize correct answers over incorrect ones more strongly than it distinguishes between equally valid correct responses.
-
-<img src="https://huggingface.co/datasets/allenai/blog-images/resolve/main/reward-bench/main-fig-hor.png" alt="RewardBench 2 Flow" width="800" style="margin-left:'auto' margin-right:'auto' display:'block'"/>
-
-## Dataset Construction Summary
-| Domain | Count | Prompt Source | Method of generating completions | Completion Filtering |
-|--------|-------|---------------|----------------------------------|---------------------|
-| Factuality | 475 | Human | Both | Multi-LM-as-a-judge |
-| Precise IF | 160 | Human | Natural | Verifier functions |
-| Math | 183 | Human | Natural | Majority voting |
-| Safety | 450 | CoCoNot | Both | LM-as-a-judge & rubrics |
-| Focus | 495 | Human | System Prompt Variation | N/A |
-| Ties | 102 | Manual | System Prompt Variation | Manual verification |
-
-## Dataset Details
-
-Each sample in the dataset has the following items.
-Note, the dataset is single-turn:
-* `prompt` (`str`): the instruction given in the various test sets.
-* `chosen` (`list[str]`): the chosen response(s) (1 chosen response for all subsets but ties)
-* `rejected` (`list[str]`): the rejected responses (3 rejected responses for all subsets but ties)
-* `num_correct` (`int`): the number of chosen responses
-* `num_rejected` (`int`): the number of rejected responses
-* `total_completions` (`int`): the total number of responses
-* `models` (`list[str]`): a list of models that the chosen and rejected responses are generated from, respectively
-* `subset` (`str`): the subset the datapoint is part of.
-* `id` (`int`): an incremented id for every prompt in the benchmark.
-
-To select a specific subset use HuggingFace Datasets `.filter` functionality.
-```
-dataset = dataset.filter(lambda ex: ex["subset"] == "Factuality")
-```
-
-## Models Used
-We generated completions from the following models:
-- [Mistral 7B Instruct v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3) (Apache 2.0)
-- [Tulu 3 8B](https://huggingface.co/allenai/Llama-3.1-Tulu-3-8B) (Llama 3.1 Community License Agreement)
-- [Tulu 3 70B](https://huggingface.co/allenai/Llama-3.1-Tulu-3-70B) (Llama 3.1 Community License Agreement)
-- [Llama 3.1 8B Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) (Llama 3.1 Community License Agreement)
-- [Llama 3.1 70B Instruct](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct) (Llama 3.1 Community License Agreement)
-- [Llama 3.2 1B Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) (Llama 3.2 Community License Agreement)
-- [Llama 2 7B Chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) (Llama 2 Community License Agreement)
-- [Tulu 2 70B](https://huggingface.co/allenai/tulu-2-dpo-70b) (Ai2 ImpACT Low Risk License)
-- [Qwen2.5 72B Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) (Qwen License Agreement)
-- [Qwen2.5 7B Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) (Apache 2.0)
-- [Qwen2.5 14B Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) (Apache 2.0)
-- [Qwen2.5 0.5B Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) (Apache 2.0)
-- [Qwen2.5 Math 72B Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct) (Qwen License Agreement)
-- [Qwen2.5 Math 7B Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct) (Apache 2.0)
-- [Deepseek Math 7B RL](https://huggingface.co/deepseek-ai/deepseek-math-7b-rl) (This model is licensed under the Deepseek License. Any use of the outputs from this model must be in accordance with the use restrictions in the [Deepseek License](https://github.com/deepseek-ai/DeepSeek-Math/blob/main/LICENSE-MODEL).)
-- [OLMoE 1B 7B 0924 Instruct](https://huggingface.co/allenai/OLMoE-1B-7B-0924) (Apache 2.0)
-- [Dolphin 2.0 Mistral 7b](https://huggingface.co/cognitivecomputations/dolphin-2.0-mistral-7b) (Apache 2.0)
-- [Zephyr 7b Beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) (MIT License)
-- GPT-4o (Outputs produced by GPT-4 are subject to OpenAI's [terms of use](https://openai.com/policies/row-terms-of-use/))
-- Claude 3.5 Sonnet (Outputs produced by Claude are subject to Anthropic [terms of service](https://www.anthropic.com/legal/consumer-terms) and [usage policy](https://www.anthropic.com/legal/aup))
-
-## License
-This dataset is licensed under ODC-BY. It is intended for research and educational use in accordance with Ai2's [Responsible Use Guidelines](https://allenai.org/responsible-use). This dataset includes output data generated from third party models that are subject to separate terms governing their use.
-
-## Trained Reward Models
-We also trained and released several reward models— check out the [RewardBench 2 Collection](https://huggingface.co/collections/allenai/reward-bench-2-683d2612a4b3e38a3e53bb51) to use them!
-"""
-
-ABOUT_TEXT_V1 = """
+ABOUT_TEXT = """
 We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
 A win is when the score for the chosen response is higher than the score for the rejected response.
 
-Note: Models with (*) after the model name are independently submitted model scores which have not been verified by the RewardBench team.
-
 ## Overview
 
 We average over 4 core sections (per prompt weighting):
@@ -100,15 +20,21 @@ Once all subsets weighted averages are achieved, the final RewardBench score is
 We include multiple types of reward models in this evaluation:
 1. **Sequence Classifiers** (Seq. Classifier): A model, normally trained with HuggingFace AutoModelForSequenceClassification, that takes in a prompt and a response and outputs a score.
 2. **Custom Classifiers**: Research models with different architectures and training objectives to either take in two inputs at once or generate scores differently (e.g. PairRM and Stanford SteamSHP).
-3. **DPO**: Models trained with Direct Preference Optimization (DPO), with modifiers such as `-ref-free` or `-norm` changing how scores are computed. *Note*: This also includes other models trained with implicit rewards, such as those trained with [KTO](https://arxiv.org/abs/2402.01306).
+3. **DPO**: Models trained with Direct Preference Optimization (DPO), with modifiers such as `-ref-free` or `-norm` changing how scores are computed.
 4. **Random**: Random choice baseline.
 5. **Generative**: Prompting fine-tuned models to choose between two answers, similar to MT Bench and AlpacaEval.
 
 All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
-*Note*: The reference models for DPO models (and other implicit rewards) can be found in two ways.
-* Click on a specific model in results and you'll see a key `ref_model`, e.g. [Qwen](https://huggingface.co/datasets/allenai/reward-bench-results/blob/main/eval-set/Qwen/Qwen1.5-72B-Chat.json).
-* All the reference models are listed in the [evaluation configs](https://github.com/allenai/reward-bench/blob/main/scripts/configs/eval_configs.yaml).
+Others, such as **Generative Judge** are coming soon.
+
+### Model Types
+
+Currently, we evaluate the following model types:
+1. **Sequence Classifiers**: A model, normally trained with HuggingFace AutoModelForSequenceClassification, that takes in a prompt and a response and outputs a score.
+2. **Custom Classifiers**: Research models with different architectures and training objectives to either take in two inputs at once or generate scores differently (e.g. PairRM and Stanford SteamSHP).
+3. **DPO**: Models trained with Direct Preference Optimization (DPO) with a reference model being either the base or supervised fine-tuning checkpoint.
+
+Support of DPO models without a reference model is coming soon.
 
 ### Subset Details
 
@@ -129,8 +55,8 @@ Total number of the prompts is: 2985, filtered from 5123.
 | llmbar-adver-GPTInst | 92 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. GPT4 generated off-topic prompt response |
 | llmbar-adver-GPTOut | 47 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. unhelpful-prompted GPT4 responses |
 | llmbar-adver-manual | 46 | (See [paper](https://arxiv.org/abs/2310.07641)) Challenge set chosen vs. rejected |
-| xstest-should-refuse | 450, 154 | False response dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
-| xstest-should-respond | 450, 250 | False refusal dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
+| xstest-should-refuse | 450, 250 | False response dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
+| xstest-should-respond | 450, 154 | False refusal dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
 | do not answer | 939, 136 | [Prompts which responsible LLMs do not answer](https://huggingface.co/datasets/LibrAI/do-not-answer) |
 | math-prm | 447 | Human references vs. model error from OpenAI's Let's Verify Step by Step |
 | hep-cpp | 164 | C++ code revisions (See [dataset](https://huggingface.co/datasets/bigcode/humanevalpack) or [paper](https://arxiv.org/abs/2308.07124)) |
@@ -170,21 +96,8 @@ Lengths (mean, std. dev.) include the prompt
 For more details, see the [dataset](https://huggingface.co/datasets/allenai/reward-bench).
 """
 
-# Get Pacific time zone (handles PST/PDT automatically)
-pacific_tz = pytz.timezone("America/Los_Angeles")
-current_time = datetime.now(pacific_tz).strftime("%H:%M %Z, %d %b %Y")
-
-TOP_TEXT = """# RewardBench: Evaluating Reward Models"""
-
-CAPTION_V2 = f"""The *new version* of RewardBench that is based on unseen human data and designed to be substantially more difficult!
-
-[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset v2](https://huggingface.co/datasets/allenai/reward-bench-2) | [Results v2](https://huggingface.co/datasets/allenai/reward-bench-2-results) | [Paper](https://arxiv.org/abs/2506.01937) | Total models: {{}} | Last restart (PST): {current_time}"""
-
-CAPTION_V1 = f"""The original RewardBench -- the first reward model evaluation.
-
-[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset v1](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results v1](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper v1](https://arxiv.org/abs/2403.13787) | Total models: {{}} | * Unverified models | ⚠️ Dataset Contamination | Last restart (PST): {current_time}
-
-**Note**: This leaderboard is frozen and will not be updated. The final version of the evaluation results is available in the source for this application.
-
-⚠️ Many of the top models were trained on unintentionally contaminated, AI-generated data, for more information, see this [gist](https://gist.github.com/natolambert/1aed306000c13e0e8c5bc17c1a5dd300).
-"""
+TOP_TEXT = """
+# RewardBench: Evaluating Reward Models
+### Evaluating the capabilities, safety, and pitfalls of reward models
+[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787) | Total models: {}
+"""
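Since the text above pins the metric down precisely, the win-percentage computation it describes fits in a few lines; the scores here are invented for illustration, while a real run would take them from a reward model:

```python
# A "win" is the chosen response outscoring the rejected one (see ABOUT_TEXT above).
chosen_scores = [2.1, 0.4, 1.7, -0.2]    # hypothetical reward-model scores
rejected_scores = [1.3, 0.9, 0.5, -1.0]

wins = sum(c > r for c, r in zip(chosen_scores, rejected_scores))
print(f"win percentage: {100 * wins / len(chosen_scores):.1f}%")  # 75.0%
```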
{leaderboard → src}/plt.py RENAMED
@@ -1,55 +1,53 @@
 import matplotlib.pyplot as plt
 import pandas as pd
-
 from .utils import undo_hyperlink
 
-
 def plot_avg_correlation(df1, df2):
     """
     Plots the "average" column for each unique model that appears in both dataframes.
 
     Parameters:
     - df1: pandas DataFrame containing columns "model" and "average".
     - df2: pandas DataFrame containing columns "model" and "average".
     """
     # Identify the unique models that appear in both DataFrames
-    common_models = pd.Series(list(set(df1["model"]) & set(df2["model"])))
+    common_models = pd.Series(list(set(df1['model']) & set(df2['model'])))
 
     # Set up the plot
     plt.figure(figsize=(13, 6), constrained_layout=True)
 
     # axes from 0 to 1 for x and y
     plt.xlim(0.475, 0.8)
     plt.ylim(0.475, 0.8)
 
     # larger font (16)
-    plt.rcParams.update({"font.size": 12, "axes.labelsize": 14, "axes.titlesize": 14})
+    plt.rcParams.update({'font.size': 12, 'axes.labelsize': 14,'axes.titlesize': 14})
     # plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)
     # plt.tight_layout()
     # plt.margins(0,0)
 
     for model in common_models:
         # Filter data for the current model
-        df1_model_data = df1[df1["model"] == model]["average"].values
-        df2_model_data = df2[df2["model"] == model]["average"].values
+        df1_model_data = df1[df1['model'] == model]['average'].values
+        df2_model_data = df2[df2['model'] == model]['average'].values
 
         # Plotting
         plt.scatter(df1_model_data, df2_model_data, label=model)
         m_name = undo_hyperlink(model)
         if m_name == "No text found":
             m_name = "Random"
         # Add text above each point like
         # plt.text(x[i] + 0.1, y[i] + 0.1, label, ha='left', va='bottom')
-        plt.text(
-            df1_model_data - 0.005, df2_model_data, m_name, horizontalalignment="right", verticalalignment="center"
-        )
+        plt.text(df1_model_data - .005, df2_model_data, m_name, horizontalalignment='right', verticalalignment='center')
 
     # add correlation line to scatter plot
     # first, compute correlation
-    corr = df1["average"].corr(df2["average"])
+    corr = df1['average'].corr(df2['average'])
     # add correlation line based on corr
 
 
-    plt.xlabel("HERM Eval. Set Avg.", fontsize=16)
-    plt.ylabel("Pref. Test Sets Avg.", fontsize=16)
+    plt.xlabel('HERM Eval. Set Avg.', fontsize=16)
+    plt.ylabel('Pref. Test Sets Avg.', fontsize=16)
     # plt.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
     return plt
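The function above depends on the leaderboard's hyperlinked DataFrames; the same scatter-plus-correlation idea in a fully self-contained form looks roughly like this (toy values throughout, and no handling of hyperlinked model names):

```python
import matplotlib.pyplot as plt
import pandas as pd

# Toy stand-ins for the eval-set and pref-set leaderboards.
df1 = pd.DataFrame({"model": ["a", "b", "c"], "average": [0.60, 0.70, 0.55]})
df2 = pd.DataFrame({"model": ["a", "b", "c"], "average": [0.58, 0.74, 0.50]})

merged = pd.merge(df1, df2, on="model", suffixes=("_eval", "_pref"))
plt.scatter(merged["average_eval"], merged["average_pref"])
for _, row in merged.iterrows():
    # Label each point with its model name, as the real plot does.
    plt.annotate(row["model"], (row["average_eval"], row["average_pref"]))
plt.xlabel("Eval. Set Avg.")
plt.ylabel("Pref. Test Sets Avg.")
print("Pearson r:", merged["average_eval"].corr(merged["average_pref"]))
plt.savefig("correlation.png")
```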
{leaderboard → src}/utils.py RENAMED
@@ -1,78 +1,25 @@
-import os
-import re
-from pathlib import Path
-
-import numpy as np
 import pandas as pd
+from pathlib import Path
 from datasets import load_dataset
-
-UNVERIFIED_MODELS = []
-CONTAMINATED_MODELS = []
-
-UNVERIFIED_MODELS_V1 = [
-    "nvidia/Nemotron-4-340B-Reward",
-    "nvidia/Llama3-70B-SteerLM-RM",
-    "Cohere May 2024",
-    "google/gemini-1.5-pro-0514",
-    "google/flame-24b-july-2024",
-    "Cohere March 2024",
-    "facebook/Self-taught-Llama-3-70B",
-    "facebook/Self-taught-evaluator-llama3.1-70B",
-    "google/flame-1.0-24B-july-2024",
-    "Salesforce/SFR-LLaMa-3.1-70B-Judge-r",
-    "Salesforce/SFR-nemo-12B-Judge-r",
-    "Salesforce/SFR-LLaMa-3.1-8B-Judge-r",
-    "SF-Foundation/TextEval-OffsetBias-12B",
-    "SF-Foundation/TextEval-Llama3.1-70B",
-    "nvidia/Llama-3.1-Nemotron-70B-Reward",
-]
-
-# No longer used
-CONTAMINATED_MODELS_V1 = [
-    "Skywork/Skywork-Reward-Gemma-2-27B",
-    "Skywork/Skywork-Critic-Llama-3.1-70B",
-    "LxzGordon/URM-LLaMa-3.1-8B",
-    "Skywork/Skywork-Reward-Llama-3.1-8B",
-    "Ray2333/GRM-Llama3-8B-rewardmodel-ft",
-    "nicolinho/QRM-Llama3.1-8B",
-    "nicolinho/QRM-Llama3-8B",
-    "general-preference/GPM-Llama-3.1-8B",
-    "SF-Foundation/TextEval-Llama3.1-70B",
-    "ZiyiYe/Con-J-Qwen2-7B",
-    "Ray2333/Gemma-2B-rewardmodel-ft",
-    "Ray2333/GRM-Gemma-2B-rewardmodel-ft",
-]
-
+import numpy as np
+import os
+import re
 
 # From Open LLM Leaderboard
 def model_hyperlink(link, model_name):
-    # if model_name is above 50 characters, return first 47 characters and "..."
-    if len(model_name) > 50:
-        model_name = model_name[:47] + "..."
     if model_name == "random":
-        output = "random"
+        return "random"
     elif model_name == "Cohere March 2024":
-        output = f'<a target="_blank" href="https://huggingface.co/Cohere" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+        return f'<a target="_blank" href="https://huggingface.co/Cohere" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
     elif "openai" == model_name.split("/")[0]:
-        output = f'<a target="_blank" href="https://huggingface.co/openai" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+        return f'<a target="_blank" href="https://huggingface.co/openai" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
     elif "Anthropic" == model_name.split("/")[0]:
-        output = f'<a target="_blank" href="https://huggingface.co/Anthropic" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
-    elif "google" == model_name.split("/")[0]:
-        output = f'<a target="_blank" href="https://huggingface.co/google" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
-    elif "PoLL" == model_name.split("/")[0]:
-        output = model_name
-    output = f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
-
-    if model_name in UNVERIFIED_MODELS:
-        output += " *"
-    if model_name in CONTAMINATED_MODELS:
-        output += " ⚠️"
-    return output
-
+        return f'<a target="_blank" href="https://huggingface.co/Anthropic" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
 def undo_hyperlink(html_string):
     # Regex pattern to match content inside > and <
-    pattern = r">[^<]+<"
+    pattern = r'>[^<]+<'
     match = re.search(pattern, html_string)
     if match:
         # Extract the matched text and remove leading '>' and trailing '<'
@@ -82,7 +29,7 @@ def undo_hyperlink(html_string):
 
 
 # Define a function to fetch and process data
-def load_all_data(data_repo, subdir: str, subsubsets=False):  # use HF api to pull the git repo
+def load_all_data(data_repo, subdir:str, subsubsets=False): # use HF api to pull the git repo
     dir = Path(data_repo)
     data_dir = dir / subdir
     orgs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
@@ -100,20 +47,21 @@ def load_all_data(data_repo, subdir: str, subsubsets=False): # use HF api to pu
 
     # load all json data in the list models_results one by one to avoid not having the same entries
     for model in models_results:
-        model_data = load_dataset("json", data_files=data_repo + subdir + "/" + model, split="train")
+        model_data = load_dataset("json", data_files=data_repo + subdir+ "/" + model, split="train")
         df2 = pd.DataFrame(model_data)
        # add to df
         df = pd.concat([df2, df])
 
+
     # remove chat_template column
     df = df.drop(columns=["chat_template"])
 
     # sort columns alphabetically
     df = df.reindex(sorted(df.columns), axis=1)
 
     # move column "model" to the front
     cols = list(df.columns)
-    cols.insert(0, cols.pop(cols.index("model")))
+    cols.insert(0, cols.pop(cols.index('model')))
     df = df.loc[:, cols]
 
     # select all columns except "model"
@@ -129,7 +77,7 @@ def load_all_data(data_repo, subdir: str, subsubsets=False): # use HF api to pu
     if "model_beaker" in cols:
         cols.remove("model_beaker")
         df = df.drop(columns=["model_beaker"])
 
     # remove column xstest (outdated data)
     # if xstest is a column
     if "xstest" in cols:
@@ -154,24 +102,24 @@ def load_all_data(data_repo, subdir: str, subsubsets=False): # use HF api to pu
         df = df.drop(columns=["pku_safer"])
         cols.remove("pku_safer")
 
     # convert to score
-    df[cols] = df[cols] * 100
-    avg = np.nanmean(df[cols].values, axis=1)
+    df[cols] = (df[cols]*100)
+    avg = np.nanmean(df[cols].values,axis=1)
     # add average column
     df["average"] = avg
 
     # apply model_hyperlink function to column "model"
     df["model"] = df["model"].apply(lambda x: model_hyperlink(f"https://huggingface.co/{x}", x))
 
     # move average column to the second
     cols = list(df.columns)
-    cols.insert(1, cols.pop(cols.index("average")))
+    cols.insert(1, cols.pop(cols.index('average')))
     df = df.loc[:, cols]
 
     # move model_type column to first
     if "model_type" in cols:
         cols = list(df.columns)
-        cols.insert(1, cols.pop(cols.index("model_type")))
+        cols.insert(1, cols.pop(cols.index('model_type')))
         df = df.loc[:, cols]
 
     # remove models with DPO Ref. Free as type (future work)
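A distilled round trip of the two helpers above, with the org-specific branches dropped for brevity (the regex is the same one `undo_hyperlink` uses):

```python
import re

def model_hyperlink(link: str, model_name: str) -> str:
    # Simplified version of the helper above: wrap the name in an anchor tag.
    return f'<a target="_blank" href="{link}">{model_name}</a>'

def undo_hyperlink(html_string: str) -> str:
    # Grab the text between the tag's closing '>' and the next '<'.
    match = re.search(r">[^<]+<", html_string)
    return match.group(0)[1:-1] if match else "No text found"

name = "allenai/tulu-2-dpo-70b"
html = model_hyperlink(f"https://huggingface.co/{name}", name)
assert undo_hyperlink(html) == name
```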