import os
from pathlib import Path

import gradio as gr
import numpy as np
import pandas as pd
from datasets import load_dataset
from huggingface_hub import HfApi, Repository

api = HfApi()

COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
evals_repo = "ai2-rlhf-collab/rm-benchmark-results"
BASE_DIR = "./evals/"


def model_hyperlink(link, model_name):
    # Render a model name as a dotted-underline link for the leaderboard table.
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">{model_name}</a>'


# Clone (or update) the dataset repository that stores the per-model evaluation results.
print("Pulling evaluation results")
repo = Repository(
    local_dir=BASE_DIR,
    clone_from=evals_repo,
    use_auth_token=COLLAB_TOKEN,
    repo_type="dataset",
)
repo.git_pull()
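
# Side note: `Repository` is a git-based helper that newer huggingface_hub releases
# treat as legacy. A rough alternative sketch (assuming a huggingface_hub version
# that supports the `local_dir` and `token` arguments) would be:
#
#     from huggingface_hub import snapshot_download
#
#     snapshot_download(
#         repo_id=evals_repo,
#         repo_type="dataset",
#         local_dir=BASE_DIR,
#         token=COLLAB_TOKEN,
#     )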
|
def fetch_and_display_data(): |
|
dir = Path(BASE_DIR) |
|
data_dir = dir / "data" |
|
orgs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))] |
|
|
|
models_results = [] |
|
for org in orgs: |
|
org_dir = data_dir / org |
|
files = [f for f in os.listdir(org_dir) if os.path.isfile(os.path.join(org_dir, f))] |
|
for file in files: |
|
if file.endswith(".json"): |
|
models_results.append(org + "/" + file) |
|
|
|
|
|
df = pd.DataFrame() |
|
|
|
|
|
for model in models_results: |
|
model_data = load_dataset("json", data_files=BASE_DIR + "data/" + model, split="train") |
|
df2 = pd.DataFrame(model_data) |
|
|
|
df = pd.concat([df2, df]) |
|
|
|
|
|
|
|
df = df.drop(columns=["chat_template"]) |
|
|
|
|
|
cols = list(df.columns) |
|
cols.insert(0, cols.pop(cols.index('model'))) |
|
df = df.loc[:, cols] |
|
|
|
|
|
cols = df.columns.tolist() |
|
cols.remove("model") |
|
|
|
df[cols] = df[cols].round(2) |
|
avg = np.mean(df[cols].values,axis=1).round(2) |
|
|
|
df["average"] = avg |
|
|
|
|
|
df["model"] = df["model"].apply(lambda x: model_hyperlink(f"https://huggingface.co/{x}", x)) |
|
|
|
|
|
cols = list(df.columns) |
|
cols.insert(1, cols.pop(cols.index('average'))) |
|
df = df.loc[:, cols] |
|
return df |


benchmark_text = """
# HERM Results Viewer

We compute the win percentage for a reward model on hand-curated chosen-rejected pairs for each prompt.
A win is counted when the reward model scores the chosen response higher than the rejected response.

### Subset summary

| Subset                 | Num. Samples (Pre-filtering, post-filtering) | Description |
| :--------------------- | :------------------------------------------: | :---------------------------------------------------------------- |
| alpacaeval-easy        | 805 | Great model vs poor model |
| alpacaeval-length      | 805 | Good model vs low-quality model, equal length |
| alpacaeval-hard        | 805 | Great model vs baseline model |
| mt-bench-easy          | 28, 28 | MT Bench 10s vs 1s |
| mt-bench-medium        | 45, 40 | MT Bench 9s vs 2-5s |
| mt-bench-hard          | 45, 37 | MT Bench 7-8 vs 5-6 |
| refusals-dangerous     | 505 | Dangerous response vs no response |
| refusals-offensive     | 704 | Offensive response vs no response |
| llmbar-natural         | 100 | (See [paper](https://arxiv.org/abs/2310.07641)) Manually curated instruction pairs |
| llmbar-adver-neighbor  | 134 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. off-topic prompt response |
| llmbar-adver-GPTInst   | 92 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. GPT4-generated off-topic prompt response |
| llmbar-adver-GPTOut    | 47 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. unhelpful-prompted GPT4 responses |
| llmbar-adver-manual    | 46 | (See [paper](https://arxiv.org/abs/2310.07641)) Challenge set chosen vs. rejected |
| XSTest                 | 450 | TODO curate |
| (?) repetitiveness     | | |
| (?) grammar            | | |

For more details, see the [dataset](https://huggingface.co/datasets/ai2-rlhf-collab/rm-benchmark-dev).
"""
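

# Illustrative only: a minimal sketch of the win-percentage metric described in the
# text above. It is not used by this app (the scores are precomputed in the results
# dataset), and the field names `chosen_score`/`rejected_score` are assumptions.
def example_win_percentage(records):
    """Percentage of pairs where the chosen response outscores the rejected one."""
    wins = sum(1 for r in records if r["chosen_score"] > r["rejected_score"])
    return 100 * wins / len(records)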


leaderboard_data = fetch_and_display_data()

with gr.Blocks() as app:
    with gr.Row():
        gr.Markdown(benchmark_text)

    with gr.Row():
        output_table = gr.Dataframe(
            leaderboard_data.values,
            headers=leaderboard_data.columns.tolist(),
        )
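        # Note (assumption): the "model" column contains raw HTML anchors from
        # model_hyperlink; depending on the Gradio version, rendering them as
        # clickable links may require passing a per-column `datatype` list with
        # "markdown" for that column to gr.Dataframe.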

    # Refresh the table contents whenever the app is (re)loaded in the browser.
    def load_data_on_start():
        return fetch_and_display_data()

    app.load(load_data_on_start, inputs=None, outputs=output_table)


app.launch()