Spaces:
Running
Running
File size: 10,595 Bytes
12ca829 437869e 12ca829 6b8fd7d 437869e 12ca829 6b8fd7d 12ca829 fa72a9a 12ca829 c1ec713 12ca829 c1ec713 437869e 12ca829 c1ec713 12ca829 c1ec713 12ca829 437869e 12ca829 437869e c1ec713 437869e 12ca829 437869e 12ca829 c1ec713 12ca829 c1ec713 12ca829 c1ec713 12ca829 c1ec713 12ca829 437869e 12ca829 c1ec713 12ca829 c1ec713 12ca829 c1ec713 12ca829 c1ec713 12ca829 c1ec713 12ca829 437869e 12ca829 c1ec713 12ca829 437869e c1ec713 12ca829 c1ec713 12ca829 437869e 12ca829 437869e 12ca829 437869e 12ca829 c1ec713 12ca829 437869e 12ca829 437869e c1ec713 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 |
import os
import json
import datetime
from email.utils import parseaddr
import gradio as gr
import pandas as pd
from datasets import load_dataset
from evaluation.evaluator import question_scorer as eval_scorer
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi
from content import format_error, format_warning, format_log, TITLE
# Local entry point for scoring; delegates to the imported evaluator
def question_scorer(prediction, gold_answer):
    """Score ``prediction`` against ``gold_answer`` using ``eval_scorer``.

    Both arguments are passed through unchanged and the evaluator's result
    is returned as-is.
    """
    score = eval_scorer(prediction, gold_answer)
    return score
# Constants and Configuration
# Hub access token read from the environment; None when the variable is unset.
TOKEN = os.environ.get("TOKEN")
OWNER = "Ori"
# Every repository id is derived from OWNER so the namespace is defined once.
DATA_DATASET = f"{OWNER}/AssistantBench_V1.0"
RESULTS_DATASET = f"{OWNER}/results"
SUBMISSION_DATASET = f"{OWNER}/submissions"
LEADERBOARD_PATH = f"{OWNER}/leaderboard"
api = HfApi()
# Dataset config name used when loading and pushing the results dataset.
YEAR_VERSION = "default"
# Local directory that receives the per-submission scored files.
os.makedirs("scored", exist_ok=True)

# Load datasets
# The results are loaded with the same config name that refresh() reads and
# that new submissions are pushed under, so all three agree on the config.
eval_results = load_dataset(
    RESULTS_DATASET,
    YEAR_VERSION,
    token=TOKEN,
    download_mode="force_redownload",
    ignore_verifications=True,
    trust_remote_code=True,
)
gold_results = load_dataset(DATA_DATASET, token=TOKEN, trust_remote_code=True)
# Per-split lookup tables keyed by task id.
gold_answers = {split: {row["id"]: row["answer"] for row in gold_results[split]} for split in ["test"]}
gold_difficulties = {split: {row["id"]: row["difficulty"] for row in gold_results[split]} for split in ["test"]}
# Function to get dataframe from results
def get_dataframe_from_results(eval_results, split):
    """Build the leaderboard DataFrame for one split, best accuracy first.

    Args:
        eval_results: Mapping from split name to a results table. The table
            must be convertible by ``pandas.DataFrame`` and expose a
            ``column_names`` attribute listing its columns.
        split: Name of the split to read, e.g. ``"test"``.

    Returns:
        A new ``pandas.DataFrame`` sorted by ``"Accuracy"`` in descending
        order. Every column whose name contains ``"score"`` is multiplied by
        100 and rounded to two decimals; other columns are left untouched.
    """
    local_df = eval_results[split]
    # Passing the column names explicitly keeps the schema even when the table
    # has no rows, so sorting an empty leaderboard does not raise KeyError.
    df = pd.DataFrame(local_df, columns=list(local_df.column_names))
    df = df.sort_values(by=["Accuracy"], ascending=False)
    numeric_cols = [c for c in local_df.column_names if "score" in c]
    # Only rescale when something matched; avoids assigning through an empty
    # column list.
    if numeric_cols:
        df[numeric_cols] = df[numeric_cols].multiply(100).round(decimals=2)
    return df
# Update function to format dataframe
def format_dataframe(df):
    """Return a display-ready copy of the leaderboard DataFrame.

    The input frame is not modified. In the returned frame:

    * ``"Accuracy"`` is rendered as bold markdown with two decimals;
    * when a ``"URL"`` column exists, ``"Model Name"`` becomes a markdown
      link to that URL and the ``"URL"`` column is dropped;
    * ``"Model Family"`` is renamed to ``"Base Model"``;
    * only the leaderboard columns are kept, in display order.

    Args:
        df: DataFrame containing at least the leaderboard columns.

    Returns:
        A new ``pandas.DataFrame`` with the columns listed in the body.

    Raises:
        KeyError: If a required column is missing.
    """
    # Work on a copy so the caller's frame keeps its numeric Accuracy column.
    df = df.copy()
    df["Accuracy"] = df["Accuracy"].apply(lambda x: f"**{x:.2f}**")
    if "URL" in df.columns:
        # Zipping the two columns also works on an empty frame, where a
        # row-wise DataFrame.apply would not return a Series.
        df["Model Name"] = [
            f"[{name}]({link})" for name, link in zip(df["Model Name"], df["URL"])
        ]
        df = df.drop(columns=["URL"])
    df = df.rename(columns={"Model Family": "Base Model"})
    display_columns = [
        "Model Name",
        "Accuracy",
        "Accuracy (easy)",
        "Accuracy (medium)",
        "Accuracy (hard)",
        "Answer rate",
        "Precision",
        "EM",
        "Base Model",
        "Organization",
    ]
    return df[display_columns]
# Leaderboard table shown at startup: test-split results, sorted then formatted
eval_dataframe_test = format_dataframe(
    get_dataframe_from_results(eval_results=eval_results, split="test")
)
# Function to restart the space
def restart_space():
    """Request a restart of the Space identified by ``LEADERBOARD_PATH``."""
    api.restart_space(token=TOKEN, repo_id=LEADERBOARD_PATH)
# Column datatypes for the leaderboard table: 2 markdown, 6 numeric, 2 string
TYPES = ["markdown"] * 2 + ["number"] * 6 + ["str"] * 2
# Function to add a new evaluation
def _safe_filename_part(text):
    """Return ``text`` with every character other than alphanumerics, ``-``, ``_`` and ``.`` replaced by ``_``."""
    return "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in text)


def _score_submission(file_path, scored_path):
    """Score a JSONL predictions file and write per-task scores to ``scored_path``.

    Each non-blank line must be a JSON object with ``"id"`` and ``"answer"``
    keys, and the id must be a key of ``gold_answers["test"]``.

    Args:
        file_path: Path of the submitted predictions file.
        scored_path: Path of the JSONL file that receives one
            ``{"id", "model_answer", "score"}`` record per scored task.

    Returns:
        A ``(error_message, stats)`` pair. On a validation failure
        ``error_message`` is the user-facing text and ``stats`` is ``None``.
        Otherwise ``error_message`` is ``None`` and ``stats`` holds
        ``"scores"``, ``"num_questions"``, ``"difficulty_scores"`` and
        ``"difficulty_counts"``.
    """
    gold = gold_answers["test"]
    difficulties = gold_difficulties["test"]
    stats = {
        "scores": 0,
        "num_questions": 0,
        "difficulty_scores": {"Easy": 0, "Medium": 0, "Hard": 0},
        "difficulty_counts": {"Easy": 0, "Medium": 0, "Hard": 0},
    }
    seen_ids = set()
    with open(scored_path, "w", encoding="utf-8") as scored_file:
        with open(file_path, "r", encoding="utf-8") as submission:
            for ix, line in enumerate(submission):
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                try:
                    task = json.loads(line)
                except ValueError:
                    task = None
                # Valid JSON that is not an object (list, number, null) is
                # rejected the same way as unparsable text.
                if not isinstance(task, dict):
                    return f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.", None
                if "answer" not in task:
                    return f"Line {ix} contains no answer key. Please fix it and resubmit your file.", None
                if "id" not in task:
                    return f"Line {ix} contains no id key. Please fix it and resubmit your file.", None
                answer = task["answer"]
                task_id = task["id"]
                try:
                    known_id = task_id in gold
                except TypeError:
                    # Unhashable ids (e.g. a JSON list) cannot be gold ids.
                    known_id = False
                if not known_id:
                    return f"{task_id} not found in test set. Are you sure you submitted the correct file?", None
                # Counting the same task twice would skew every average.
                if task_id in seen_ids:
                    return f"{task_id} appears more than once in your file. Please fix it and resubmit your file.", None
                seen_ids.add(task_id)

                score = question_scorer(answer, gold[task_id])
                difficulty = difficulties[task_id]
                scored_file.write(
                    json.dumps({
                        "id": task_id,
                        "model_answer": answer,
                        "score": score
                    }) + "\n"
                )
                stats["scores"] += score
                stats["num_questions"] += 1
                # .get keeps an unexpected difficulty label from raising KeyError.
                stats["difficulty_scores"][difficulty] = stats["difficulty_scores"].get(difficulty, 0) + score
                stats["difficulty_counts"][difficulty] = stats["difficulty_counts"].get(difficulty, 0) + 1
    return None, stats


def add_new_eval(
    model_name: str,
    model_family: str,
    url: str,
    path_to_file: str,
    organization: str,
    mail: str,
):
    """Validate, score and record a new leaderboard submission.

    The raw file is uploaded to ``SUBMISSION_DATASET``, scored against the
    gold test answers, the scored file is uploaded alongside it, and a new row
    is appended to the results dataset and pushed to the Hub.

    Args:
        model_name: Display name of the submitted model.
        model_family: Base model the submission builds on.
        url: Link to information about the model.
        path_to_file: The uploaded predictions file; either a path string or
            an object whose ``name`` attribute is the path. ``None`` when no
            file was attached.
        organization: Name of the submitting organization.
        mail: Contact email address; only checked for validity here.

    Returns:
        The result of ``format_warning``, ``format_error`` or ``format_log``
        describing the outcome.
    """
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email address.")

    print("Adding new eval")
    # Compare (model, organization) pairs: checking the two columns
    # independently rejects a new model whenever its name and its organization
    # each appear somewhere, even on different rows.
    test_results = eval_results["test"]
    existing_pairs = {
        (m.lower(), o.lower())
        for m, o in zip(test_results["Model Name"], test_results["Organization"])
    }
    if (model_name.lower(), organization.lower()) in existing_pairs:
        return format_warning("This model has already been submitted.")

    if path_to_file is None:
        return format_warning("Please attach a file.")

    # Accept both a file-like object exposing .name and a plain path string.
    file_path = getattr(path_to_file, "name", path_to_file)
    # One timestamp so the raw and scored uploads of a submission share a name.
    submitted_at = datetime.datetime.today()

    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=file_path,
        path_in_repo=f"{organization}/{model_name}/{YEAR_VERSION}_test_raw_{submitted_at}.jsonl",
        repo_type="dataset",
        token=TOKEN
    )

    # The names are user input; strip path separators and similar characters
    # before using them in a local file name.
    scored_path = f"scored/{_safe_filename_part(organization)}_{_safe_filename_part(model_name)}.jsonl"
    error_message, stats = _score_submission(file_path, scored_path)
    if error_message is not None:
        return format_error(error_message)

    scores = stats["scores"]
    num_questions = stats["num_questions"]
    difficulty_scores = stats["difficulty_scores"]
    difficulty_counts = stats["difficulty_counts"]

    accuracy = scores / num_questions if num_questions > 0 else 0
    accuracy_easy = difficulty_scores["Easy"] / difficulty_counts["Easy"] if difficulty_counts["Easy"] > 0 else 0
    accuracy_medium = difficulty_scores["Medium"] / difficulty_counts["Medium"] if difficulty_counts["Medium"] > 0 else 0
    accuracy_hard = difficulty_scores["Hard"] / difficulty_counts["Hard"] if difficulty_counts["Hard"] > 0 else 0

    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=scored_path,
        path_in_repo=f"{organization}/{model_name}/{YEAR_VERSION}_test_scored_{submitted_at}.jsonl",
        repo_type="dataset",
        token=TOKEN
    )

    # NOTE(review): "Answer rate" and "Precision" currently repeat the accuracy
    # value and "EM" is the raw score sum; they look like placeholders. Confirm
    # the intended metric definitions before relying on these columns.
    # NOTE(review): this entry uses the key "Base Model" while format_dataframe
    # renames a "Model Family" column; confirm which name the dataset uses.
    eval_entry = {
        "Model Name": model_name,
        "Base Model": model_family,
        "URL": url,
        "Organization": organization,
        "Accuracy": accuracy,
        "Accuracy (easy)": accuracy_easy,
        "Accuracy (medium)": accuracy_medium,
        "Accuracy (hard)": accuracy_hard,
        "Answer rate": accuracy,
        "Precision": accuracy,
        "EM": scores if num_questions > 0 else 0
    }
    eval_results["test"] = eval_results["test"].add_item(eval_entry)
    eval_results.push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, token=TOKEN)
    return format_log(
        f"Model {model_name} submitted by {organization} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")
# Function to refresh the results
def refresh():
    """Re-download the results dataset and return the formatted test-split table.

    The freshly loaded data is held in a local variable only; the value
    returned is the sorted and formatted DataFrame for the ``"test"`` split.
    """
    latest_results = load_dataset(
        RESULTS_DATASET,
        YEAR_VERSION,
        token=TOKEN,
        download_mode="force_redownload",
        ignore_verifications=True,
        trust_remote_code=True,
    )
    sorted_table = get_dataframe_from_results(eval_results=latest_results, split="test")
    return format_dataframe(sorted_table)
# Gradio interface
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a source without indentation; confirm it matches the intended layout.
with gr.Blocks() as demo:
    # Page header and introduction
    gr.HTML("<h1>AssistantBench</h1>")
    gr.Markdown("""
AssistantBench aims to evaluate the ability of web agents to assist with real and time-consuming tasks.
For more information, please check out our paper or the official website.
To download AssistantBench, press [here](https://huggingface.co/datasets/Ori/AssistantBench_V1.0).
""")

    # Leaderboard table with a manual refresh button
    gr.HTML("<h2>AssistantBench Leaderboard</h2>")
    with gr.Tab("Results: Test"):
        leaderboard_table_test = gr.Dataframe(
            value=eval_dataframe_test,
            datatype=TYPES,
            interactive=False,
            column_widths=["20%"],
        )
    refresh_button = gr.Button("Refresh")
    refresh_button.click(fn=refresh, inputs=[], outputs=[leaderboard_table_test])

    # Submission form
    gr.HTML("<h2>Making a New Submission</h2>")
    with gr.Accordion("Submit a new model for evaluation"):
        with gr.Row():
            gr.Markdown("""
To make a new submission, upload a predictions file. Our scoring function can be found [here](https://huggingface.co/spaces/AssistantBench/leaderboard/blob/main/scorer.py). We support JSONL files with the following format:
```
{"id": "task_id_1", "answer": "Answer 1 from your model"}
{"id": "task_id_2", "answer": "Answer 2 from your model"}
```
""")
        with gr.Row():
            with gr.Column():
                model_name_textbox = gr.Textbox(label="Model Name")
                model_family_textbox = gr.Textbox(label="Base Model")
                url_textbox = gr.Textbox(label="URL to Model Information")
            with gr.Column():
                organization = gr.Textbox(label="Organization")
                mail = gr.Textbox(
                    label="Contact Email (will be stored privately & used if there is an issue with your submission)")
                file_output = gr.File()
        submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        # The input order matches the positional parameters of add_new_eval.
        submit_button.click(
            fn=add_new_eval,
            inputs=[
                model_name_textbox,
                model_family_textbox,
                url_textbox,
                file_output,
                organization,
                mail,
            ],
            outputs=submission_result,
        )

    # Citation and acknowledgements
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_text = """@article{yoran-etal-2024-assistantbench,
title={AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?},
author={Ori Yoran and Samuel Amouyal and Chaitanya Malaviya and Ben Bogin and Ofir Press and Jonathan Berant},
year={2024},
eprint={?},
archivePrefix={arXiv},
primaryClass={cs.CL}
}"""
            citation_button = gr.Textbox(
                value=citation_text,
                label="Citation",
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )
    gr.HTML(
        "<p>We would like to thank the GAIA team for sharing the source code for their leaderboard which we used as a template and HuggingFace for hosting the leaderboard.</p>")
# Run restart_space on a 3600-second interval in the background, then start the UI
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, trigger="interval", seconds=3600)
scheduler.start()
demo.launch(debug=True)
|