# Hugging Face Space: AssistantBench/leaderboard (status: Running)
# File size: 10,595 Bytes

import os
import json
import datetime
from email.utils import parseaddr

import gradio as gr
import pandas as pd
from datasets import load_dataset
from evaluation.evaluator import question_scorer as eval_scorer
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi
from content import format_error, format_warning, format_log, TITLE

# Thin adapter around the scoring function imported from evaluation.evaluator.
def question_scorer(prediction, gold_answer):
    """Score a single prediction against its gold answer.

    Both arguments are forwarded unchanged to ``eval_scorer`` and its result
    is returned as-is.
    """
    score = eval_scorer(prediction, gold_answer)
    return score


# Constants and Configuration
# TOKEN is read from the environment and is None when the variable is unset.
TOKEN = os.environ.get("TOKEN")
OWNER = "Ori"
# Every repository id is derived from OWNER so the owner is spelled in one place.
DATA_DATASET = f"{OWNER}/AssistantBench_V1.0"
RESULTS_DATASET = f"{OWNER}/results"
SUBMISSION_DATASET = f"{OWNER}/submissions"
# NOTE(review): the submission instructions in this file link to the
# `AssistantBench/leaderboard` Space, while this id resolves to
# `Ori/leaderboard` -- confirm this is the Space restart_space() should target.
LEADERBOARD_PATH = f"{OWNER}/leaderboard"
api = HfApi()

# Dataset configuration name used when refreshing and pushing results.
YEAR_VERSION = "default"

# Local directory that receives the per-submission scored files.
os.makedirs("scored", exist_ok=True)

# Load datasets
# NOTE(review): `ignore_verifications` is reported as deprecated in newer
# `datasets` releases in favour of `verification_mode` -- confirm against the
# version pinned for this Space before upgrading.
# NOTE(review): unlike refresh(), this initial load does not pass YEAR_VERSION
# as the configuration name -- confirm the two are meant to agree.
eval_results = load_dataset(
    RESULTS_DATASET,
    token=TOKEN,
    download_mode="force_redownload",
    ignore_verifications=True,
    trust_remote_code=True,
)
gold_results = load_dataset(DATA_DATASET, token=TOKEN, trust_remote_code=True)

# Per-split lookup tables keyed by task id: gold answer and difficulty label.
gold_answers = {split: {row["id"]: row["answer"] for row in gold_results[split]} for split in ["test"]}
gold_difficulties = {split: {row["id"]: row["difficulty"] for row in gold_results[split]} for split in ["test"]}


# Build the raw (unformatted) leaderboard frame for one split.
def get_dataframe_from_results(eval_results, split):
    """Return the rows of ``eval_results[split]`` as a sorted DataFrame.

    Rows are ordered by the ``Accuracy`` column, highest first. Every column
    whose name contains ``"score"`` is multiplied by 100 and rounded to two
    decimals; all other columns are left untouched.
    """
    split_data = eval_results[split]
    score_columns = [name for name in split_data.column_names if "score" in name]

    table = pd.DataFrame(split_data).sort_values(by=["Accuracy"], ascending=False)
    scaled = table[score_columns].multiply(100)
    table[score_columns] = scaled.round(decimals=2)
    return table

# Update function to format dataframe
def format_dataframe(df):
    """Return a display-ready copy of a leaderboard results frame.

    The input frame is not modified. In the returned frame:

    * ``Accuracy`` is rendered as a bold markdown string with two decimals.
    * When a ``URL`` column exists, ``Model Name`` becomes a markdown link to
      that URL and the ``URL`` column is dropped. Rows whose URL is missing or
      blank keep the plain model name instead of getting an empty link.
    * A ``Model Family`` column, if present, is renamed to ``Base Model``.
    * Only the leaderboard columns are kept, in display order.

    Raises:
        KeyError: if any of the leaderboard columns is absent.
    """
    # Work on a copy so the caller's frame keeps its numeric Accuracy and URL.
    df = df.copy()
    df["Accuracy"] = df["Accuracy"].apply(lambda x: f"**{x:.2f}**")
    if "URL" in df.columns:
        # A plain list is built rather than using a row-wise DataFrame.apply:
        # on an empty frame that apply returns a DataFrame, which cannot be
        # assigned to a single column.
        df["Model Name"] = [
            f"[{name}]({link})" if isinstance(link, str) and link.strip() else name
            for name, link in zip(df["Model Name"], df["URL"])
        ]
        df = df.drop(columns=["URL"])
    df = df.rename(columns={"Model Family": "Base Model"})
    display_columns = [
        "Model Name",
        "Accuracy",
        "Accuracy (easy)",
        "Accuracy (medium)",
        "Accuracy (hard)",
        "Answer rate",
        "Precision",
        "EM",
        "Base Model",
        "Organization",
    ]
    return df[display_columns]

# Table displayed when the app starts: test-split results, sorted then formatted.
eval_dataframe_test = format_dataframe(
    get_dataframe_from_results(eval_results=eval_results, split="test")
)


# Restart helper, scheduled as a recurring job at the bottom of this file.
def restart_space():
    """Ask the Hub API to restart the Space named by ``LEADERBOARD_PATH``."""
    api.restart_space(
        token=TOKEN,
        repo_id=LEADERBOARD_PATH,
    )


# Datatype passed to the leaderboard gr.Dataframe, one entry per column:
# two markdown columns, six numeric columns, then two plain-string columns.
TYPES = ["markdown"] * 2 + ["number"] * 6 + ["str"] * 2


# Function to add a new evaluation
def _safe_filename_part(text):
    """Return ``text`` with every character other than alphanumerics, ``.``, ``_`` and ``-`` replaced by ``_``.

    Used for the local ``scored/`` file name so user-supplied names that
    contain path separators cannot point outside that directory.
    """
    return "".join(ch if ch.isalnum() or ch in "._-" else "_" for ch in str(text))


def add_new_eval(
        model_name: str,
        model_family: str,
        url: str,
        path_to_file: str,
        organization: str,
        mail: str,
):
    """Validate, score and record a new leaderboard submission.

    Args:
        model_name: Name shown on the leaderboard.
        model_family: Stored under the ``Base Model`` key of the new entry.
        url: Link stored with the entry.
        path_to_file: The uploaded predictions file, or ``None`` when nothing
            was attached. Either an object exposing its path as ``.name`` or a
            plain path string is accepted.
        organization: Submitting organization.
        mail: Contact address. It is only checked for an ``@`` here; this
            function does not store it anywhere.

    Returns:
        The result of ``format_warning`` / ``format_error`` describing why the
        submission was rejected, or of ``format_log`` on success.

    The predictions file is JSONL with one object per line carrying ``id`` and
    ``answer`` keys. Blank lines are skipped. The raw file is uploaded to the
    submissions dataset before it is validated; the scored file and the
    leaderboard entry are only pushed once every line has been scored.
    """
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email address.")

    print("Adding new eval")

    # A submission is a duplicate only when the same (model, organization)
    # pair already exists. Checking the two columns independently would also
    # reject a new model whose name and organization each occur on different rows.
    leaderboard = eval_results["test"]
    submitted_pairs = {
        ((existing_model or "").lower(), (existing_org or "").lower())
        for existing_model, existing_org in zip(leaderboard["Model Name"], leaderboard["Organization"])
    }
    if (model_name.lower(), organization.lower()) in submitted_pairs:
        return format_warning("This model has already been submitted.")

    if path_to_file is None:
        return format_warning("Please attach a file.")

    file_path = getattr(path_to_file, "name", path_to_file)
    # One timestamp for both uploads so the raw and scored files pair up.
    submitted_at = datetime.datetime.today()
    scored_path = f"scored/{_safe_filename_part(organization)}_{_safe_filename_part(model_name)}.jsonl"

    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=file_path,
        path_in_repo=f"{organization}/{model_name}/{YEAR_VERSION}_test_raw_{submitted_at}.jsonl",
        repo_type="dataset",
        token=TOKEN
    )

    scores = 0
    num_questions = 0
    difficulty_scores = {"Easy": 0, "Medium": 0, "Hard": 0}
    difficulty_counts = {"Easy": 0, "Medium": 0, "Hard": 0}
    seen_ids = set()

    with open(scored_path, "w", encoding="utf-8") as scored_file, \
            open(file_path, "r", encoding="utf-8") as f:
        for ix, line in enumerate(f):
            if not line.strip():
                # Tolerate blank lines such as a trailing newline at end of file.
                continue

            try:
                task = json.loads(line)
            except (ValueError, RecursionError):
                task = None
            # Valid JSON that is not an object (e.g. a list or `null`) is
            # just as unusable as a line that does not parse.
            if not isinstance(task, dict):
                return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")

            if "answer" not in task:
                return format_error(
                    f"Line {ix} contains no answer key. Please fix it and resubmit your file.")
            if "id" not in task:
                return format_error(
                    f"Line {ix} contains no id key. Please fix it and resubmit your file.")

            answer = task["answer"]
            task_id = task["id"]
            # JSON lists/objects are unhashable and can never be a gold id;
            # reject them before the dict lookup would raise TypeError.
            if isinstance(task_id, (list, dict)) or task_id not in gold_answers["test"]:
                return format_error(
                    f"{task_id} not found in test set. Are you sure you submitted the correct file?")
            if task_id in seen_ids:
                # Scoring the same task twice would skew every average below.
                return format_error(
                    f"{task_id} appears more than once. Please fix it and resubmit your file.")
            seen_ids.add(task_id)

            score = question_scorer(answer, gold_answers["test"][task_id])
            difficulty = gold_difficulties["test"][task_id]

            scored_file.write(
                json.dumps({
                    "id": task_id,
                    "model_answer": answer,
                    "score": score
                }) + "\n"
            )

            scores += score
            num_questions += 1
            difficulty_scores[difficulty] += score
            difficulty_counts[difficulty] += 1

    if num_questions == 0:
        # Without this an empty file would create a zero-score leaderboard row.
        return format_error("The submitted file contains no predictions. Please fix it and resubmit your file.")

    accuracy_by_difficulty = {
        level: difficulty_scores[level] / difficulty_counts[level] if difficulty_counts[level] > 0 else 0
        for level in difficulty_scores
    }
    # NOTE(review): averages are taken over the submitted lines only; a file
    # covering part of the test set is not penalised for the missing tasks.
    accuracy = scores / num_questions

    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=scored_path,
        path_in_repo=f"{organization}/{model_name}/{YEAR_VERSION}_test_scored_{submitted_at}.jsonl",
        repo_type="dataset",
        token=TOKEN
    )

    # NOTE(review): "Answer rate" and "Precision" currently repeat the overall
    # accuracy and "EM" is the unnormalised score sum. Their values are kept
    # as they were; the intended definitions need to be supplied by the
    # benchmark owners.
    eval_entry = {
        "Model Name": model_name,
        "Base Model": model_family,
        "URL": url,
        "Organization": organization,
        "Accuracy": accuracy,
        "Accuracy (easy)": accuracy_by_difficulty["Easy"],
        "Accuracy (medium)": accuracy_by_difficulty["Medium"],
        "Accuracy (hard)": accuracy_by_difficulty["Hard"],
        "Answer rate": accuracy,
        "Precision": accuracy,
        "EM": scores
    }
    eval_results["test"] = eval_results["test"].add_item(eval_entry)
    eval_results.push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, token=TOKEN)

    return format_log(
        f"Model {model_name} submitted by {organization} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")


# Callback behind the "Refresh" button.
def refresh():
    """Re-download the results dataset and return the formatted test-split table.

    The download is forced (``download_mode="force_redownload"``). The freshly
    loaded data lives only in local variables; the module-level
    ``eval_results`` is not replaced.
    """
    fresh_results = load_dataset(
        RESULTS_DATASET,
        YEAR_VERSION,
        token=TOKEN,
        download_mode="force_redownload",
        ignore_verifications=True,
        trust_remote_code=True,
    )
    sorted_table = get_dataframe_from_results(eval_results=fresh_results, split="test")
    return format_dataframe(sorted_table)


# Gradio interface
# The page is built top to bottom inside the `with demo:` block: title and
# intro text, the leaderboard table with a refresh button, the submission
# form, the citation box, and an acknowledgement line.
demo = gr.Blocks()
with demo:
    gr.HTML("<h1>AssistantBench</h1>")
    gr.Markdown("""
        AssistantBench aims to evaluate the ability of web agents to assist with real and time-consuming tasks.
        For more information, please check out our paper or the official website.
        To download AssistantBench, press [here](https://huggingface.co/datasets/Ori/AssistantBench_V1.0).
    """)

    gr.HTML("<h2>AssistantBench Leaderboard</h2>")
    with gr.Tab("Results: Test"):
        # Read-only table seeded with the frame computed at import time.
        # NOTE(review): only one column width is supplied for a ten-column
        # table -- confirm how Gradio sizes the remaining columns.
        leaderboard_table_test = gr.Dataframe(
            value=eval_dataframe_test, datatype=TYPES, interactive=False,
            column_widths=["20%"]
        )

    # Clicking "Refresh" calls refresh() with no inputs and writes its return
    # value into the leaderboard table.
    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            leaderboard_table_test,
        ],
    )

    gr.HTML("<h2>Making a New Submission</h2>")
    with gr.Accordion("Submit a new model for evaluation"):
        # NOTE(review): the instructions below link to `scorer.py`, while this
        # file imports its scorer from `evaluation.evaluator` -- confirm the
        # link target is still correct.
        with gr.Row():
            gr.Markdown("""
                To make a new submission, upload a predictions file. Our scoring function can be found [here](https://huggingface.co/spaces/AssistantBench/leaderboard/blob/main/scorer.py). We support JSONL files with the following format:
                ```
                {"id": "task_id_1", "answer": "Answer 1 from your model"}
                {"id": "task_id_2", "answer": "Answer 2 from your model"}
                ```
            """)
        # Two-column form: model details on the left; organization, contact
        # email and the predictions file on the right.
        with gr.Row():
            with gr.Column():
                model_name_textbox = gr.Textbox(label="Model Name")
                model_family_textbox = gr.Textbox(label="Base Model")
                url_textbox = gr.Textbox(label="URL to Model Information")
            with gr.Column():
                organization = gr.Textbox(label="Organization")
                mail = gr.Textbox(
                    label="Contact Email (will be stored privately & used if there is an issue with your submission)")
                file_output = gr.File()

        # The inputs are passed positionally, so their order must match the
        # parameters of add_new_eval: model_name, model_family, url,
        # path_to_file, organization, mail. Its return value is rendered in
        # the markdown component below the button.
        submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        submit_button.click(
            add_new_eval,
            [
                model_name_textbox,
                model_family_textbox,
                url_textbox,
                file_output,
                organization,
                mail
            ],
            submission_result,
        )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            # NOTE(review): the `eprint` field of the BibTeX entry below is
            # still the placeholder `?`.
            citation_text = """@article{yoran-etal-2024-assistantbench,
    title={AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?}, 
    author={Ori Yoran and Samuel Amouyal and Chaitanya Malaviya and Ben Bogin and Ofir Press and Jonathan Berant},
    year={2024},
    eprint={?},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}"""
            citation_button = gr.Textbox(
                value=citation_text,
                label="Citation",
                lines=20,
                elem_id="citation-button",
                show_copy_button=True
            )

    gr.HTML(
        "<p>We would like to thank the GAIA team for sharing the source code for their leaderboard which we used as a template and HuggingFace for hosting the leaderboard.</p>")

# Schedule restart_space() to run every 3600 seconds, then start serving the UI.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, trigger="interval", seconds=60 * 60)
scheduler.start()
demo.launch(debug=True)