# Hugging Face Space: stemdataset/stem-leaderboard
# Space status: Runtime error
# File size: 9,443 bytes
# Revision: c9ecc6b

import os
import json
import datetime
from email.utils import parseaddr

import gradio as gr
import pandas as pd
import numpy as np

from datasets import load_dataset, DatasetDict
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

# InfoStrings
from scorer import question_scorer
from content import (
    format_error,
    format_warning,
    format_log,
    TITLE,
    INTRODUCTION_TEXT,
    model_hyperlink,
)

# Hub access token taken from the environment; None when the variable is unset.
TOKEN = os.environ.get("TOKEN")

# Hub repository identifiers, all under the same owner namespace.
OWNER = "stemdataset"
INTERNAL_DATA_DATASET = f"{OWNER}/STEM-Labels-Private"
SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
CONTACT_DATASET = f"{OWNER}/contact_info"
RESULTS_DATASET = f"{OWNER}/results"
LEADERBOARD_PATH = f"{OWNER}/stem-leaderboard"
api = HfApi()

# Local directory that receives the scored submission files.
os.makedirs("scored", exist_ok=True)

# Display the results
# Both datasets are loaded with the same options: always re-download and skip
# the verification checks.
_FRESH_LOAD_OPTIONS = {
    "token": TOKEN,
    "download_mode": "force_redownload",
    "verification_mode": "no_checks",
}
eval_results = load_dataset(RESULTS_DATASET, **_FRESH_LOAD_OPTIONS)
contact_infos = load_dataset(CONTACT_DATASET, **_FRESH_LOAD_OPTIONS)


def get_dataframe_from_results(eval_results: DatasetDict, split):
    """Build the leaderboard table for one split of the results dataset.

    The ``model`` column is replaced by the output of ``model_hyperlink`` for
    the row's ``url`` and ``model``, the ``url`` column is dropped, and the
    remaining columns are renamed to their display names. Rows are sorted by
    ``Average`` in descending order, and the score columns are rendered as
    strings with one decimal place.

    Args:
        eval_results: Results dataset dictionary, indexed by split name.
        split: Name of the split to display.

    Returns:
        A ``pandas.DataFrame`` holding the display columns in leaderboard
        order.
    """
    dataset = eval_results[split].map(
        lambda row: {"model": model_hyperlink(row["url"], row["model"])}
    )
    dataset = dataset.remove_columns(["url"])

    # Raw dataset column -> column title shown in the table.
    display_names = {
        "model": "Model Name",
        "model_family": "Model Family",
        "average": "Average",
        "science": "Science",
        "technology": "Technology",
        "engineering": "Engineering",
        "math": "Math",
        "organisation": "Organisation",
        "submit_date": "Submit Date",
    }
    for raw_name, shown_name in display_names.items():
        dataset = dataset.rename_column(raw_name, shown_name)

    score_columns = ["Science", "Technology", "Engineering", "Math", "Average"]
    column_order = [
        "Model Name",
        "Model Family",
        "Science",
        "Technology",
        "Engineering",
        "Math",
        "Average",
        "Organisation",
        "Submit Date",
    ]

    table = pd.DataFrame(dataset)
    table = table[column_order]
    # Sort while the scores are still numeric; they become strings below.
    table = table.sort_values(by=["Average"], ascending=False)

    table[score_columns] = table[score_columns].round(decimals=1)
    for name in score_columns:
        table[name] = table[name].apply("{:.1f}".format)
    return table


# Leaderboard table shown at start-up, built from the "basic" split.
eval_dataframe_test = get_dataframe_from_results(eval_results, "basic")

# Gold answers
_internal_data = load_dataset(INTERNAL_DATA_DATASET, token=TOKEN)
gold_dataset = _internal_data["labels"]


def restart_space():
    """Request a restart of the Space identified by ``LEADERBOARD_PATH``."""
    api.restart_space(
        repo_id=LEADERBOARD_PATH,
        token=TOKEN,
    )


# Gradio datatypes for the leaderboard table, one entry per displayed column
# and in the same order: Model Name, Model Family, Science, Technology,
# Engineering, Math, Average, Organisation, Submit Date.
# The list previously had 7 entries for these 9 columns, so the types were
# attached to the wrong columns.
TYPES = [
    "markdown",  # Model Name (rendered hyperlink)
    "str",  # Model Family
    "number",  # Science
    "number",  # Technology
    "number",  # Engineering
    "number",  # Math
    "number",  # Average
    "str",  # Organisation
    "str",  # Submit Date
]


def calc_test_acc(preds: list[int], labels=None) -> dict[str, float]:
    """Compute per-subject and average accuracy, as percentages.

    Every gold label is counted. A question with no corresponding prediction
    (``preds`` shorter than ``labels``) is counted as wrong instead of being
    silently skipped, so a truncated submission cannot inflate its score.
    Predictions beyond the number of labels are ignored.

    Args:
        preds: Predicted answer indices, aligned by position with ``labels``.
        labels: Iterable of mappings with a ``"subject"`` key (one of
            ``"science"``, ``"technology"``, ``"engineer"``, ``"math"``) and
            an ``"answer_idx"`` key. Defaults to the module-level
            ``gold_dataset``.

    Returns:
        A dict with one accuracy per subject plus ``"average"`` (the
        unweighted mean of the four subject accuracies), each expressed as a
        percentage rounded to one decimal. A subject with no labels scores
        ``0.0``.

    Raises:
        KeyError: If a label's subject is not one of the four known subjects.
    """
    if labels is None:
        labels = gold_dataset

    # subject -> [number correct, number of questions]
    tallies = {
        "science": [0, 0],
        "technology": [0, 0],
        "engineer": [0, 0],
        "math": [0, 0],
    }
    n_preds = len(preds)
    for idx, label in enumerate(labels):
        tally = tallies[label["subject"]]
        tally[1] += 1
        if idx < n_preds and preds[idx] == label["answer_idx"]:
            tally[0] += 1

    # Guard the division so a subject without any question does not raise.
    accs = {
        subject: (correct / total if total else 0.0)
        for subject, (correct, total) in tallies.items()
    }
    accs["average"] = float(np.mean(list(accs.values())))
    return {name: round(value * 100, 1) for name, value in accs.items()}


def _filesystem_safe(text: str) -> str:
    """Return ``text`` with every character outside ``[alnum-_.]`` replaced by ``_``."""
    return "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in text)


def add_new_eval(
    val_or_test: str,
    model: str,
    model_family: str,
    url: str,
    path_to_file: gr.File,
    organisation: str,
    mail: str,
):
    """Validate, score and record one leaderboard submission.

    Steps, in order: validate the form fields, reject a (model, organisation)
    pair that is already in the ``basic`` results split, upload the raw
    prediction file to the submission dataset, parse one integer prediction
    per line, score the predictions with ``calc_test_acc``, upload the scored
    JSON, then append the entry to the results and contact datasets and push
    both to the Hub.

    Args:
        val_or_test: Split label, used only in the uploaded file names.
        model: Model name.
        model_family: Model family.
        url: Link to information about the model.
        path_to_file: Uploaded prediction file from the Gradio ``File``
            component (its ``.name`` is the local path), or ``None``.
        organisation: Submitting organisation.
        mail: Contact e-mail address.

    Returns:
        The result of ``format_warning`` / ``format_error`` when the
        submission is rejected, or of ``format_log`` on success.
    """
    curr_timestamp = datetime.datetime.today()
    # Very basic email parsing
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email adress.")
    if model == "":
        return format_warning("Please provide a model name.")
    if model_family == "":
        return format_warning("Please provide a model family.")

    # Log the path only: the upload object itself may not be JSON-serializable
    # (e.g. a temporary-file wrapper), which would make json.dumps raise.
    submitted_path = None if path_to_file is None else path_to_file.name
    print(
        json.dumps(
            {
                "val_or_test": val_or_test,
                "model": model,
                "model_family": model_family,
                "url": url,
                "path_to_file": submitted_path,
                "organisation": organisation,
                "mail": mail,
            },
            indent=2,
        )
    )

    print("Adding new eval")

    # Reject only an exact (model, organisation) pair. Testing the two columns
    # independently would also reject a new model whenever its name and the
    # organisation each appear somewhere in unrelated rows.
    existing_pairs = {
        (known_model.lower(), known_org.lower())
        for known_model, known_org in zip(
            eval_results["basic"]["model"],
            eval_results["basic"]["organisation"],
        )
    }
    if (model.lower(), organisation.lower()) in existing_pairs:
        return format_warning("This model has been already submitted.")

    if path_to_file is None:
        return format_warning("Please attach a file.")

    # Save submitted file
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=submitted_path,
        path_in_repo=f"{organisation}/{model}/{val_or_test}_raw_{curr_timestamp}.txt",
        repo_type="dataset",
        token=TOKEN,
    )

    # Parse the predictions before creating any local output, so that a
    # malformed submission does not leave an empty scored file behind.
    preds = []
    with open(submitted_path, "r") as f:
        for ix, line in enumerate(f):
            try:
                preds.append(int(line.strip()))
            except ValueError:
                return format_error(
                    f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file."
                )

    # A file with the wrong number of lines cannot be aligned with the gold
    # labels, so refuse it instead of scoring a partial submission.
    expected = len(gold_dataset)
    if len(preds) != expected:
        return format_error(
            f"Expected {expected} predictions (one per line) but found {len(preds)}. Please fix it and resubmit your file."
        )

    # Compute score
    stem_scores = calc_test_acc(preds)

    # Model and organisation names may contain "/" (or other characters that
    # are not valid in a file name), so sanitize them for the local path.
    scored_path = os.path.join(
        "scored",
        f"{_filesystem_safe(organisation)}_{_filesystem_safe(model)}.json",
    )
    with open(scored_path, "w") as scored_file:
        scored_file.write(json.dumps(stem_scores, indent=2))

    # Save scored file
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=scored_path,
        path_in_repo=f"{organisation}/{model}/{val_or_test}_scored_{curr_timestamp}.json",
        repo_type="dataset",
        token=TOKEN,
    )

    # Date and time are stored on separate lines.
    submit_date = "\n".join(str(curr_timestamp).split(" "))

    # Actual submission
    eval_entry = {
        "model": model,
        "model_family": model_family,
        "url": url,
        "organisation": organisation,
        "submit_date": submit_date,
        "science": stem_scores["science"],
        "technology": stem_scores["technology"],
        "engineering": stem_scores["engineer"],
        "math": stem_scores["math"],
        "average": stem_scores["average"],
    }
    eval_results["basic"] = eval_results["basic"].add_item(eval_entry)
    print(eval_results)
    eval_results.push_to_hub(RESULTS_DATASET, token=TOKEN)

    contact_info = {
        "model": model,
        "model_family": model_family,
        "url": url,
        "organisation": organisation,
        "mail": mail,
        "submit_date": submit_date,
    }
    contact_infos["basic"] = contact_infos["basic"].add_item(contact_info)
    contact_infos.push_to_hub(CONTACT_DATASET, token=TOKEN)

    return format_log(
        f"Model {model} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed"
    )


def refresh():
    """Re-download the results dataset and return a rebuilt leaderboard table.

    The freshly loaded data stays local to this call; the module-level
    ``eval_results`` is not modified.

    Returns:
        The table produced by ``get_dataframe_from_results`` for the
        ``basic`` split.
    """
    latest_results = load_dataset(
        RESULTS_DATASET,
        token=TOKEN,
        download_mode="force_redownload",
        verification_mode="no_checks",
    )
    return get_dataframe_from_results(eval_results=latest_results, split="basic")


def upload_file(files):
    """Return the ``name`` attribute of every object in ``files``, in order."""
    collected = []
    for entry in files:
        collected.append(entry.name)
    return collected


# Page layout. Components are created in display order inside their
# containers, so the statement order below defines the layout.
with gr.Blocks() as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    # Read-only leaderboard table.
    with gr.Tab("Results: Test"):
        leaderboard_table_test = gr.components.Dataframe(
            value=eval_dataframe_test,
            datatype=TYPES,
            interactive=False,
            wrap=True,
        )

    # Reloads the results and replaces the table contents.
    refresh_button = gr.Button("Refresh")
    refresh_button.click(fn=refresh, inputs=[], outputs=[leaderboard_table_test])

    # Submission form.
    with gr.Accordion("Submit a new model for evaluation"):
        with gr.Row():
            with gr.Column():
                level_of_test = gr.Radio(["test"], value="test", label="Split")
                model_name_textbox = gr.Textbox(label="Model name")
                model_family_textbox = gr.Textbox(label="Model family")
                url_textbox = gr.Textbox(label="Url to model information")
            with gr.Column():
                organisation = gr.Textbox(label="Organisation")
                mail = gr.Textbox(
                    label="Contact email (will be stored privately, & used if there is an issue with your submission)"
                )
                file_output = gr.File()

        submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        # Input order matches the positional parameters of add_new_eval.
        submission_inputs = [
            level_of_test,
            model_name_textbox,
            model_family_textbox,
            url_textbox,
            file_output,
            organisation,
            mail,
        ]
        submit_button.click(
            fn=add_new_eval,
            inputs=submission_inputs,
            outputs=submission_result,
        )

# Call restart_space every 3600 seconds from a background scheduler.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, trigger="interval", seconds=3600)
scheduler.start()
demo.launch(debug=True, server_name="0.0.0.0")