model-evaluator

Runtime error

File size: 28,850 Bytes

3ce0948

import os
import time
from pathlib import Path

import pandas as pd
import streamlit as st
import yaml
from datasets import get_dataset_config_names
from dotenv import load_dotenv
from huggingface_hub import list_datasets

from evaluation import filter_evaluated_models
from utils import (
    AUTOTRAIN_TASK_TO_HUB_TASK,
    commit_evaluation_log,
    create_autotrain_project_name,
    format_col_mapping,
    get_compatible_models,
    get_config_metadata,
    get_dataset_card_url,
    get_key,
    get_metadata,
    http_get,
    http_post,
)

# Pick up settings from a local `.env` file when one exists in the working
# directory; otherwise only the variables already in the environment are used.
if Path(".env").is_file():
    load_dotenv(".env")

# Runtime settings read from the environment. Each is None when its variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")
AUTOTRAIN_USERNAME = os.environ.get("AUTOTRAIN_USERNAME")
AUTOTRAIN_BACKEND_API = os.environ.get("AUTOTRAIN_BACKEND_API")
DATASETS_PREVIEW_API = os.environ.get("DATASETS_PREVIEW_API")

# Maps each supported task name to the integer sent as the `task` field of the
# project-creation payload.
# Key order matters: SUPPORTED_TASKS is built from these keys and feeds the task
# selectbox, so the image tasks are deliberately listed first.
TASK_TO_ID = {
    "image_binary_classification": 17,
    "image_multi_class_classification": 18,
    "binary_classification": 1,
    "multi_class_classification": 2,
    "natural_language_inference": 22,
    "entity_extraction": 4,
    "extractive_question_answering": 5,
    "translation": 6,
    "summarization": 8,
    "text_zero_shot_classification": 23,
}

# Metrics displayed as "will be computed" for each task. They are also removed
# from the list of optional extra metrics the user can pick in the app.
TASK_TO_DEFAULT_METRICS = {
    "binary_classification": ["f1", "precision", "recall", "auc", "accuracy"],
    "multi_class_classification": [
        "f1",
        "precision",
        "recall",
        "accuracy",
    ],
    "natural_language_inference": ["f1", "precision", "recall", "auc", "accuracy"],
    "entity_extraction": ["precision", "recall", "f1", "accuracy"],
    "extractive_question_answering": ["f1", "exact_match"],
    "translation": ["sacrebleu"],
    "summarization": ["rouge1", "rouge2", "rougeL", "rougeLsum"],
    "image_binary_classification": ["f1", "precision", "recall", "auc", "accuracy"],
    "image_multi_class_classification": [
        "f1",
        "precision",
        "recall",
        "accuracy",
    ],
    "text_zero_shot_classification": ["accuracy", "loss"],
}

# Value sent as the `language` field of the project config for the listed tasks.
# Tasks that are not listed here are submitted with "en".
AUTOTRAIN_TASK_TO_LANG = {
    "translation": "en2de",
    "image_binary_classification": "unk",
    "image_multi_class_classification": "unk",
}

# Tasks listed here are submitted with provider "sagemaker" and the given instance
# type; every other task is submitted with provider "ovh" and instance type "p3".
AUTOTRAIN_MACHINE = {"text_zero_shot_classification": "r5.16x"}


# Task names offered in the task selectbox, in TASK_TO_ID key order.
SUPPORTED_TASKS = list(TASK_TO_ID.keys())

# Extracted from utils.get_supported_metrics
# Hardcoded for now due to speed / caching constraints
# These are the candidates for the optional "additional metrics" multiselect;
# the selected task's default metrics are excluded from the choices there.
SUPPORTED_METRICS = [
    "accuracy",
    "bertscore",
    "bleu",
    "cer",
    "chrf",
    "code_eval",
    "comet",
    "competition_math",
    "coval",
    "cuad",
    "exact_match",
    "f1",
    "frugalscore",
    "google_bleu",
    "mae",
    "mahalanobis",
    "matthews_correlation",
    "mean_iou",
    "meteor",
    "mse",
    "pearsonr",
    "perplexity",
    "precision",
    "recall",
    "roc_auc",
    "rouge",
    "sacrebleu",
    "sari",
    "seqeval",
    "spearmanr",
    "squad",
    "squad_v2",
    "ter",
    "trec_eval",
    "wer",
    "wiki_split",
    "xnli",
    "angelina-wang/directional_bias_amplification",
    "jordyvl/ece",
    "lvwerra/ai4code",
    "lvwerra/amex",
]


#######
# APP #
#######
# Page title and introduction shown at the top of the app.
st.title("Evaluation on the Hub")
st.markdown(
    """
    Welcome to Hugging Face's automatic model evaluator 👋!

    This application allows you to evaluate 🤗 Transformers
    [models](https://huggingface.co/models?library=transformers&sort=downloads)
    across a wide variety of [datasets](https://huggingface.co/datasets) on the
    Hub. Please select the dataset and configuration below. The results of your
    evaluation will be displayed on the [public
    leaderboards](https://huggingface.co/spaces/autoevaluate/leaderboards). For
    more details, check out our [blog
    post](https://huggingface.co/blog/eval-on-the-hub).
    """
)

# Every dataset id returned by the Hub listing is a candidate for evaluation.
all_datasets = [dataset.id for dataset in list_datasets()]

# Keep the query parameters seen when the session state was first populated and
# reuse them afterwards (presumably so the dataset named in the initial URL keeps
# acting as the default on later reruns -- confirm).
query_params = st.experimental_get_query_params()
if "first_query_params" not in st.session_state:
    st.session_state.first_query_params = query_params
first_query_params = st.session_state.first_query_params

# Preselect the dataset named in the URL when it is a known dataset id,
# otherwise fall back to the first dataset in the listing.
requested_datasets = first_query_params.get("dataset", [])
if len(requested_datasets) > 0 and requested_datasets[0] in all_datasets:
    default_dataset = requested_datasets[0]
else:
    default_dataset = all_datasets[0]

default_dataset_index = all_datasets.index(default_dataset)
selected_dataset = st.selectbox(
    "Select a dataset",
    all_datasets,
    index=default_dataset_index,
    help="""Datasets with metadata can be evaluated with 1-click. Configure an evaluation job to add \
        new metadata to a dataset card.""",
)
# Mirror the current choice back into the URL query string.
st.experimental_set_query_params(dataset=[selected_dataset])

# Check if selected dataset can be streamed
dataset_query = {"dataset": selected_dataset}
validity_resp = http_get(path="/is-valid", domain=DATASETS_PREVIEW_API, params=dataset_query)
is_valid_dataset = validity_resp.json()
# Only an explicit `False` is treated as unsupported.
if is_valid_dataset["valid"] is False:
    st.error(
        """The dataset you selected is not currently supported. Open a \
            [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) for support."""
    )

# Evaluation metadata of the selected dataset; None means the job has to be
# configured by hand in the section below.
metadata = get_metadata(selected_dataset, token=HF_TOKEN)
print(f"INFO -- Dataset metadata: {metadata}")
if metadata is None:
    st.warning("No evaluation metadata found. Please configure the evaluation job below.")

with st.expander("Advanced configuration"):
    # Select task
    # When the dataset ships metadata, the task of its first entry is preselected.
    if metadata is None:
        default_task_index = 0
    else:
        default_task_index = SUPPORTED_TASKS.index(metadata[0]["task_id"])
    selected_task = st.selectbox(
        "Select a task",
        SUPPORTED_TASKS,
        index=default_task_index,
        help="""Don't see your favourite task here? Open a \
            [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) to request it!""",
    )

    # Select config
    configs = get_dataset_config_names(selected_dataset)
    selected_config = st.selectbox(
        "Select a config",
        configs,
        help="""Some datasets contain several sub-datasets, known as _configurations_. \
            Select one to evaluate your models on. \
            See the [docs](https://huggingface.co/docs/datasets/master/en/load_hub#configurations) for more details.
            """,
    )

    # Metadata may hold one entry per config, so keep only the entry that belongs to the chosen config
    config_metadata = get_config_metadata(selected_config, metadata)
    print(f"INFO -- Config metadata: {config_metadata}")

    # Select splits
    splits_resp = http_get(
        path="/splits",
        domain=DATASETS_PREVIEW_API,
        params={"dataset": selected_dataset},
    )
    if splits_resp.status_code != 200:
        # `selected_split` is required by everything below (first-rows request,
        # job payload), so halt this run with a clear message instead of failing
        # later on an undefined name.
        st.error(
            "Unable to fetch the splits of the selected dataset. Please try again later or select another dataset."
        )
        st.stop()

    # Keep only the splits that belong to the selected config.
    all_splits = splits_resp.json()
    split_names = [split["split"] for split in all_splits["splits"] if split["config"] == selected_config]

    # Split recorded in the dataset metadata, if any.
    eval_split = config_metadata["splits"].get("eval_split") if config_metadata is not None else None
    # Preselect the metadata split only when it is actually available for this
    # config; otherwise fall back to the first split rather than raising.
    default_split_index = split_names.index(eval_split) if eval_split in split_names else 0
    selected_split = st.selectbox(
        "Select a split",
        split_names,
        index=default_split_index,
        help="Be wary when evaluating models on the `train` split.",
    )

    # Select columns
    # The column names offered for mapping are taken from the first example of the chosen split.
    first_rows_query = {
        "dataset": selected_dataset,
        "config": selected_config,
        "split": selected_split,
    }
    rows_resp = http_get(path="/first-rows", domain=DATASETS_PREVIEW_API, params=first_rows_query).json()
    first_row = rows_resp["rows"][0]["row"]
    # json_normalize flattens nested fields into dotted column names
    col_names = list(pd.json_normalize(first_row).columns)

    st.markdown("**Map your dataset columns**")
    st.markdown(
        """The model evaluator uses a standardised set of column names for the input examples and labels. \
        Please define the mapping between your dataset columns (right) and the standardised column names (left)."""
    )
    col1, col2 = st.columns(2)

    # TODO: find a better way to layout these items
    # For every task: the standardised column names in display order, each paired
    # with the label of the selectbox used to pick the matching dataset column.
    text_classification_columns = [
        ("text", "This column should contain the text to be classified"),
        ("target", "This column should contain the labels associated with the text"),
    ]
    image_classification_columns = [
        ("image", "This column should contain the images to be classified"),
        ("target", "This column should contain the labels associated with the images"),
    ]
    task_to_columns = {
        "binary_classification": text_classification_columns,
        "multi_class_classification": text_classification_columns,
        "text_zero_shot_classification": [
            ("text", "This column should contain the text to be classified"),
            ("classes", "This column should contain the classes associated with the text"),
            ("target", "This column should contain the index of the correct class"),
        ],
        "natural_language_inference": [
            ("text1", "This column should contain the first text passage to be classified"),
            ("text2", "This column should contain the second text passage to be classified"),
            ("target", "This column should contain the labels associated with the text"),
        ],
        "entity_extraction": [
            ("tokens", "This column should contain the array of tokens to be classified"),
            ("tags", "This column should contain the labels associated with each part of the text"),
        ],
        "translation": [
            ("source", "This column should contain the text to be translated"),
            ("target", "This column should contain the target translation"),
        ],
        "summarization": [
            ("text", "This column should contain the text to be summarized"),
            ("target", "This column should contain the target summary"),
        ],
        "extractive_question_answering": [
            ("context", "This column should contain the question's context"),
            ("question", "This column should contain the question to be answered, given the context"),
            (
                "answers.text",
                "This column should contain example answers to the question, extracted from the context",
            ),
            (
                "answers.answer_start",
                "This column should contain the indices in the context of the first character of each `answers.text`",
            ),
        ],
        "image_binary_classification": image_classification_columns,
        "image_multi_class_classification": image_classification_columns,
    }
    task_columns = task_to_columns.get(selected_task, [])

    # Resulting mapping: dataset column name -> standardised column name.
    col_mapping = {}
    # Mapping stored in the dataset metadata, used only to preselect columns.
    metadata_col_mapping = None
    if config_metadata is not None and task_columns:
        metadata_col_mapping = config_metadata["col_mapping"]
        if selected_task == "extractive_question_answering":
            # Hub YAML parser converts periods to hyphens, so we remap them here
            metadata_col_mapping = format_col_mapping(metadata_col_mapping)
            # For this task the submitted mapping starts from the metadata mapping
            # and the user's selections are written on top of it.
            # NOTE(review): entries for columns the user moved away from stay in the mapping -- confirm intended.
            col_mapping = metadata_col_mapping

    # natural_language_inference separates its labels with five blank lines, all other tasks with four.
    spacer_count = 5 if selected_task == "natural_language_inference" else 4

    if task_columns:
        # Left column: the standardised names, padded so they line up with the selectboxes.
        with col1:
            for position, (standard_name, _label) in enumerate(task_columns):
                if position > 0:
                    for _ in range(spacer_count):
                        st.text("")
                st.markdown(f"`{standard_name}` column")
        # Right column: one selectbox per standardised name.
        with col2:
            selections = []
            for standard_name, label in task_columns:
                default_index = 0
                if metadata_col_mapping is not None:
                    try:
                        default_index = col_names.index(get_key(metadata_col_mapping, standard_name))
                    except ValueError:
                        # The metadata does not point at a column of this dataset for the
                        # selected task (dataset <--> task mismatch): start from the first column.
                        default_index = 0
                selections.append((st.selectbox(label, col_names, index=default_index), standard_name))
            # Record the choices only after every selectbox has been rendered, so that
            # the metadata lookups above never see a partially updated mapping.
            for dataset_column, standard_name in selections:
                col_mapping[dataset_column] = standard_name

    # Select metrics
    st.markdown("**Select metrics**")
    st.markdown("The following metrics will be computed")
    default_metrics = TASK_TO_DEFAULT_METRICS[selected_task]
    # Each default metric is rendered as a small grey badge floated to the left.
    badge_prefix = (
        '<div style="padding-right:5px;padding-left:5px;padding-top:5px;padding-bottom:5px;float:left">'
        '<div style="background-color:#D3D3D3;border-radius:5px;display:inline-block;padding-right:5px;'
        'padding-left:5px;color:white">'
    )
    html_string = " ".join(f"{badge_prefix}{metric}</div></div>" for metric in default_metrics)
    st.markdown(html_string, unsafe_allow_html=True)

    # Extra metrics on offer: every supported metric that is not already a default for the task.
    optional_metrics = sorted(set(SUPPORTED_METRICS).difference(default_metrics))
    selected_metrics = st.multiselect(
        "(Optional) Select additional metrics",
        optional_metrics,
        help="""User-selected metrics will be computed with their default arguments. \
            For example, `f1` will report results for binary labels. \
            Check out the [available metrics](https://huggingface.co/metrics) for more details.""",
    )

with st.form(key="form"):
    # Models offered for evaluation for the chosen task and dataset.
    compatible_models = get_compatible_models(selected_task, [selected_dataset])
    model_help = """Don't see your favourite model in this list? Add the dataset and task it was trained on to the \
            [model card metadata.](https://huggingface.co/docs/hub/models-cards#model-card-metadata)"""
    selected_models = st.multiselect("Select the models you wish to evaluate", compatible_models, help=model_help)
    print("INFO -- Selected models before filter:", selected_models)

    # The username is forwarded with the evaluation request.
    hf_username = st.text_input("Enter your 🤗 Hub username to be notified when the evaluation is finished")

    submit_button = st.form_submit_button("Evaluate models 🚀")

    # Runs only when the submit button was pressed: validate the inputs, create the
    # AutoTrain project, attach the dataset and start processing.
    if submit_button:
        if not hf_username:
            st.warning("No 🤗 Hub username provided! Please enter your username and try again.")
        elif not selected_models:
            st.warning("⚠️ No models were selected for evaluation! Please select at least one model and try again.")
        elif len(selected_models) > 10:
            st.warning("Only 10 models can be evaluated at once. Please select fewer models and try again.")
        else:
            # Shown whenever one of the backend calls reports a failure.
            submission_error = "🙈 Oh no, there was an error submitting your evaluation job!"
            # Filter out previously evaluated models
            selected_models = filter_evaluated_models(
                selected_models,
                selected_task,
                selected_dataset,
                selected_config,
                selected_split,
                selected_metrics,
            )
            print("INFO -- Selected models after filter:", selected_models)
            if len(selected_models) > 0:
                # Tasks listed in AUTOTRAIN_MACHINE run on SageMaker with their own instance type.
                uses_dedicated_machine = selected_task in AUTOTRAIN_MACHINE
                project_payload = {
                    "username": AUTOTRAIN_USERNAME,
                    "proj_name": create_autotrain_project_name(selected_dataset, selected_config),
                    "task": TASK_TO_ID[selected_task],
                    "config": {
                        "language": AUTOTRAIN_TASK_TO_LANG.get(selected_task, "en"),
                        "max_models": 5,
                        "instance": {
                            "provider": "sagemaker" if uses_dedicated_machine else "ovh",
                            "instance_type": AUTOTRAIN_MACHINE.get(selected_task, "p3"),
                            "max_runtime_seconds": 172800,
                            "num_instances": 1,
                            "disk_size_gb": 200,
                        },
                        "evaluation": {
                            "metrics": selected_metrics,
                            "models": selected_models,
                            "hf_username": hf_username,
                        },
                    },
                }
                print(f"INFO -- Payload: {project_payload}")
                project_json_resp = http_post(
                    path="/projects/create",
                    payload=project_payload,
                    token=HF_TOKEN,
                    domain=AUTOTRAIN_BACKEND_API,
                ).json()
                print(f"INFO -- Project creation response: {project_json_resp}")

                # `.get` so that an error payload without the expected key is reported
                # to the user instead of raising a KeyError.
                if project_json_resp.get("created"):
                    project_path = f"/projects/{project_json_resp['id']}"
                    data_payload = {
                        "split": 4,  # use "auto" split choice in AutoTrain
                        "col_mapping": col_mapping,
                        "load_config": {"max_size_bytes": 0, "shuffle": False},
                        "dataset_id": selected_dataset,
                        "dataset_config": selected_config,
                        "dataset_split": selected_split,
                    }
                    data_json_resp = http_post(
                        path=f"{project_path}/data/dataset",
                        payload=data_payload,
                        token=HF_TOKEN,
                        domain=AUTOTRAIN_BACKEND_API,
                    ).json()
                    print(f"INFO -- Dataset creation response: {data_json_resp}")
                    if data_json_resp.get("download_status") == 1:
                        train_json_resp = http_post(
                            path=f"{project_path}/data/start_processing",
                            token=HF_TOKEN,
                            domain=AUTOTRAIN_BACKEND_API,
                        ).json()
                        # For local development we process and approve projects on-the-fly
                        if "localhost" in AUTOTRAIN_BACKEND_API:
                            with st.spinner("⏳ Waiting for data processing to complete ..."):
                                # Poll every 10 seconds until the project reports status 3,
                                # which this loop treats as "data processing finished".
                                while True:
                                    project_status = http_get(
                                        path=project_path,
                                        token=HF_TOKEN,
                                        domain=AUTOTRAIN_BACKEND_API,
                                    ).json()
                                    if project_status["status"] == 3:
                                        break
                                    time.sleep(10)

                            # Approve training job
                            train_job_resp = http_post(
                                path=f"{project_path}/start_training",
                                token=HF_TOKEN,
                                domain=AUTOTRAIN_BACKEND_API,
                            ).json()
                            print(f"INFO -- Training approval response: {train_job_resp}")
                            st.success("✅  Data processing and project approval complete - go forth and evaluate!")
                        else:
                            # Prod/staging submissions are evaluated in a cron job via run_evaluation_jobs.py
                            print(f"INFO -- AutoTrain job response: {train_json_resp}")
                            if train_json_resp.get("success"):
                                # Metadata snippet the user can paste into the dataset card.
                                train_eval_index = {
                                    "train-eval-index": [
                                        {
                                            "config": selected_config,
                                            "task": AUTOTRAIN_TASK_TO_HUB_TASK[selected_task],
                                            "task_id": selected_task,
                                            "splits": {"eval_split": selected_split},
                                            "col_mapping": col_mapping,
                                        }
                                    ]
                                }
                                selected_metadata = yaml.dump(train_eval_index, sort_keys=False)
                                dataset_card_url = get_dataset_card_url(selected_dataset)
                                st.success("✅ Successfully submitted evaluation job!")
                                st.markdown(
                                    f"""
                                Evaluation can take up to 1 hour to complete, so grab a ☕️ or 🍵 while you wait:

                                * 🔔 A [Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions) with the evaluation results will be opened for each model you selected. Check your email for notifications.
                                * 📊 Click [here](https://hf.co/spaces/autoevaluate/leaderboards?dataset={selected_dataset}) to view the results from your submission once the Hub pull request is merged.
                                * 🥱 Tired of configuring evaluations? Add the following metadata to the [dataset card]({dataset_card_url}) to enable 1-click evaluations:
                                """  # noqa
                                )
                                st.markdown(
                                    f"""
                                ```yaml
                                {selected_metadata}
                                """
                                )
                                print("INFO -- Pushing evaluation job logs to the Hub")
                                evaluation_log = {
                                    "project_id": project_json_resp["id"],
                                    "autotrain_env": "staging" if "staging" in AUTOTRAIN_BACKEND_API else "prod",
                                    "payload": project_payload,
                                    "project_creation_response": project_json_resp,
                                    "dataset_creation_response": data_json_resp,
                                    "autotrain_job_response": train_json_resp,
                                }
                                commit_evaluation_log(evaluation_log, hf_access_token=HF_TOKEN)
                            else:
                                st.error(submission_error)
                    else:
                        # The dataset could not be attached to the project.
                        st.error(submission_error)
                else:
                    # The backend did not create the project.
                    st.error(submission_error)
            else:
                st.warning("⚠️ No models left to evaluate! Please select other models and try again.")