HadrienByr and lewtun committed on
Commit
3ce0948
·
0 Parent(s):

Duplicate from autoevaluate/model-evaluator


Co-authored-by: Lewis Tunstall <[email protected]>

.env.template ADDED
@@ -0,0 +1,4 @@
+ AUTOTRAIN_USERNAME=autoevaluator # The bot or user that authors evaluation jobs
+ HF_TOKEN=hf_xxx # An API token of the `autoevaluator` user
+ AUTOTRAIN_BACKEND_API=https://api-staging.autotrain.huggingface.co # The AutoTrain backend to send jobs to. Use https://api.autotrain.huggingface.co for prod or http://localhost:8000 for local development
+ DATASETS_PREVIEW_API=https://datasets-server.huggingface.co # The API to grab dataset information from
.github/workflows/check_filesize.yml ADDED
@@ -0,0 +1,16 @@
+ name: Check file size
+ on: # or directly `on: [push]` to run the action on every push on any branch
+   pull_request:
+     branches: [main]
+
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Check large files
+         uses: ActionsDesk/[email protected]
+         with:
+           filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
.github/workflows/quality.yml ADDED
@@ -0,0 +1,29 @@
+ name: Code quality
+
+ on:
+   push:
+     branches:
+       - main
+   pull_request:
+     branches:
+       - main
+
+ jobs:
+
+   check_code_quality:
+     name: Check code quality
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout code
+         uses: actions/checkout@v2
+       - name: Setup Python environment
+         uses: actions/setup-python@v2
+         with:
+           python-version: 3.9
+       - name: Install dependencies
+         run: |
+           python -m pip install --upgrade pip
+           python -m pip install black isort flake8
+       - name: Code quality
+         run: |
+           make quality
.github/workflows/run_evaluation_jobs.yml ADDED
@@ -0,0 +1,30 @@
+ name: Start evaluation jobs
+
+ on:
+   schedule:
+     - cron: '*/15 * * * *' # Start evaluations every 15th minute
+
+ jobs:
+
+   build:
+     runs-on: ubuntu-latest
+
+     steps:
+       - name: Checkout code
+         uses: actions/checkout@v2
+
+       - name: Setup Python Environment
+         uses: actions/setup-python@v2
+         with:
+           python-version: 3.8
+
+       - name: Install requirements
+         run: pip install -r requirements.txt
+
+       - name: Execute scoring script
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+           AUTOTRAIN_USERNAME: ${{ secrets.AUTOTRAIN_USERNAME }}
+           AUTOTRAIN_BACKEND_API: ${{ secrets.AUTOTRAIN_BACKEND_API }}
+         run: |
+           HF_TOKEN=$HF_TOKEN AUTOTRAIN_USERNAME=$AUTOTRAIN_USERNAME AUTOTRAIN_BACKEND_API=$AUTOTRAIN_BACKEND_API python run_evaluation_jobs.py
.github/workflows/sync_with_spaces.yml ADDED
@@ -0,0 +1,20 @@
+ name: Sync to Hugging Face hub
+ on:
+   push:
+     branches: [main]
+
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v2
+         with:
+           fetch-depth: 0
+       - name: Push to hub
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: |
+           git push https://lewtun:[email protected]/spaces/autoevaluate/model-evaluator main
.gitignore ADDED
@@ -0,0 +1,134 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ scratch/
+
+ # Evaluation job logs
+ evaluation-job-logs/
LICENSE ADDED
@@ -0,0 +1,201 @@
+                                  Apache License
+                            Version 2.0, January 2004
+                         http://www.apache.org/licenses/
+
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+    1. Definitions.
+
+       "License" shall mean the terms and conditions for use, reproduction,
+       and distribution as defined by Sections 1 through 9 of this document.
+
+       "Licensor" shall mean the copyright owner or entity authorized by
+       the copyright owner that is granting the License.
+
+       "Legal Entity" shall mean the union of the acting entity and all
+       other entities that control, are controlled by, or are under common
+       control with that entity. For the purposes of this definition,
+       "control" means (i) the power, direct or indirect, to cause the
+       direction or management of such entity, whether by contract or
+       otherwise, or (ii) ownership of fifty percent (50%) or more of the
+       outstanding shares, or (iii) beneficial ownership of such entity.
+
+       "You" (or "Your") shall mean an individual or Legal Entity
+       exercising permissions granted by this License.
+
+       "Source" form shall mean the preferred form for making modifications,
+       including but not limited to software source code, documentation
+       source, and configuration files.
+
+       "Object" form shall mean any form resulting from mechanical
+       transformation or translation of a Source form, including but
+       not limited to compiled object code, generated documentation,
+       and conversions to other media types.
+
+       "Work" shall mean the work of authorship, whether in Source or
+       Object form, made available under the License, as indicated by a
+       copyright notice that is included in or attached to the work
+       (an example is provided in the Appendix below).
+
+       "Derivative Works" shall mean any work, whether in Source or Object
+       form, that is based on (or derived from) the Work and for which the
+       editorial revisions, annotations, elaborations, or other modifications
+       represent, as a whole, an original work of authorship. For the purposes
+       of this License, Derivative Works shall not include works that remain
+       separable from, or merely link (or bind by name) to the interfaces of,
+       the Work and Derivative Works thereof.
+
+       "Contribution" shall mean any work of authorship, including
+       the original version of the Work and any modifications or additions
+       to that Work or Derivative Works thereof, that is intentionally
+       submitted to Licensor for inclusion in the Work by the copyright owner
+       or by an individual or Legal Entity authorized to submit on behalf of
+       the copyright owner. For the purposes of this definition, "submitted"
+       means any form of electronic, verbal, or written communication sent
+       to the Licensor or its representatives, including but not limited to
+       communication on electronic mailing lists, source code control systems,
+       and issue tracking systems that are managed by, or on behalf of, the
+       Licensor for the purpose of discussing and improving the Work, but
+       excluding communication that is conspicuously marked or otherwise
+       designated in writing by the copyright owner as "Not a Contribution."
+
+       "Contributor" shall mean Licensor and any individual or Legal Entity
+       on behalf of whom a Contribution has been received by Licensor and
+       subsequently incorporated within the Work.
+
+    2. Grant of Copyright License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       copyright license to reproduce, prepare Derivative Works of,
+       publicly display, publicly perform, sublicense, and distribute the
+       Work and such Derivative Works in Source or Object form.
+
+    3. Grant of Patent License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       (except as stated in this section) patent license to make, have made,
+       use, offer to sell, sell, import, and otherwise transfer the Work,
+       where such license applies only to those patent claims licensable
+       by such Contributor that are necessarily infringed by their
+       Contribution(s) alone or by combination of their Contribution(s)
+       with the Work to which such Contribution(s) was submitted. If You
+       institute patent litigation against any entity (including a
+       cross-claim or counterclaim in a lawsuit) alleging that the Work
+       or a Contribution incorporated within the Work constitutes direct
+       or contributory patent infringement, then any patent licenses
+       granted to You under this License for that Work shall terminate
+       as of the date such litigation is filed.
+
+    4. Redistribution. You may reproduce and distribute copies of the
+       Work or Derivative Works thereof in any medium, with or without
+       modifications, and in Source or Object form, provided that You
+       meet the following conditions:
+
+       (a) You must give any other recipients of the Work or
+           Derivative Works a copy of this License; and
+
+       (b) You must cause any modified files to carry prominent notices
+           stating that You changed the files; and
+
+       (c) You must retain, in the Source form of any Derivative Works
+           that You distribute, all copyright, patent, trademark, and
+           attribution notices from the Source form of the Work,
+           excluding those notices that do not pertain to any part of
+           the Derivative Works; and
+
+       (d) If the Work includes a "NOTICE" text file as part of its
+           distribution, then any Derivative Works that You distribute must
+           include a readable copy of the attribution notices contained
+           within such NOTICE file, excluding those notices that do not
+           pertain to any part of the Derivative Works, in at least one
+           of the following places: within a NOTICE text file distributed
+           as part of the Derivative Works; within the Source form or
+           documentation, if provided along with the Derivative Works; or,
+           within a display generated by the Derivative Works, if and
+           wherever such third-party notices normally appear. The contents
+           of the NOTICE file are for informational purposes only and
+           do not modify the License. You may add Your own attribution
+           notices within Derivative Works that You distribute, alongside
+           or as an addendum to the NOTICE text from the Work, provided
+           that such additional attribution notices cannot be construed
+           as modifying the License.
+
+       You may add Your own copyright statement to Your modifications and
+       may provide additional or different license terms and conditions
+       for use, reproduction, or distribution of Your modifications, or
+       for any such Derivative Works as a whole, provided Your use,
+       reproduction, and distribution of the Work otherwise complies with
+       the conditions stated in this License.
+
+    5. Submission of Contributions. Unless You explicitly state otherwise,
+       any Contribution intentionally submitted for inclusion in the Work
+       by You to the Licensor shall be under the terms and conditions of
+       this License, without any additional terms or conditions.
+       Notwithstanding the above, nothing herein shall supersede or modify
+       the terms of any separate license agreement you may have executed
+       with Licensor regarding such Contributions.
+
+    6. Trademarks. This License does not grant permission to use the trade
+       names, trademarks, service marks, or product names of the Licensor,
+       except as required for reasonable and customary use in describing the
+       origin of the Work and reproducing the content of the NOTICE file.
+
+    7. Disclaimer of Warranty. Unless required by applicable law or
+       agreed to in writing, Licensor provides the Work (and each
+       Contributor provides its Contributions) on an "AS IS" BASIS,
+       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+       implied, including, without limitation, any warranties or conditions
+       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+       PARTICULAR PURPOSE. You are solely responsible for determining the
+       appropriateness of using or redistributing the Work and assume any
+       risks associated with Your exercise of permissions under this License.
+
+    8. Limitation of Liability. In no event and under no legal theory,
+       whether in tort (including negligence), contract, or otherwise,
+       unless required by applicable law (such as deliberate and grossly
+       negligent acts) or agreed to in writing, shall any Contributor be
+       liable to You for damages, including any direct, indirect, special,
+       incidental, or consequential damages of any character arising as a
+       result of this License or out of the use or inability to use the
+       Work (including but not limited to damages for loss of goodwill,
+       work stoppage, computer failure or malfunction, or any and all
+       other commercial damages or losses), even if such Contributor
+       has been advised of the possibility of such damages.
+
+    9. Accepting Warranty or Additional Liability. While redistributing
+       the Work or Derivative Works thereof, You may choose to offer,
+       and charge a fee for, acceptance of support, warranty, indemnity,
+       or other liability obligations and/or rights consistent with this
+       License. However, in accepting such obligations, You may act only
+       on Your own behalf and on Your sole responsibility, not on behalf
+       of any other Contributor, and only if You agree to indemnify,
+       defend, and hold each Contributor harmless for any liability
+       incurred by, or claims asserted against, such Contributor by reason
+       of your accepting any such warranty or additional liability.
+
+    END OF TERMS AND CONDITIONS
+
+    APPENDIX: How to apply the Apache License to your work.
+
+       To apply the Apache License to your work, attach the following
+       boilerplate notice, with the fields enclosed by brackets "[]"
+       replaced with your own identifying information. (Don't include
+       the brackets!) The text should be enclosed in the appropriate
+       comment syntax for the file format. We also recommend that a
+       file or class name and description of purpose be included on the
+       same "printed page" as the copyright notice for easier
+       identification within third-party archives.
+
+    Copyright [yyyy] [name of copyright owner]
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
Makefile ADDED
@@ -0,0 +1,8 @@
+ style:
+ 	python -m black --line-length 119 --target-version py39 .
+ 	python -m isort .
+
+ quality:
+ 	python -m black --check --line-length 119 --target-version py39 .
+ 	python -m isort --check-only .
+ 	python -m flake8 --max-line-length 119
README.md ADDED
@@ -0,0 +1,114 @@
+ ---
+ title: Model Evaluator
+ emoji: 📊
+ colorFrom: red
+ colorTo: red
+ sdk: streamlit
+ sdk_version: 1.10.0
+ app_file: app.py
+ duplicated_from: autoevaluate/model-evaluator
+ ---
+
+ # Model Evaluator
+
+ > Submit evaluation jobs to AutoTrain from the Hugging Face Hub
+
+ ## Supported tasks
+
+ The table below shows which tasks are currently supported for evaluation in the AutoTrain backend:
+
+ | Task                               | Supported |
+ |:-----------------------------------|:---------:|
+ | `binary_classification`            | ✅ |
+ | `multi_class_classification`       | ✅ |
+ | `multi_label_classification`       | ❌ |
+ | `entity_extraction`                | ✅ |
+ | `extractive_question_answering`    | ✅ |
+ | `translation`                      | ✅ |
+ | `summarization`                    | ✅ |
+ | `image_binary_classification`      | ✅ |
+ | `image_multi_class_classification` | ✅ |
+ | `text_zero_shot_classification`    | ✅ |
+
+
+ ## Installation
+
+ To run the application locally, first clone this repository and install the dependencies as follows:
+
+ ```
+ pip install -r requirements.txt
+ ```
+
+ Next, copy the example file of environment variables:
+
+ ```
+ cp .env.template .env
+ ```
+
+ and set the `HF_TOKEN` variable with a valid API token from the [`autoevaluator`](https://huggingface.co/autoevaluator) bot user. Finally, spin up the application by running:
+
+ ```
+ streamlit run app.py
+ ```
+
+ ## Usage
+
+ Evaluation on the Hub involves two main steps:
+
+ 1. Submitting an evaluation job via the UI. This creates an AutoTrain project with `N` models for evaluation. At this stage, the dataset is also processed and prepared for evaluation.
+ 2. Triggering the evaluation itself once the dataset is processed.
+
+ From the user perspective, only step (1) is needed since step (2) is handled by a cron job on GitHub Actions that executes the `run_evaluation_jobs.py` script every 15 minutes.
+
+ See below for details on manually triggering evaluation jobs.
+
+ ### Triggering an evaluation
+
+ To evaluate the models in an AutoTrain project, run:
+
+ ```
+ python run_evaluation_jobs.py
+ ```
+
+ This will download the [`autoevaluate/evaluation-job-logs`](https://huggingface.co/datasets/autoevaluate/evaluation-job-logs) dataset from the Hub and check which evaluation projects are ready for evaluation (i.e. those whose dataset has been processed).
+
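+ Under the hood, a project counts as ready once AutoTrain reports that its data processing has finished. A minimal sketch of the readiness check (mirroring the logic in `run_evaluation_jobs.py`, where status code `3` means data processing is complete; `http_get` and `http_post` are the helpers from `utils.py`):
+
+ ```python
+ project_info = http_get(
+     path=f"/projects/{project_id}", token=HF_TOKEN, domain=AUTOTRAIN_BACKEND_API
+ ).json()
+ # Only approve projects whose data is processed and that haven't been evaluated yet
+ if project_info["status"] == 3 and project_info["training_status"] == "not_started":
+     http_post(
+         path=f"/projects/{project_id}/start_training", token=HF_TOKEN, domain=AUTOTRAIN_BACKEND_API
+     )
+ ```
+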
+ ## AutoTrain configuration details
+
+ Models are evaluated by the [`autoevaluator`](https://huggingface.co/autoevaluator) bot user in AutoTrain, with the payload sent to the endpoint defined by the `AUTOTRAIN_BACKEND_API` environment variable. Evaluation projects are created and run on either the `prod` or `staging` environments. You can view the status of projects in the AutoTrain UI by navigating to one of the links below (ask internally for access to the staging UI):
+
+ | AutoTrain environment | AutoTrain UI URL | `AUTOTRAIN_BACKEND_API` |
+ |:---------------------:|:----------------:|:-----------------------:|
+ | `prod`    | [`https://ui.autotrain.huggingface.co/projects`](https://ui.autotrain.huggingface.co/projects) | https://api.autotrain.huggingface.co |
+ | `staging` | [`https://ui-staging.autotrain.huggingface.co/projects`](https://ui-staging.autotrain.huggingface.co/projects) | https://api-staging.autotrain.huggingface.co |
+
+
+ The current configuration for evaluation jobs running on [Spaces](https://huggingface.co/spaces/autoevaluate/model-evaluator) is:
+
+ ```
+ AUTOTRAIN_BACKEND_API=https://api.autotrain.huggingface.co
+ ```
+
+ To evaluate models with a _local_ instance of AutoTrain, change the environment to:
+
+ ```
+ AUTOTRAIN_BACKEND_API=http://localhost:8000
+ ```
+
+ ### Migrating from staging to production (and vice versa)
+
+ In general, evaluation jobs should run in AutoTrain's `prod` environment, which is defined by the following environment variable:
+
+ ```
+ AUTOTRAIN_BACKEND_API=https://api.autotrain.huggingface.co
+ ```
+
+ However, there are times when it is necessary to run evaluation jobs in AutoTrain's `staging` environment (e.g. because a new evaluation pipeline is being deployed). In these cases the corresponding environment variable is:
+
+ ```
+ AUTOTRAIN_BACKEND_API=https://api-staging.autotrain.huggingface.co
+ ```
+
+ To migrate between these two environments, update the `AUTOTRAIN_BACKEND_API` variable in two places:
+
+ * In the [repo secrets](https://huggingface.co/spaces/autoevaluate/model-evaluator/settings) associated with the `model-evaluator` Space. This will ensure evaluation projects are created in the desired environment.
+ * In the [GitHub Actions secrets](https://github.com/huggingface/model-evaluator/settings/secrets/actions) associated with this repo. This will ensure that the correct evaluation jobs are approved and launched via the `run_evaluation_jobs.py` script.
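+
+ ### Dataset card metadata for 1-click evaluations
+
+ Datasets whose cards carry `train-eval-index` metadata can be evaluated without manually configuring the column mapping. The sketch below shows the rough shape of that metadata as written by `app.py`; the config, split, and column names are illustrative and should match your dataset:
+
+ ```yaml
+ train-eval-index:
+   - config: default
+     task: text-classification
+     task_id: binary_classification
+     splits:
+       eval_split: test
+     col_mapping:
+       sentence: text
+       label: target
+ ```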
app.py ADDED
@@ -0,0 +1,693 @@
+ import os
+ import time
+ from pathlib import Path
+
+ import pandas as pd
+ import streamlit as st
+ import yaml
+ from datasets import get_dataset_config_names
+ from dotenv import load_dotenv
+ from huggingface_hub import list_datasets
+
+ from evaluation import filter_evaluated_models
+ from utils import (
+     AUTOTRAIN_TASK_TO_HUB_TASK,
+     commit_evaluation_log,
+     create_autotrain_project_name,
+     format_col_mapping,
+     get_compatible_models,
+     get_config_metadata,
+     get_dataset_card_url,
+     get_key,
+     get_metadata,
+     http_get,
+     http_post,
+ )
+
+ if Path(".env").is_file():
+     load_dotenv(".env")
+
+ HF_TOKEN = os.getenv("HF_TOKEN")
+ AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME")
+ AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")
+ DATASETS_PREVIEW_API = os.getenv("DATASETS_PREVIEW_API")
+
+ # Put image tasks on top
+ TASK_TO_ID = {
+     "image_binary_classification": 17,
+     "image_multi_class_classification": 18,
+     "binary_classification": 1,
+     "multi_class_classification": 2,
+     "natural_language_inference": 22,
+     "entity_extraction": 4,
+     "extractive_question_answering": 5,
+     "translation": 6,
+     "summarization": 8,
+     "text_zero_shot_classification": 23,
+ }
+
+ TASK_TO_DEFAULT_METRICS = {
+     "binary_classification": ["f1", "precision", "recall", "auc", "accuracy"],
+     "multi_class_classification": [
+         "f1",
+         "precision",
+         "recall",
+         "accuracy",
+     ],
+     "natural_language_inference": ["f1", "precision", "recall", "auc", "accuracy"],
+     "entity_extraction": ["precision", "recall", "f1", "accuracy"],
+     "extractive_question_answering": ["f1", "exact_match"],
+     "translation": ["sacrebleu"],
+     "summarization": ["rouge1", "rouge2", "rougeL", "rougeLsum"],
+     "image_binary_classification": ["f1", "precision", "recall", "auc", "accuracy"],
+     "image_multi_class_classification": [
+         "f1",
+         "precision",
+         "recall",
+         "accuracy",
+     ],
+     "text_zero_shot_classification": ["accuracy", "loss"],
+ }
+
+ AUTOTRAIN_TASK_TO_LANG = {
+     "translation": "en2de",
+     "image_binary_classification": "unk",
+     "image_multi_class_classification": "unk",
+ }
+
+ AUTOTRAIN_MACHINE = {"text_zero_shot_classification": "r5.16x"}
+
+
+ SUPPORTED_TASKS = list(TASK_TO_ID.keys())
+
+ # Extracted from utils.get_supported_metrics
+ # Hardcoded for now due to speed / caching constraints
+ SUPPORTED_METRICS = [
+     "accuracy",
+     "bertscore",
+     "bleu",
+     "cer",
+     "chrf",
+     "code_eval",
+     "comet",
+     "competition_math",
+     "coval",
+     "cuad",
+     "exact_match",
+     "f1",
+     "frugalscore",
+     "google_bleu",
+     "mae",
+     "mahalanobis",
+     "matthews_correlation",
+     "mean_iou",
+     "meteor",
+     "mse",
+     "pearsonr",
+     "perplexity",
+     "precision",
+     "recall",
+     "roc_auc",
+     "rouge",
+     "sacrebleu",
+     "sari",
+     "seqeval",
+     "spearmanr",
+     "squad",
+     "squad_v2",
+     "ter",
+     "trec_eval",
+     "wer",
+     "wiki_split",
+     "xnli",
+     "angelina-wang/directional_bias_amplification",
+     "jordyvl/ece",
+     "lvwerra/ai4code",
+     "lvwerra/amex",
+ ]
+
+
+ #######
+ # APP #
+ #######
+ st.title("Evaluation on the Hub")
+ st.markdown(
+     """
+     Welcome to Hugging Face's automatic model evaluator 👋!
+
+     This application allows you to evaluate 🤗 Transformers
+     [models](https://huggingface.co/models?library=transformers&sort=downloads)
+     across a wide variety of [datasets](https://huggingface.co/datasets) on the
+     Hub. Please select the dataset and configuration below. The results of your
+     evaluation will be displayed on the [public
+     leaderboards](https://huggingface.co/spaces/autoevaluate/leaderboards). For
+     more details, check out our [blog
+     post](https://huggingface.co/blog/eval-on-the-hub).
+     """
+ )
+
+ all_datasets = [d.id for d in list_datasets()]
+ query_params = st.experimental_get_query_params()
+ if "first_query_params" not in st.session_state:
+     st.session_state.first_query_params = query_params
+ first_query_params = st.session_state.first_query_params
+ default_dataset = all_datasets[0]
+ if "dataset" in first_query_params:
+     if len(first_query_params["dataset"]) > 0 and first_query_params["dataset"][0] in all_datasets:
+         default_dataset = first_query_params["dataset"][0]
+
+ selected_dataset = st.selectbox(
+     "Select a dataset",
+     all_datasets,
+     index=all_datasets.index(default_dataset),
+     help="""Datasets with metadata can be evaluated with 1-click. Configure an evaluation job to add \
+     new metadata to a dataset card.""",
+ )
+ st.experimental_set_query_params(**{"dataset": [selected_dataset]})
+
+ # Check if selected dataset can be streamed
+ is_valid_dataset = http_get(
+     path="/is-valid",
+     domain=DATASETS_PREVIEW_API,
+     params={"dataset": selected_dataset},
+ ).json()
+ if is_valid_dataset["valid"] is False:
+     st.error(
+         """The dataset you selected is not currently supported. Open a \
+         [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) for support."""
+     )
+
+ metadata = get_metadata(selected_dataset, token=HF_TOKEN)
+ print(f"INFO -- Dataset metadata: {metadata}")
+ if metadata is None:
+     st.warning("No evaluation metadata found. Please configure the evaluation job below.")
+
+ with st.expander("Advanced configuration"):
+     # Select task
+     selected_task = st.selectbox(
+         "Select a task",
+         SUPPORTED_TASKS,
+         index=SUPPORTED_TASKS.index(metadata[0]["task_id"]) if metadata is not None else 0,
+         help="""Don't see your favourite task here? Open a \
+         [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) to request it!""",
+     )
+     # Select config
+     configs = get_dataset_config_names(selected_dataset)
+     selected_config = st.selectbox(
+         "Select a config",
+         configs,
+         help="""Some datasets contain several sub-datasets, known as _configurations_. \
+         Select one to evaluate your models on. \
+         See the [docs](https://huggingface.co/docs/datasets/master/en/load_hub#configurations) for more details.
+         """,
+     )
+     # Some datasets have multiple metadata (one per config), so we grab the one associated with the selected config
+     config_metadata = get_config_metadata(selected_config, metadata)
+     print(f"INFO -- Config metadata: {config_metadata}")
+
+     # Select splits
+     splits_resp = http_get(
+         path="/splits",
+         domain=DATASETS_PREVIEW_API,
+         params={"dataset": selected_dataset},
+     )
+     if splits_resp.status_code == 200:
+         split_names = []
+         all_splits = splits_resp.json()
+         for split in all_splits["splits"]:
+             if split["config"] == selected_config:
+                 split_names.append(split["split"])
+
+         if config_metadata is not None:
+             eval_split = config_metadata["splits"].get("eval_split", None)
+         else:
+             eval_split = None
+         selected_split = st.selectbox(
+             "Select a split",
+             split_names,
+             index=split_names.index(eval_split) if eval_split is not None else 0,
+             help="Be wary when evaluating models on the `train` split.",
+         )
+
+     # Select columns
+     rows_resp = http_get(
+         path="/first-rows",
+         domain=DATASETS_PREVIEW_API,
+         params={
+             "dataset": selected_dataset,
+             "config": selected_config,
+             "split": selected_split,
+         },
+     ).json()
+     col_names = list(pd.json_normalize(rows_resp["rows"][0]["row"]).columns)
+
+     st.markdown("**Map your dataset columns**")
+     st.markdown(
+         """The model evaluator uses a standardised set of column names for the input examples and labels. \
+         Please define the mapping between your dataset columns (right) and the standardised column names (left)."""
+     )
+     col1, col2 = st.columns(2)
+
+     # TODO: find a better way to layout these items
+     # TODO: need graceful way of handling dataset <--> task mismatch for datasets with metadata
+     col_mapping = {}
+     if selected_task in ["binary_classification", "multi_class_classification"]:
+         with col1:
+             st.markdown("`text` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`target` column")
+         with col2:
+             text_col = st.selectbox(
+                 "This column should contain the text to be classified",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             target_col = st.selectbox(
+                 "This column should contain the labels associated with the text",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             col_mapping[text_col] = "text"
+             col_mapping[target_col] = "target"
+
+     elif selected_task == "text_zero_shot_classification":
+         with col1:
+             st.markdown("`text` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`classes` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`target` column")
+         with col2:
+             text_col = st.selectbox(
+                 "This column should contain the text to be classified",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             classes_col = st.selectbox(
+                 "This column should contain the classes associated with the text",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "classes"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             target_col = st.selectbox(
+                 "This column should contain the index of the correct class",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             col_mapping[text_col] = "text"
+             col_mapping[classes_col] = "classes"
+             col_mapping[target_col] = "target"
+
+     if selected_task in ["natural_language_inference"]:
+         config_metadata = get_config_metadata(selected_config, metadata)
+         with col1:
+             st.markdown("`text1` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`text2` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`target` column")
+         with col2:
+             text1_col = st.selectbox(
+                 "This column should contain the first text passage to be classified",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "text1"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             text2_col = st.selectbox(
+                 "This column should contain the second text passage to be classified",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "text2"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             target_col = st.selectbox(
+                 "This column should contain the labels associated with the text",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             col_mapping[text1_col] = "text1"
+             col_mapping[text2_col] = "text2"
+             col_mapping[target_col] = "target"
+
+     elif selected_task == "entity_extraction":
+         with col1:
+             st.markdown("`tokens` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`tags` column")
+         with col2:
+             tokens_col = st.selectbox(
+                 "This column should contain the array of tokens to be classified",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "tokens"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             tags_col = st.selectbox(
+                 "This column should contain the labels associated with each part of the text",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "tags"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             col_mapping[tokens_col] = "tokens"
+             col_mapping[tags_col] = "tags"
+
+     elif selected_task == "translation":
+         with col1:
+             st.markdown("`source` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`target` column")
+         with col2:
+             text_col = st.selectbox(
+                 "This column should contain the text to be translated",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "source"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             target_col = st.selectbox(
+                 "This column should contain the target translation",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             col_mapping[text_col] = "source"
+             col_mapping[target_col] = "target"
+
+     elif selected_task == "summarization":
+         with col1:
+             st.markdown("`text` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`target` column")
+         with col2:
+             text_col = st.selectbox(
+                 "This column should contain the text to be summarized",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             target_col = st.selectbox(
+                 "This column should contain the target summary",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             col_mapping[text_col] = "text"
+             col_mapping[target_col] = "target"
+
+     elif selected_task == "extractive_question_answering":
+         if config_metadata is not None:
+             col_mapping = config_metadata["col_mapping"]
+             # Hub YAML parser converts periods to hyphens, so we remap them here
+             col_mapping = format_col_mapping(col_mapping)
+         with col1:
+             st.markdown("`context` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`question` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`answers.text` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`answers.answer_start` column")
+         with col2:
+             context_col = st.selectbox(
+                 "This column should contain the question's context",
+                 col_names,
+                 index=col_names.index(get_key(col_mapping, "context")) if config_metadata is not None else 0,
+             )
+             question_col = st.selectbox(
+                 "This column should contain the question to be answered, given the context",
+                 col_names,
+                 index=col_names.index(get_key(col_mapping, "question")) if config_metadata is not None else 0,
+             )
+             answers_text_col = st.selectbox(
+                 "This column should contain example answers to the question, extracted from the context",
+                 col_names,
+                 index=col_names.index(get_key(col_mapping, "answers.text")) if config_metadata is not None else 0,
+             )
+             answers_start_col = st.selectbox(
+                 "This column should contain the indices in the context of the first character of each `answers.text`",
+                 col_names,
+                 index=col_names.index(get_key(col_mapping, "answers.answer_start"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             col_mapping[context_col] = "context"
+             col_mapping[question_col] = "question"
+             col_mapping[answers_text_col] = "answers.text"
+             col_mapping[answers_start_col] = "answers.answer_start"
+     elif selected_task in ["image_binary_classification", "image_multi_class_classification"]:
+         with col1:
+             st.markdown("`image` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`target` column")
+         with col2:
+             image_col = st.selectbox(
+                 "This column should contain the images to be classified",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "image"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             target_col = st.selectbox(
+                 "This column should contain the labels associated with the images",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             col_mapping[image_col] = "image"
+             col_mapping[target_col] = "target"
+
+     # Select metrics
+     st.markdown("**Select metrics**")
+     st.markdown("The following metrics will be computed")
+     html_string = " ".join(
+         [
+             '<div style="padding-right:5px;padding-left:5px;padding-top:5px;padding-bottom:5px;float:left">'
+             + '<div style="background-color:#D3D3D3;border-radius:5px;display:inline-block;padding-right:5px;'
+             + 'padding-left:5px;color:white">'
+             + metric
+             + "</div></div>"
+             for metric in TASK_TO_DEFAULT_METRICS[selected_task]
+         ]
+     )
+     st.markdown(html_string, unsafe_allow_html=True)
+     selected_metrics = st.multiselect(
+         "(Optional) Select additional metrics",
+         sorted(list(set(SUPPORTED_METRICS) - set(TASK_TO_DEFAULT_METRICS[selected_task]))),
+         help="""User-selected metrics will be computed with their default arguments. \
+         For example, `f1` will report results for binary labels. \
+         Check out the [available metrics](https://huggingface.co/metrics) for more details.""",
+     )
+
+ with st.form(key="form"):
+     compatible_models = get_compatible_models(selected_task, [selected_dataset])
+     selected_models = st.multiselect(
+         "Select the models you wish to evaluate",
+         compatible_models,
+         help="""Don't see your favourite model in this list? Add the dataset and task it was trained on to the \
+         [model card metadata.](https://huggingface.co/docs/hub/models-cards#model-card-metadata)""",
+     )
+     print("INFO -- Selected models before filter:", selected_models)
+
+     hf_username = st.text_input("Enter your 🤗 Hub username to be notified when the evaluation is finished")
+
+     submit_button = st.form_submit_button("Evaluate models 🚀")
+
+     if submit_button:
+         if len(hf_username) == 0:
+             st.warning("No 🤗 Hub username provided! Please enter your username and try again.")
+         elif len(selected_models) == 0:
+             st.warning("⚠️ No models were selected for evaluation! Please select at least one model and try again.")
+         elif len(selected_models) > 10:
+             st.warning("Only 10 models can be evaluated at once. Please select fewer models and try again.")
+         else:
+             # Filter out previously evaluated models
+             selected_models = filter_evaluated_models(
+                 selected_models,
+                 selected_task,
+                 selected_dataset,
+                 selected_config,
+                 selected_split,
+                 selected_metrics,
+             )
+             print("INFO -- Selected models after filter:", selected_models)
+             if len(selected_models) > 0:
+                 project_payload = {
+                     "username": AUTOTRAIN_USERNAME,
+                     "proj_name": create_autotrain_project_name(selected_dataset, selected_config),
+                     "task": TASK_TO_ID[selected_task],
+                     "config": {
+                         "language": AUTOTRAIN_TASK_TO_LANG[selected_task]
+                         if selected_task in AUTOTRAIN_TASK_TO_LANG
+                         else "en",
+                         "max_models": 5,
+                         "instance": {
+                             "provider": "sagemaker" if selected_task in AUTOTRAIN_MACHINE.keys() else "ovh",
+                             "instance_type": AUTOTRAIN_MACHINE[selected_task]
+                             if selected_task in AUTOTRAIN_MACHINE.keys()
+                             else "p3",
+                             "max_runtime_seconds": 172800,
+                             "num_instances": 1,
+                             "disk_size_gb": 200,
+                         },
+                         "evaluation": {
+                             "metrics": selected_metrics,
+                             "models": selected_models,
+                             "hf_username": hf_username,
+                         },
+                     },
+                 }
+                 print(f"INFO -- Payload: {project_payload}")
+                 project_json_resp = http_post(
+                     path="/projects/create",
+                     payload=project_payload,
+                     token=HF_TOKEN,
+                     domain=AUTOTRAIN_BACKEND_API,
+                 ).json()
+                 print(f"INFO -- Project creation response: {project_json_resp}")
+
+                 if project_json_resp["created"]:
+                     data_payload = {
+                         "split": 4,  # use "auto" split choice in AutoTrain
+                         "col_mapping": col_mapping,
+                         "load_config": {"max_size_bytes": 0, "shuffle": False},
+                         "dataset_id": selected_dataset,
+                         "dataset_config": selected_config,
+                         "dataset_split": selected_split,
+                     }
+                     data_json_resp = http_post(
+                         path=f"/projects/{project_json_resp['id']}/data/dataset",
+                         payload=data_payload,
+                         token=HF_TOKEN,
+                         domain=AUTOTRAIN_BACKEND_API,
+                     ).json()
+                     print(f"INFO -- Dataset creation response: {data_json_resp}")
+                     if data_json_resp["download_status"] == 1:
+                         train_json_resp = http_post(
+                             path=f"/projects/{project_json_resp['id']}/data/start_processing",
+                             token=HF_TOKEN,
+                             domain=AUTOTRAIN_BACKEND_API,
+                         ).json()
+                         # For local development we process and approve projects on-the-fly
+                         if "localhost" in AUTOTRAIN_BACKEND_API:
+                             with st.spinner("⏳ Waiting for data processing to complete ..."):
+                                 is_data_processing_success = False
+                                 while is_data_processing_success is not True:
+                                     project_status = http_get(
+                                         path=f"/projects/{project_json_resp['id']}",
+                                         token=HF_TOKEN,
+                                         domain=AUTOTRAIN_BACKEND_API,
+                                     ).json()
+                                     if project_status["status"] == 3:
+                                         is_data_processing_success = True
+                                     time.sleep(10)
+
+                                 # Approve training job
+                                 train_job_resp = http_post(
+                                     path=f"/projects/{project_json_resp['id']}/start_training",
+                                     token=HF_TOKEN,
+                                     domain=AUTOTRAIN_BACKEND_API,
+                                 ).json()
+                                 st.success("✅ Data processing and project approval complete - go forth and evaluate!")
+                         else:
+                             # Prod/staging submissions are evaluated in a cron job via run_evaluation_jobs.py
+                             print(f"INFO -- AutoTrain job response: {train_json_resp}")
+                             if train_json_resp["success"]:
+                                 train_eval_index = {
+                                     "train-eval-index": [
+                                         {
+                                             "config": selected_config,
+                                             "task": AUTOTRAIN_TASK_TO_HUB_TASK[selected_task],
+                                             "task_id": selected_task,
+                                             "splits": {"eval_split": selected_split},
+                                             "col_mapping": col_mapping,
+                                         }
+                                     ]
+                                 }
+                                 selected_metadata = yaml.dump(train_eval_index, sort_keys=False)
+                                 dataset_card_url = get_dataset_card_url(selected_dataset)
+                                 st.success("✅ Successfully submitted evaluation job!")
+                                 st.markdown(
+                                     f"""
+                                     Evaluation can take up to 1 hour to complete, so grab a ☕️ or 🍵 while you wait:
+
+                                     * 🔔 A [Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions) with the evaluation results will be opened for each model you selected. Check your email for notifications.
+                                     * 📊 Click [here](https://hf.co/spaces/autoevaluate/leaderboards?dataset={selected_dataset}) to view the results from your submission once the Hub pull request is merged.
+                                     * 🥱 Tired of configuring evaluations? Add the following metadata to the [dataset card]({dataset_card_url}) to enable 1-click evaluations:
+                                     """  # noqa
+                                 )
+                                 st.markdown(
+                                     f"""
+                                     ```yaml
+                                     {selected_metadata}
+                                     """
+                                 )
+                                 print("INFO -- Pushing evaluation job logs to the Hub")
+                                 evaluation_log = {}
+                                 evaluation_log["project_id"] = project_json_resp["id"]
+                                 evaluation_log["autotrain_env"] = (
+                                     "staging" if "staging" in AUTOTRAIN_BACKEND_API else "prod"
+                                 )
+                                 evaluation_log["payload"] = project_payload
+                                 evaluation_log["project_creation_response"] = project_json_resp
+                                 evaluation_log["dataset_creation_response"] = data_json_resp
+                                 evaluation_log["autotrain_job_response"] = train_json_resp
+                                 commit_evaluation_log(evaluation_log, hf_access_token=HF_TOKEN)
+                             else:
+                                 st.error("🙈 Oh no, there was an error submitting your evaluation job!")
+             else:
+                 st.warning("⚠️ No models left to evaluate! Please select other models and try again.")
evaluation.py ADDED
@@ -0,0 +1,57 @@
+ import copy
+ from dataclasses import dataclass
+
+ import streamlit as st
+ from huggingface_hub import DatasetFilter, HfApi
+ from huggingface_hub.hf_api import DatasetInfo
+
+
+ @dataclass(frozen=True, eq=True)
+ class EvaluationInfo:
+     task: str
+     model: str
+     dataset_name: str
+     dataset_config: str
+     dataset_split: str
+     metrics: set
+
+
+ def create_evaluation_info(dataset_info: DatasetInfo) -> EvaluationInfo:
+     if dataset_info.cardData is not None:
+         metadata = dataset_info.cardData["eval_info"]
+         metadata.pop("col_mapping", None)
+         # TODO(lewtun): populate dataset cards with metric info
+         if "metrics" not in metadata:
+             metadata["metrics"] = frozenset()
+         else:
+             metadata["metrics"] = frozenset(metadata["metrics"])
+         return EvaluationInfo(**metadata)
+
+
+ def get_evaluation_infos():
+     filt = DatasetFilter(author="autoevaluate")
+     evaluation_datasets = HfApi().list_datasets(filter=filt, full=True)
+     return [create_evaluation_info(dset) for dset in evaluation_datasets]
+
+
+ def filter_evaluated_models(models, task, dataset_name, dataset_config, dataset_split, metrics):
+     evaluation_infos = get_evaluation_infos()
+     models_to_filter = copy.copy(models)
+
+     for model in models_to_filter:
+         evaluation_info = EvaluationInfo(
+             task=task,
+             model=model,
+             dataset_name=dataset_name,
+             dataset_config=dataset_config,
+             dataset_split=dataset_split,
+             metrics=frozenset(metrics),
+         )
+         if evaluation_info in evaluation_infos:
+             st.info(
+                 f"Model [`{model}`](https://huggingface.co/{model}) has already been evaluated on this configuration. \
+                 This model will be excluded from the evaluation job..."
+             )
+             models.remove(model)
+
+     return models
images/autotrain_job.png ADDED
images/autotrain_projects.png ADDED
notebooks/flush-prediction-repos.ipynb ADDED
@@ -0,0 +1,177 @@
+ {
+  "cells": [
+   {
+    "cell_type": "markdown",
+    "id": "c8093b9e-ca6a-423d-96c3-5fe21f7109a1",
+    "metadata": {},
+    "source": [
+     "## Imports"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "id": "efe8cda7-a687-4867-b1f0-8efbcd428681",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import os\n",
+     "from pathlib import Path\n",
+     "\n",
+     "from dotenv import load_dotenv\n",
+     "from huggingface_hub import DatasetFilter, delete_repo, list_datasets\n",
+     "from tqdm.auto import tqdm\n",
+     "\n",
+     "if Path(\".env\").is_file():\n",
+     "    load_dotenv(\".env\")\n",
+     "\n",
+     "HF_TOKEN = os.getenv(\"HF_TOKEN\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "8f6e01f0-b658-451f-999c-e08d9f4bbbd3",
+    "metadata": {},
+    "source": [
+     "## Get all prediction repos from autoevaluate org"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "id": "2e369478-66d3-498d-a8fd-95bc9180f362",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def get_prediction_repos():\n",
+     "    all_repos = list_datasets(author=\"autoevaluate\")\n",
+     "    prediction_repos = [\n",
+     "        repo for repo in all_repos if repo.id.split(\"/\")[1].startswith(\"autoeval-\")\n",
+     "    ]\n",
+     "    return prediction_repos"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 3,
+    "id": "542db019-d01f-42f5-bef4-888dae8eeadb",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "66"
+       ]
+      },
+      "execution_count": 3,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "prediction_repos = get_prediction_repos()\n",
+     "len(prediction_repos)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 4,
+    "id": "331cfabf-4b73-490f-8d6a-86b5bc162666",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "DatasetInfo: {\n",
+        "\tid: autoevaluate/autoeval-staging-eval-project-9dcc51b5-6464670\n",
+        "\tsha: d3bb02be592d167f7a217ac9341d187142d9a90a\n",
+        "\tlastModified: 2022-06-13T14:54:34.000Z\n",
+        "\ttags: ['type:predictions', 'tags:autotrain', 'tags:evaluation', 'datasets:glue']\n",
+        "\tprivate: False\n",
+        "\tauthor: autoevaluate\n",
+        "\tdescription: None\n",
+        "\tcitation: None\n",
+        "\tcardData: None\n",
+        "\tsiblings: None\n",
+        "\tgated: False\n",
+        "\tdownloads: 12\n",
+        "}"
+       ]
+      },
+      "execution_count": 4,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "prediction_repos[0]"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "57a86b69-ffe8-4035-8f3d-5c917d8ce7bf",
+    "metadata": {},
+    "source": [
+     "## Delete all prediction repos"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 5,
+    "id": "6c8e23e7-2a6d-437b-9742-17f37684d9eb",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "06fa304dcc6d44e39205b20a5e488052",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "  0%|          | 0/66 [00:00<?, ?it/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     }
+    ],
+    "source": [
+     "for repo in tqdm(prediction_repos):\n",
+     "    delete_repo(\n",
+     "        repo_id=repo.id,\n",
+     "        repo_type=\"dataset\",\n",
+     "    )"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "7d64b0aa-d05f-4497-9bd2-eb2fc0d8bd7a",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "autoevaluate",
+    "language": "python",
+    "name": "autoevaluate"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.8.13"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
pyproject.toml ADDED
@@ -0,0 +1,2 @@
+ [tool.isort]
+ profile = "black"
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ huggingface-hub<0.8
2
+ python-dotenv
3
+ streamlit==1.10.0
4
+ datasets<2.3
5
+ evaluate<0.2
6
+ jsonlines
7
+ typer
8
+ # Dataset specific deps
9
+ py7zr<0.19
10
+ openpyxl<3.1
11
+ # Pin protobuf: newer releases break downstream packages
12
+ protobuf<=3.20.1
run_evaluation_jobs.py ADDED
@@ -0,0 +1,64 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ import typer
5
+ from datasets import load_dataset
6
+ from dotenv import load_dotenv
7
+
8
+ from utils import http_get, http_post
9
+
10
+ if Path(".env").is_file():
11
+ load_dotenv(".env")
12
+
13
+ HF_TOKEN = os.getenv("HF_TOKEN")
14
+ AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME")
15
+ AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")
16
+
17
+ if "staging" in AUTOTRAIN_BACKEND_API:
18
+ AUTOTRAIN_ENV = "staging"
19
+ else:
20
+ AUTOTRAIN_ENV = "prod"
21
+
22
+
23
+ def main():
24
+ print(f"💡 Starting jobs on {AUTOTRAIN_ENV} environment")
25
+ logs_df = load_dataset("autoevaluate/evaluation-job-logs", use_auth_token=HF_TOKEN, split="train").to_pandas()
26
+ # Filter out legacy AutoTrain submissions from before the project-approval requirement
27
+ projects_df = logs_df.copy()[(~logs_df["project_id"].isnull())]
28
+ # Filter IDs for appropriate AutoTrain env (staging vs prod)
29
+ projects_df = projects_df.copy().query(f"autotrain_env == '{AUTOTRAIN_ENV}'")
30
+ projects_to_approve = projects_df["project_id"].astype(int).tolist()
31
+ failed_approvals = []
32
+ print(f"🚀 Found {len(projects_to_approve)} evaluation projects to approve!")
33
+
34
+ for project_id in projects_to_approve:
35
+ print(f"Attempting to evaluate project ID {project_id} ...")
36
+ try:
37
+ project_info = http_get(
38
+ path=f"/projects/{project_id}",
39
+ token=HF_TOKEN,
40
+ domain=AUTOTRAIN_BACKEND_API,
41
+ ).json()
42
+ print(project_info)
43
+ # Only start evaluation for projects with completed data processing (status=3)
44
+ if project_info["status"] == 3 and project_info["training_status"] == "not_started":
45
+ train_job_resp = http_post(
46
+ path=f"/projects/{project_id}/start_training",
47
+ token=HF_TOKEN,
48
+ domain=AUTOTRAIN_BACKEND_API,
49
+ ).json()
50
+ print(f"🤖 Project {project_id} approval response: {train_job_resp}")
51
+ else:
52
+ print(f"💪 Project {project_id} either not ready or has already been evaluated. Skipping ...")
53
+ except Exception as e:
54
+ print(f"There was a problem obtaining the project info for project ID {project_id}")
55
+ print(f"Error message: {e}")
56
+ failed_approvals.append(project_id)
58
+
59
+ if len(failed_approvals) > 0:
60
+ print(f"🚨 Failed to approve {len(failed_approvals)} projects: {failed_approvals}")
61
+
62
+
63
+ if __name__ == "__main__":
64
+ typer.run(main)
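+ # Usage sketch (values are illustrative): the script is configured entirely via
+ # environment variables, so a manual run looks like
+ #   HF_TOKEN=hf_xxx AUTOTRAIN_USERNAME=autoevaluator \
+ #   AUTOTRAIN_BACKEND_API=https://api-staging.autotrain.huggingface.co \
+ #   python run_evaluation_jobs.py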
utils.py ADDED
@@ -0,0 +1,215 @@
1
+ import inspect
2
+ import uuid
3
+ from typing import Dict, List, Union
4
+
5
+ import jsonlines
6
+ import requests
7
+ import streamlit as st
8
+ from evaluate import load
9
+ from huggingface_hub import HfApi, ModelFilter, Repository, dataset_info, list_metrics
10
+ from tqdm import tqdm
11
+
12
+ AUTOTRAIN_TASK_TO_HUB_TASK = {
13
+ "binary_classification": "text-classification",
14
+ "multi_class_classification": "text-classification",
15
+ "natural_language_inference": "text-classification",
16
+ "entity_extraction": "token-classification",
17
+ "extractive_question_answering": "question-answering",
18
+ "translation": "translation",
19
+ "summarization": "summarization",
20
+ "image_binary_classification": "image-classification",
21
+ "image_multi_class_classification": "image-classification",
22
+ "text_zero_shot_classification": "text-generation",
23
+ }
24
+
25
+
26
+ HUB_TASK_TO_AUTOTRAIN_TASK = {v: k for k, v in AUTOTRAIN_TASK_TO_HUB_TASK.items()}
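+ # NB: the task mapping above is many-to-one, so this inversion keeps only the last
+ # AutoTrain task listed for each Hub task (e.g. "text-classification" maps back to
+ # "natural_language_inference").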
27
+ LOGS_REPO = "evaluation-job-logs"
28
+
29
+
30
+ def get_auth_headers(token: str, prefix: str = "Bearer"):
31
+ return {"Authorization": f"{prefix} {token}"}
32
+
33
+
34
+ def http_post(path: str, token: str, payload=None, domain: str = None, params=None) -> requests.Response:
35
+ """HTTP POST request to the AutoNLP API, raises UnreachableAPIError if the API cannot be reached"""
36
+ try:
37
+ response = requests.post(
38
+ url=domain + path,
39
+ json=payload,
40
+ headers=get_auth_headers(token=token),
41
+ allow_redirects=True,
42
+ params=params,
43
+ )
44
+ except requests.exceptions.ConnectionError:
45
+ print("❌ Failed to reach AutoNLP API, check your internet connection")
46
+ response.raise_for_status()
47
+ return response
48
+
49
+
50
+ def http_get(path: str, domain: str, token: str = None, params: dict = None) -> requests.Response:
51
+ """HTTP POST request to `path`, raises UnreachableAPIError if the API cannot be reached"""
52
+ try:
53
+ response = requests.get(
54
+ url=domain + path,
55
+ headers=get_auth_headers(token=token),
56
+ allow_redirects=True,
57
+ params=params,
58
+ )
59
+ except requests.exceptions.ConnectionError:
60
+ print(f"❌ Failed to reach {path}, check your internet connection")
61
+ response.raise_for_status()
62
+ return response
63
+
64
+
65
+ def get_metadata(dataset_name: str, token: str) -> Union[Dict, None]:
66
+ data = dataset_info(dataset_name, token=token)
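+ # "train-eval-index" entries in the dataset card describe per-config evaluation
+ # setups; get_config_metadata below selects one by its "config" key.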
67
+ if data.cardData is not None and "train-eval-index" in data.cardData.keys():
68
+ return data.cardData["train-eval-index"]
69
+ else:
70
+ return None
71
+
72
+
73
+ def get_compatible_models(task: str, dataset_ids: List[str]) -> List[str]:
74
+ """
75
+ Returns all model IDs that are compatible with the given task and dataset names.
76
+
77
+ Args:
78
+ task (`str`): The task to search for.
79
+ dataset_ids (`List[str]`): A list of dataset IDs to search for.
80
+
81
+ Returns:
82
+ A list of model IDs, sorted alphabetically.
83
+ """
84
+ compatible_models = []
85
+ # Allow any summarization model to be used for summarization tasks
86
+ # and allow any text-generation model to be used for text_zero_shot_classification
87
+ if task in ("summarization", "text_zero_shot_classification"):
88
+ model_filter = ModelFilter(
89
+ task=AUTOTRAIN_TASK_TO_HUB_TASK[task],
90
+ library=["transformers", "pytorch"],
91
+ )
92
+ compatible_models.extend(HfApi().list_models(filter=model_filter))
93
+ # Include models trained on SQuAD datasets, since these can be evaluated on
94
+ # other SQuAD-like datasets
95
+ if task == "extractive_question_answering":
96
+ dataset_ids.extend(["squad", "squad_v2"])
97
+
98
+ # TODO: relax filter on PyTorch models if TensorFlow supported in AutoTrain
99
+ for dataset_id in dataset_ids:
100
+ model_filter = ModelFilter(
101
+ task=AUTOTRAIN_TASK_TO_HUB_TASK[task],
102
+ trained_dataset=dataset_id,
103
+ library=["transformers", "pytorch"],
104
+ )
105
+ compatible_models.extend(HfApi().list_models(filter=model_filter))
106
+ return sorted(set([model.modelId for model in compatible_models]))
107
+
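+ # Usage sketch (hypothetical inputs): list text-classification models trained on imdb,
+ # e.g. get_compatible_models("binary_classification", ["imdb"]).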
108
+
109
+ def get_key(col_mapping, val):
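+ # reverse lookup: return the dataset column that was mapped to `val`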
110
+ for key, value in col_mapping.items():
111
+ if val == value:
112
+ return key
113
+
114
+ return "key doesn't exist"
115
+
116
+
117
+ def format_col_mapping(col_mapping: dict) -> dict:
118
+ for k, v in col_mapping["answers"].items():
119
+ col_mapping[f"answers.{k}"] = f"answers.{v}"
120
+ del col_mapping["answers"]
121
+ return col_mapping
122
+
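+ # Example: a SQuAD-style mapping {"answers": {"text": "text", "answer_start": "answer_start"}}
+ # is flattened to {"answers.text": "answers.text", "answers.answer_start": "answers.answer_start"}.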
123
+
124
+ def commit_evaluation_log(evaluation_log, hf_access_token=None):
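+ # Clone/pull the private logs repo, append the new entry to logs.jsonl, rewrite the
+ # file, and push the result back to the Hub.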
125
+ logs_repo_url = f"https://huggingface.co/datasets/autoevaluate/{LOGS_REPO}"
126
+ logs_repo = Repository(
127
+ local_dir=LOGS_REPO,
128
+ clone_from=logs_repo_url,
129
+ repo_type="dataset",
130
+ private=True,
131
+ use_auth_token=hf_access_token,
132
+ )
133
+ logs_repo.git_pull()
134
+ with jsonlines.open(f"{LOGS_REPO}/logs.jsonl") as r:
135
+ lines = []
136
+ for obj in r:
137
+ lines.append(obj)
138
+
139
+ lines.append(evaluation_log)
140
+ with jsonlines.open(f"{LOGS_REPO}/logs.jsonl", mode="w") as writer:
141
+ for job in lines:
142
+ writer.write(job)
143
+ logs_repo.push_to_hub(
144
+ commit_message=f"Evaluation submitted with project name {evaluation_log['payload']['proj_name']}"
145
+ )
146
+ print("INFO -- Pushed evaluation logs to the Hub")
147
+
148
+
149
+ @st.experimental_memo
150
+ def get_supported_metrics():
151
+ """Helper function to get all metrics compatible with evaluation service.
152
+
153
+ Requires all metric dependencies installed in the same environment, so wait until
154
+ https://github.com/huggingface/evaluate/issues/138 is resolved before using this.
155
+ """
156
+ metrics = [metric.id for metric in list_metrics()]
157
+ supported_metrics = []
158
+ for metric in tqdm(metrics):
159
+ # TODO: this currently requires all metric dependencies to be installed
160
+ # in the same environment. Refactor to avoid needing to actually load
161
+ # the metric.
162
+ try:
163
+ print(f"INFO -- Attempting to load metric: {metric}")
164
+ metric_func = load(metric)
165
+ except Exception as e:
166
+ print(e)
167
+ print("WARNING -- Skipping the following metric, which cannot load:", metric)
168
+ continue
169
+
170
+ argspec = inspect.getfullargspec(metric_func.compute)
171
+ if "references" in argspec.kwonlyargs and "predictions" in argspec.kwonlyargs:
172
+ # We require that "references" and "predictions" are arguments
173
+ # to the metric function. We also require that the other arguments
174
+ # besides "references" and "predictions" have defaults and so do not
175
+ # need to be specified explicitly.
176
+ defaults = True
177
+ # kwonlydefaults is None when no keyword-only argument has a default
+ for key, value in (argspec.kwonlydefaults or {}).items():
178
+ if key not in ("references", "predictions"):
179
+ if value is None:
180
+ defaults = False
181
+ break
182
+
183
+ if defaults:
184
+ supported_metrics.append(metric)
185
+ return supported_metrics
186
+
187
+
188
+ def get_dataset_card_url(dataset_id: str) -> str:
189
+ """Gets the URL to edit the dataset card for the given dataset ID."""
190
+ if "/" in dataset_id:
191
+ return f"https://huggingface.co/datasets/{dataset_id}/edit/main/README.md"
192
+ else:
193
+ return f"https://github.com/huggingface/datasets/edit/master/datasets/{dataset_id}/README.md"
194
+
195
+
196
+ def create_autotrain_project_name(dataset_id: str, dataset_config: str) -> str:
197
+ """Creates an AutoTrain project name for the given dataset ID."""
198
+ # Project names cannot have "/", so we need to format community datasets accordingly
199
+ dataset_id_formatted = dataset_id.replace("/", "__")
200
+ dataset_config_formatted = dataset_config.replace("--", "__")
201
+ # Project names need to be unique, so we append a random string to guarantee this while adhering to naming rules
202
+ basename = f"eval-{dataset_id_formatted}-{dataset_config_formatted}"
203
+ basename = basename[:60] if len(basename) > 60 else basename # Hub naming limitation
204
+ return f"{basename}-{str(uuid.uuid4())[:6]}"
205
+
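+ # Example (hypothetical inputs): create_autotrain_project_name("user/my-data", "my--config")
+ # -> "eval-user__my-data-my__config-1a2b3c", where the 6-char suffix is random.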
206
+
207
+ def get_config_metadata(config: str, metadata: List[Dict] = None) -> Union[Dict, None]:
208
+ """Gets the dataset card metadata for the given config."""
209
+ if metadata is None:
210
+ return None
211
+ config_metadata = [m for m in metadata if m["config"] == config]
212
+ if len(config_metadata) >= 1:
213
+ return config_metadata[0]
214
+ else:
215
+ return None