Commit 3ce0948: Duplicate from autoevaluate/model-evaluator

Co-authored-by: Lewis Tunstall <[email protected]>
- .env.template +4 -0
- .github/workflows/check_filesize.yml +16 -0
- .github/workflows/quality.yml +29 -0
- .github/workflows/run_evaluation_jobs.yml +30 -0
- .github/workflows/sync_with_spaces.yml +20 -0
- .gitignore +134 -0
- LICENSE +201 -0
- Makefile +8 -0
- README.md +114 -0
- app.py +693 -0
- evaluation.py +57 -0
- images/autotrain_job.png +0 -0
- images/autotrain_projects.png +0 -0
- notebooks/flush-prediction-repos.ipynb +177 -0
- pyproject.toml +2 -0
- requirements.txt +12 -0
- run_evaluation_jobs.py +64 -0
- utils.py +215 -0
.env.template
ADDED
@@ -0,0 +1,4 @@
+AUTOTRAIN_USERNAME=autoevaluator # The bot or user that authors evaluation jobs
+HF_TOKEN=hf_xxx # An API token of the `autoevaluator` user
+AUTOTRAIN_BACKEND_API=https://api-staging.autotrain.huggingface.co # The AutoTrain backend to send jobs to. Use https://api.autotrain.huggingface.co for prod or http://localhost:8000 for local development
+DATASETS_PREVIEW_API=https://datasets-server.huggingface.co # The API to grab dataset information from
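These variables are consumed at startup by `app.py` (shown later in this diff) via `python-dotenv`; a minimal sketch of that loading pattern:

```python
import os
from pathlib import Path

from dotenv import load_dotenv

# Same pattern as app.py: load .env if present, then read the variables
if Path(".env").is_file():
    load_dotenv(".env")

HF_TOKEN = os.getenv("HF_TOKEN")
AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME")
AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")
DATASETS_PREVIEW_API = os.getenv("DATASETS_PREVIEW_API")
```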
.github/workflows/check_filesize.yml
ADDED
@@ -0,0 +1,16 @@
+name: Check file size
+on: # or directly `on: [push]` to run the action on every push on any branch
+  pull_request:
+    branches: [main]
+
+  # to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+jobs:
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check large files
+        uses: ActionsDesk/[email protected]
+        with:
+          filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
.github/workflows/quality.yml
ADDED
@@ -0,0 +1,29 @@
+name: Code quality
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+
+  check_code_quality:
+    name: Check code quality
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+      - name: Setup Python environment
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.9
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install black isort flake8
+      - name: Code quality
+        run: |
+          make quality
.github/workflows/run_evaluation_jobs.yml
ADDED
@@ -0,0 +1,30 @@
+name: Start evaluation jobs
+
+on:
+  schedule:
+    - cron: '*/15 * * * *' # Start evaluations every 15th minute
+
+jobs:
+
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+
+      - name: Setup Python Environment
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.8
+
+      - name: Install requirements
+        run: pip install -r requirements.txt
+
+      - name: Execute scoring script
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          AUTOTRAIN_USERNAME: ${{ secrets.AUTOTRAIN_USERNAME }}
+          AUTOTRAIN_BACKEND_API: ${{ secrets.AUTOTRAIN_BACKEND_API }}
+        run: |
+          HF_TOKEN=$HF_TOKEN AUTOTRAIN_USERNAME=$AUTOTRAIN_USERNAME AUTOTRAIN_BACKEND_API=$AUTOTRAIN_BACKEND_API python run_evaluation_jobs.py
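Note that the `env:` block already exports these three secrets to the step's environment, so the inline `VAR=$VAR` prefixes on the final command are redundant but harmless; the net effect is that `run_evaluation_jobs.py` reads `HF_TOKEN`, `AUTOTRAIN_USERNAME`, and `AUTOTRAIN_BACKEND_API` from its environment, which is why the same command also runs locally once those variables are set (see the README below).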
.github/workflows/sync_with_spaces.yml
ADDED
@@ -0,0 +1,20 @@
+name: Sync to Hugging Face hub
+on:
+  push:
+    branches: [main]
+
+  # to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+jobs:
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Push to hub
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          git push https://lewtun:[email protected]/spaces/autoevaluate/model-evaluator main
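Note that the push URL hardcodes the `lewtun` Hub username alongside the `HF_TOKEN` secret; anyone duplicating this workflow would need to substitute the username that owns the token and point the URL at their own Space.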
.gitignore
ADDED
@@ -0,0 +1,134 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+scratch/
+
+# Evaluation job logs
+evaluation-job-logs/
LICENSE
ADDED
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
Makefile
ADDED
@@ -0,0 +1,8 @@
+style:
+	python -m black --line-length 119 --target-version py39 .
+	python -m isort .
+
+quality:
+	python -m black --check --line-length 119 --target-version py39 .
+	python -m isort --check-only .
+	python -m flake8 --max-line-length 119
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Model Evaluator
|
3 |
+
emoji: 📊
|
4 |
+
colorFrom: red
|
5 |
+
colorTo: red
|
6 |
+
sdk: streamlit
|
7 |
+
sdk_version: 1.10.0
|
8 |
+
app_file: app.py
|
9 |
+
duplicated_from: autoevaluate/model-evaluator
|
10 |
+
---
|
11 |
+
|
12 |
+
# Model Evaluator
|
13 |
+
|
14 |
+
> Submit evaluation jobs to AutoTrain from the Hugging Face Hub
|
15 |
+
|
16 |
+
## Supported tasks
|
17 |
+
|
18 |
+
The table below shows which tasks are currently supported for evaluation in the AutoTrain backend:
|
19 |
+
|
20 |
+
| Task | Supported |
|
21 |
+
|:-----------------------------------|:---------:|
|
22 |
+
| `binary_classification` | ✅ |
|
23 |
+
| `multi_class_classification` | ✅ |
|
24 |
+
| `multi_label_classification` | ❌ |
|
25 |
+
| `entity_extraction` | ✅ |
|
26 |
+
| `extractive_question_answering` | ✅ |
|
27 |
+
| `translation` | ✅ |
|
28 |
+
| `summarization` | ✅ |
|
29 |
+
| `image_binary_classification` | ✅ |
|
30 |
+
| `image_multi_class_classification` | ✅ |
|
31 |
+
| `text_zero_shot_evaluation` | ✅ |
|
32 |
+
|
33 |
+
|
34 |
+
## Installation
|
35 |
+
|
36 |
+
To run the application locally, first clone this repository and install the dependencies as follows:
|
37 |
+
|
38 |
+
```
|
39 |
+
pip install -r requirements.txt
|
40 |
+
```
|
41 |
+
|
42 |
+
Next, copy the example file of environment variables:
|
43 |
+
|
44 |
+
```
|
45 |
+
cp .env.template .env
|
46 |
+
```
|
47 |
+
|
48 |
+
and set the `HF_TOKEN` variable with a valid API token from the [`autoevaluator`](https://huggingface.co/autoevaluator) bot user. Finally, spin up the application by running:
|
49 |
+
|
50 |
+
```
|
51 |
+
streamlit run app.py
|
52 |
+
```
|
53 |
+
|
54 |
+
## Usage
|
55 |
+
|
56 |
+
Evaluation on the Hub involves two main steps:
|
57 |
+
|
58 |
+
1. Submitting an evaluation job via the UI. This creates an AutoTrain project with `N` models for evaluation. At this stage, the dataset is also processed and prepared for evaluation.
|
59 |
+
2. Triggering the evaluation itself once the dataset is processed.
|
60 |
+
|
61 |
+
From the user perspective, only step (1) is needed since step (2) is handled by a cron job on GitHub Actions that executes the `run_evaluation_jobs.py` script every 15 minutes.
|
62 |
+
|
63 |
+
See below for details on manually triggering evaluation jobs.
|
64 |
+
|
65 |
+
### Triggering an evaluation
|
66 |
+
|
67 |
+
To evaluate the models in an AutoTrain project, run:
|
68 |
+
|
69 |
+
```
|
70 |
+
python run_evaluation_jobs.py
|
71 |
+
```
|
72 |
+
|
73 |
+
This will download the [`autoevaluate/evaluation-job-logs`](https://huggingface.co/datasets/autoevaluate/evaluation-job-logs) dataset from the Hub and check which evaluation projects are ready for evaluation (i.e. those whose dataset has been processed).
|
74 |
+
|
75 |
+
## AutoTrain configuration details
|
76 |
+
|
77 |
+
Models are evaluated by the [`autoevaluator`](https://huggingface.co/autoevaluator) bot user in AutoTrain, with the payload sent to the `AUTOTRAIN_BACKEND_API` environment variable. Evaluation projects are created and run on either the `prod` or `staging` environments. You can view the status of projects in the AutoTrain UI by navigating to one of the links below (ask internally for access to the staging UI):
|
78 |
+
|
79 |
+
| AutoTrain environment | AutoTrain UI URL | `AUTOTRAIN_BACKEND_API` |
|
80 |
+
|:---------------------:|:--------------------------------------------------------------------------------------------------------------:|:--------------------------------------------:|
|
81 |
+
| `prod` | [`https://ui.autotrain.huggingface.co/projects`](https://ui.autotrain.huggingface.co/projects) | https://api.autotrain.huggingface.co |
|
82 |
+
| `staging` | [`https://ui-staging.autotrain.huggingface.co/projects`](https://ui-staging.autotrain.huggingface.co/projects) | https://api-staging.autotrain.huggingface.co |
|
83 |
+
|
84 |
+
|
85 |
+
The current configuration for evaluation jobs running on [Spaces](https://huggingface.co/spaces/autoevaluate/model-evaluator) is:
|
86 |
+
|
87 |
+
```
|
88 |
+
AUTOTRAIN_BACKEND_API=https://api.autotrain.huggingface.co
|
89 |
+
```
|
90 |
+
|
91 |
+
To evaluate models with a _local_ instance of AutoTrain, change the environment to:
|
92 |
+
|
93 |
+
```
|
94 |
+
AUTOTRAIN_BACKEND_API=http://localhost:8000
|
95 |
+
```
|
96 |
+
|
97 |
+
### Migrating from staging to production (and vice versa)
|
98 |
+
|
99 |
+
In general, evaluation jobs should run in AutoTrain's `prod` environment, which is defined by the following environment variable:
|
100 |
+
|
101 |
+
```
|
102 |
+
AUTOTRAIN_BACKEND_API=https://api.autotrain.huggingface.co
|
103 |
+
```
|
104 |
+
|
105 |
+
However, there are times when it is necessary to run evaluation jobs in AutoTrain's `staging` environment (e.g. because a new evaluation pipeline is being deployed). In these cases the corresponding environement variable is:
|
106 |
+
|
107 |
+
```
|
108 |
+
AUTOTRAIN_BACKEND_API=https://api-staging.autotrain.huggingface.co
|
109 |
+
```
|
110 |
+
|
111 |
+
To migrate between these two environments, update the `AUTOTRAIN_BACKEND_API` in two places:
|
112 |
+
|
113 |
+
* In the [repo secrets](https://huggingface.co/spaces/autoevaluate/model-evaluator/settings) associated with the `model-evaluator` Space. This will ensure evaluation projects are created in the desired environment.
|
114 |
+
* In the [GitHub Actions secrets](https://github.com/huggingface/model-evaluator/settings/secrets/actions) associated with this repo. This will ensure that the correct evaluation jobs are approved and launched via the `run_evaluation_jobs.py` script.
|
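The 1-click evaluation metadata referred to above is the `train-eval-index` block that `app.py` (next in this diff) generates after a successful submission. A sketch of its shape, where the config, split, and column names below are placeholders rather than values from this commit:

```yaml
train-eval-index:
- config: default                 # dataset config name (placeholder)
  task: text-classification      # corresponding Hub task (placeholder)
  task_id: binary_classification # AutoTrain task id
  splits:
    eval_split: test             # split used for evaluation (placeholder)
  col_mapping:                   # your column name -> standardised name
    sentence: text               # placeholder
    label: target                # placeholder
```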
app.py
ADDED
@@ -0,0 +1,693 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import time
|
3 |
+
from pathlib import Path
|
4 |
+
|
5 |
+
import pandas as pd
|
6 |
+
import streamlit as st
|
7 |
+
import yaml
|
8 |
+
from datasets import get_dataset_config_names
|
9 |
+
from dotenv import load_dotenv
|
10 |
+
from huggingface_hub import list_datasets
|
11 |
+
|
12 |
+
from evaluation import filter_evaluated_models
|
13 |
+
from utils import (
|
14 |
+
AUTOTRAIN_TASK_TO_HUB_TASK,
|
15 |
+
commit_evaluation_log,
|
16 |
+
create_autotrain_project_name,
|
17 |
+
format_col_mapping,
|
18 |
+
get_compatible_models,
|
19 |
+
get_config_metadata,
|
20 |
+
get_dataset_card_url,
|
21 |
+
get_key,
|
22 |
+
get_metadata,
|
23 |
+
http_get,
|
24 |
+
http_post,
|
25 |
+
)
|
26 |
+
|
27 |
+
if Path(".env").is_file():
|
28 |
+
load_dotenv(".env")
|
29 |
+
|
30 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
31 |
+
AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME")
|
32 |
+
AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")
|
33 |
+
DATASETS_PREVIEW_API = os.getenv("DATASETS_PREVIEW_API")
|
34 |
+
|
35 |
+
# Put image tasks on top
|
36 |
+
TASK_TO_ID = {
|
37 |
+
"image_binary_classification": 17,
|
38 |
+
"image_multi_class_classification": 18,
|
39 |
+
"binary_classification": 1,
|
40 |
+
"multi_class_classification": 2,
|
41 |
+
"natural_language_inference": 22,
|
42 |
+
"entity_extraction": 4,
|
43 |
+
"extractive_question_answering": 5,
|
44 |
+
"translation": 6,
|
45 |
+
"summarization": 8,
|
46 |
+
"text_zero_shot_classification": 23,
|
47 |
+
}
|
48 |
+
|
49 |
+
TASK_TO_DEFAULT_METRICS = {
|
50 |
+
"binary_classification": ["f1", "precision", "recall", "auc", "accuracy"],
|
51 |
+
"multi_class_classification": [
|
52 |
+
"f1",
|
53 |
+
"precision",
|
54 |
+
"recall",
|
55 |
+
"accuracy",
|
56 |
+
],
|
57 |
+
"natural_language_inference": ["f1", "precision", "recall", "auc", "accuracy"],
|
58 |
+
"entity_extraction": ["precision", "recall", "f1", "accuracy"],
|
59 |
+
"extractive_question_answering": ["f1", "exact_match"],
|
60 |
+
"translation": ["sacrebleu"],
|
61 |
+
"summarization": ["rouge1", "rouge2", "rougeL", "rougeLsum"],
|
62 |
+
"image_binary_classification": ["f1", "precision", "recall", "auc", "accuracy"],
|
63 |
+
"image_multi_class_classification": [
|
64 |
+
"f1",
|
65 |
+
"precision",
|
66 |
+
"recall",
|
67 |
+
"accuracy",
|
68 |
+
],
|
69 |
+
"text_zero_shot_classification": ["accuracy", "loss"],
|
70 |
+
}
|
71 |
+
|
72 |
+
AUTOTRAIN_TASK_TO_LANG = {
|
73 |
+
"translation": "en2de",
|
74 |
+
"image_binary_classification": "unk",
|
75 |
+
"image_multi_class_classification": "unk",
|
76 |
+
}
|
77 |
+
|
78 |
+
AUTOTRAIN_MACHINE = {"text_zero_shot_classification": "r5.16x"}
|
79 |
+
|
80 |
+
|
81 |
+
SUPPORTED_TASKS = list(TASK_TO_ID.keys())
|
82 |
+
|
83 |
+
# Extracted from utils.get_supported_metrics
|
84 |
+
# Hardcoded for now due to speed / caching constraints
|
85 |
+
SUPPORTED_METRICS = [
|
86 |
+
"accuracy",
|
87 |
+
"bertscore",
|
88 |
+
"bleu",
|
89 |
+
"cer",
|
90 |
+
"chrf",
|
91 |
+
"code_eval",
|
92 |
+
"comet",
|
93 |
+
"competition_math",
|
94 |
+
"coval",
|
95 |
+
"cuad",
|
96 |
+
"exact_match",
|
97 |
+
"f1",
|
98 |
+
"frugalscore",
|
99 |
+
"google_bleu",
|
100 |
+
"mae",
|
101 |
+
"mahalanobis",
|
102 |
+
"matthews_correlation",
|
103 |
+
"mean_iou",
|
104 |
+
"meteor",
|
105 |
+
"mse",
|
106 |
+
"pearsonr",
|
107 |
+
"perplexity",
|
108 |
+
"precision",
|
109 |
+
"recall",
|
110 |
+
"roc_auc",
|
111 |
+
"rouge",
|
112 |
+
"sacrebleu",
|
113 |
+
"sari",
|
114 |
+
"seqeval",
|
115 |
+
"spearmanr",
|
116 |
+
"squad",
|
117 |
+
"squad_v2",
|
118 |
+
"ter",
|
119 |
+
"trec_eval",
|
120 |
+
"wer",
|
121 |
+
"wiki_split",
|
122 |
+
"xnli",
|
123 |
+
"angelina-wang/directional_bias_amplification",
|
124 |
+
"jordyvl/ece",
|
125 |
+
"lvwerra/ai4code",
|
126 |
+
"lvwerra/amex",
|
127 |
+
]
|
128 |
+
|
129 |
+
|
130 |
+
#######
|
131 |
+
# APP #
|
132 |
+
#######
|
133 |
+
st.title("Evaluation on the Hub")
|
134 |
+
st.markdown(
|
135 |
+
"""
|
136 |
+
Welcome to Hugging Face's automatic model evaluator 👋!
|
137 |
+
|
138 |
+
This application allows you to evaluate 🤗 Transformers
|
139 |
+
[models](https://huggingface.co/models?library=transformers&sort=downloads)
|
140 |
+
across a wide variety of [datasets](https://huggingface.co/datasets) on the
|
141 |
+
Hub. Please select the dataset and configuration below. The results of your
|
142 |
+
evaluation will be displayed on the [public
|
143 |
+
leaderboards](https://huggingface.co/spaces/autoevaluate/leaderboards). For
|
144 |
+
more details, check out out our [blog
|
145 |
+
post](https://huggingface.co/blog/eval-on-the-hub).
|
146 |
+
"""
|
147 |
+
)
|
148 |
+
|
149 |
+
all_datasets = [d.id for d in list_datasets()]
|
150 |
+
query_params = st.experimental_get_query_params()
|
151 |
+
if "first_query_params" not in st.session_state:
|
152 |
+
st.session_state.first_query_params = query_params
|
153 |
+
first_query_params = st.session_state.first_query_params
|
154 |
+
default_dataset = all_datasets[0]
|
155 |
+
if "dataset" in first_query_params:
|
156 |
+
if len(first_query_params["dataset"]) > 0 and first_query_params["dataset"][0] in all_datasets:
|
157 |
+
default_dataset = first_query_params["dataset"][0]
|
158 |
+
|
159 |
+
selected_dataset = st.selectbox(
|
160 |
+
"Select a dataset",
|
161 |
+
all_datasets,
|
162 |
+
index=all_datasets.index(default_dataset),
|
163 |
+
help="""Datasets with metadata can be evaluated with 1-click. Configure an evaluation job to add \
|
164 |
+
new metadata to a dataset card.""",
|
165 |
+
)
|
166 |
+
st.experimental_set_query_params(**{"dataset": [selected_dataset]})
|
167 |
+
|
168 |
+
# Check if selected dataset can be streamed
|
169 |
+
is_valid_dataset = http_get(
|
170 |
+
path="/is-valid",
|
171 |
+
domain=DATASETS_PREVIEW_API,
|
172 |
+
params={"dataset": selected_dataset},
|
173 |
+
).json()
|
174 |
+
if is_valid_dataset["valid"] is False:
|
175 |
+
st.error(
|
176 |
+
"""The dataset you selected is not currently supported. Open a \
|
177 |
+
[discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) for support."""
|
178 |
+
)
|
179 |
+
|
180 |
+
metadata = get_metadata(selected_dataset, token=HF_TOKEN)
|
181 |
+
print(f"INFO -- Dataset metadata: {metadata}")
|
182 |
+
if metadata is None:
|
183 |
+
st.warning("No evaluation metadata found. Please configure the evaluation job below.")
|
184 |
+
|
185 |
+
with st.expander("Advanced configuration"):
|
186 |
+
# Select task
|
187 |
+
selected_task = st.selectbox(
|
188 |
+
"Select a task",
|
189 |
+
SUPPORTED_TASKS,
|
190 |
+
index=SUPPORTED_TASKS.index(metadata[0]["task_id"]) if metadata is not None else 0,
|
191 |
+
help="""Don't see your favourite task here? Open a \
|
192 |
+
[discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) to request it!""",
|
193 |
+
)
|
194 |
+
# Select config
|
195 |
+
configs = get_dataset_config_names(selected_dataset)
|
196 |
+
selected_config = st.selectbox(
|
197 |
+
"Select a config",
|
198 |
+
configs,
|
199 |
+
help="""Some datasets contain several sub-datasets, known as _configurations_. \
|
200 |
+
Select one to evaluate your models on. \
|
201 |
+
See the [docs](https://huggingface.co/docs/datasets/master/en/load_hub#configurations) for more details.
|
202 |
+
""",
|
203 |
+
)
|
204 |
+
# Some datasets have multiple metadata (one per config), so we grab the one associated with the selected config
|
205 |
+
config_metadata = get_config_metadata(selected_config, metadata)
|
206 |
+
print(f"INFO -- Config metadata: {config_metadata}")
|
207 |
+
|
208 |
+
# Select splits
|
209 |
+
splits_resp = http_get(
|
210 |
+
path="/splits",
|
211 |
+
domain=DATASETS_PREVIEW_API,
|
212 |
+
params={"dataset": selected_dataset},
|
213 |
+
)
|
214 |
+
if splits_resp.status_code == 200:
|
215 |
+
split_names = []
|
216 |
+
all_splits = splits_resp.json()
|
217 |
+
for split in all_splits["splits"]:
|
218 |
+
if split["config"] == selected_config:
|
219 |
+
split_names.append(split["split"])
|
220 |
+
|
221 |
+
if config_metadata is not None:
|
222 |
+
eval_split = config_metadata["splits"].get("eval_split", None)
|
223 |
+
else:
|
224 |
+
eval_split = None
|
225 |
+
selected_split = st.selectbox(
|
226 |
+
"Select a split",
|
227 |
+
split_names,
|
228 |
+
index=split_names.index(eval_split) if eval_split is not None else 0,
|
229 |
+
help="Be wary when evaluating models on the `train` split.",
|
230 |
+
)
|
231 |
+
|
232 |
+
# Select columns
|
233 |
+
rows_resp = http_get(
|
234 |
+
path="/first-rows",
|
235 |
+
domain=DATASETS_PREVIEW_API,
|
236 |
+
params={
|
237 |
+
"dataset": selected_dataset,
|
238 |
+
"config": selected_config,
|
239 |
+
"split": selected_split,
|
240 |
+
},
|
241 |
+
).json()
|
242 |
+
col_names = list(pd.json_normalize(rows_resp["rows"][0]["row"]).columns)
|
243 |
+
|
244 |
+
st.markdown("**Map your dataset columns**")
|
245 |
+
st.markdown(
|
246 |
+
"""The model evaluator uses a standardised set of column names for the input examples and labels. \
|
247 |
+
Please define the mapping between your dataset columns (right) and the standardised column names (left)."""
|
248 |
+
)
|
249 |
+
col1, col2 = st.columns(2)
|
250 |
+
|
251 |
+
# TODO: find a better way to layout these items
|
252 |
+
# TODO: need graceful way of handling dataset <--> task mismatch for datasets with metadata
|
253 |
+
col_mapping = {}
|
254 |
+
if selected_task in ["binary_classification", "multi_class_classification"]:
|
255 |
+
with col1:
|
256 |
+
st.markdown("`text` column")
|
257 |
+
st.text("")
|
258 |
+
st.text("")
|
259 |
+
st.text("")
|
260 |
+
st.text("")
|
261 |
+
st.markdown("`target` column")
|
262 |
+
with col2:
|
263 |
+
text_col = st.selectbox(
|
264 |
+
"This column should contain the text to be classified",
|
265 |
+
col_names,
|
266 |
+
index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
|
267 |
+
if config_metadata is not None
|
268 |
+
else 0,
|
269 |
+
)
|
270 |
+
target_col = st.selectbox(
|
271 |
+
"This column should contain the labels associated with the text",
|
272 |
+
col_names,
|
273 |
+
index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
|
274 |
+
if config_metadata is not None
|
275 |
+
else 0,
|
276 |
+
)
|
277 |
+
col_mapping[text_col] = "text"
|
278 |
+
col_mapping[target_col] = "target"
|
279 |
+
|
280 |
+
elif selected_task == "text_zero_shot_classification":
|
281 |
+
with col1:
|
282 |
+
st.markdown("`text` column")
|
283 |
+
st.text("")
|
284 |
+
st.text("")
|
285 |
+
st.text("")
|
286 |
+
st.text("")
|
287 |
+
st.markdown("`classes` column")
|
288 |
+
st.text("")
|
289 |
+
st.text("")
|
290 |
+
st.text("")
|
291 |
+
st.text("")
|
292 |
+
st.markdown("`target` column")
|
293 |
+
with col2:
|
294 |
+
text_col = st.selectbox(
|
295 |
+
"This column should contain the text to be classified",
|
296 |
+
col_names,
|
297 |
+
index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
|
298 |
+
if config_metadata is not None
|
299 |
+
else 0,
|
300 |
+
)
|
301 |
+
classes_col = st.selectbox(
|
302 |
+
"This column should contain the classes associated with the text",
|
303 |
+
col_names,
|
304 |
+
index=col_names.index(get_key(config_metadata["col_mapping"], "classes"))
|
305 |
+
if config_metadata is not None
|
306 |
+
else 0,
|
307 |
+
)
|
308 |
+
target_col = st.selectbox(
|
309 |
+
"This column should contain the index of the correct class",
|
310 |
+
col_names,
|
311 |
+
index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
|
312 |
+
if config_metadata is not None
|
313 |
+
else 0,
|
314 |
+
)
|
315 |
+
col_mapping[text_col] = "text"
|
316 |
+
col_mapping[classes_col] = "classes"
|
317 |
+
col_mapping[target_col] = "target"
|
318 |
+
|
319 |
+
if selected_task in ["natural_language_inference"]:
|
320 |
+
config_metadata = get_config_metadata(selected_config, metadata)
|
321 |
+
with col1:
|
322 |
+
st.markdown("`text1` column")
|
323 |
+
st.text("")
|
324 |
+
st.text("")
|
325 |
+
st.text("")
|
326 |
+
st.text("")
|
327 |
+
st.text("")
|
328 |
+
st.markdown("`text2` column")
|
329 |
+
st.text("")
|
330 |
+
st.text("")
|
331 |
+
st.text("")
|
332 |
+
st.text("")
|
333 |
+
st.text("")
|
334 |
+
st.markdown("`target` column")
|
335 |
+
with col2:
|
336 |
+
text1_col = st.selectbox(
|
337 |
+
"This column should contain the first text passage to be classified",
|
338 |
+
col_names,
|
339 |
+
index=col_names.index(get_key(config_metadata["col_mapping"], "text1"))
|
340 |
+
if config_metadata is not None
|
341 |
+
else 0,
|
342 |
+
)
|
343 |
+
text2_col = st.selectbox(
|
344 |
+
"This column should contain the second text passage to be classified",
|
345 |
+
col_names,
|
346 |
+
index=col_names.index(get_key(config_metadata["col_mapping"], "text2"))
|
347 |
+
if config_metadata is not None
|
348 |
+
else 0,
|
349 |
+
)
|
350 |
+
target_col = st.selectbox(
|
351 |
+
"This column should contain the labels associated with the text",
|
352 |
+
col_names,
|
353 |
+
index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
|
354 |
+
if config_metadata is not None
|
355 |
+
else 0,
|
356 |
+
)
|
357 |
+
col_mapping[text1_col] = "text1"
|
358 |
+
col_mapping[text2_col] = "text2"
|
359 |
+
col_mapping[target_col] = "target"
|
360 |
+
|
361 |
+
elif selected_task == "entity_extraction":
|
362 |
+
with col1:
|
363 |
+
st.markdown("`tokens` column")
|
364 |
+
st.text("")
|
365 |
+
st.text("")
|
366 |
+
st.text("")
|
367 |
+
st.text("")
|
368 |
+
st.markdown("`tags` column")
|
369 |
+
with col2:
|
370 |
+
tokens_col = st.selectbox(
|
371 |
+
"This column should contain the array of tokens to be classified",
|
372 |
+
col_names,
|
373 |
+
index=col_names.index(get_key(config_metadata["col_mapping"], "tokens"))
|
374 |
+
if config_metadata is not None
|
375 |
+
else 0,
|
376 |
+
)
|
377 |
+
tags_col = st.selectbox(
|
378 |
+
"This column should contain the labels associated with each part of the text",
|
379 |
+
col_names,
|
380 |
+
index=col_names.index(get_key(config_metadata["col_mapping"], "tags"))
|
381 |
+
if config_metadata is not None
|
382 |
+
else 0,
|
383 |
+
)
|
384 |
+
col_mapping[tokens_col] = "tokens"
|
385 |
+
col_mapping[tags_col] = "tags"
|
386 |
+
|
387 |
+
elif selected_task == "translation":
|
388 |
+
with col1:
|
389 |
+
st.markdown("`source` column")
|
390 |
+
st.text("")
|
391 |
+
st.text("")
|
392 |
+
st.text("")
|
393 |
+
st.text("")
|
394 |
+
st.markdown("`target` column")
|
395 |
+
with col2:
|
396 |
+
text_col = st.selectbox(
|
397 |
+
"This column should contain the text to be translated",
|
398 |
+
col_names,
|
399 |
+
index=col_names.index(get_key(config_metadata["col_mapping"], "source"))
|
400 |
+
if config_metadata is not None
|
401 |
+
else 0,
|
402 |
+
)
|
403 |
+
target_col = st.selectbox(
|
404 |
+
"This column should contain the target translation",
|
405 |
+
col_names,
|
406 |
+
index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
|
407 |
+
if config_metadata is not None
|
408 |
+
else 0,
|
409 |
+
)
|
410 |
+
col_mapping[text_col] = "source"
|
411 |
+
col_mapping[target_col] = "target"
|
412 |
+
|
413 |
+
elif selected_task == "summarization":
|
414 |
+
with col1:
|
415 |
+
st.markdown("`text` column")
|
416 |
+
st.text("")
|
417 |
+
st.text("")
|
418 |
+
st.text("")
|
419 |
+
st.text("")
|
420 |
+
st.markdown("`target` column")
|
421 |
+
with col2:
|
422 |
+
text_col = st.selectbox(
|
423 |
+
"This column should contain the text to be summarized",
|
424 |
+
col_names,
|
425 |
+
index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
|
426 |
+
if config_metadata is not None
|
427 |
+
else 0,
|
428 |
+
)
|
429 |
+
target_col = st.selectbox(
|
430 |
+
"This column should contain the target summary",
|
431 |
+
col_names,
|
432 |
+
index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
|
433 |
+
if config_metadata is not None
|
434 |
+
else 0,
|
435 |
+
)
|
436 |
+
col_mapping[text_col] = "text"
|
437 |
+
col_mapping[target_col] = "target"
|
438 |
+
|
439 |
+
elif selected_task == "extractive_question_answering":
|
440 |
+
if config_metadata is not None:
|
441 |
+
col_mapping = config_metadata["col_mapping"]
|
442 |
+
# Hub YAML parser converts periods to hyphens, so we remap them here
|
443 |
+
col_mapping = format_col_mapping(col_mapping)
|
444 |
+
with col1:
|
445 |
+
st.markdown("`context` column")
|
446 |
+
st.text("")
|
447 |
+
st.text("")
|
448 |
+
st.text("")
|
449 |
+
st.text("")
|
450 |
+
st.markdown("`question` column")
|
451 |
+
st.text("")
|
452 |
+
st.text("")
|
453 |
+
st.text("")
|
454 |
+
st.text("")
|
455 |
+
st.markdown("`answers.text` column")
|
456 |
+
st.text("")
|
457 |
+
st.text("")
|
458 |
+
st.text("")
|
459 |
+
st.text("")
|
460 |
+
st.markdown("`answers.answer_start` column")
|
461 |
+
with col2:
|
462 |
+
context_col = st.selectbox(
|
463 |
+
"This column should contain the question's context",
|
464 |
+
col_names,
|
465 |
+
index=col_names.index(get_key(col_mapping, "context")) if config_metadata is not None else 0,
|
466 |
+
)
|
467 |
+
question_col = st.selectbox(
|
468 |
+
"This column should contain the question to be answered, given the context",
|
469 |
+
col_names,
|
470 |
+
index=col_names.index(get_key(col_mapping, "question")) if config_metadata is not None else 0,
|
471 |
+
)
|
472 |
+
answers_text_col = st.selectbox(
|
473 |
+
"This column should contain example answers to the question, extracted from the context",
|
474 |
+
col_names,
|
475 |
+
index=col_names.index(get_key(col_mapping, "answers.text")) if config_metadata is not None else 0,
|
476 |
+
)
|
477 |
+
answers_start_col = st.selectbox(
|
478 |
+
"This column should contain the indices in the context of the first character of each `answers.text`",
|
479 |
+
col_names,
|
480 |
+
index=col_names.index(get_key(col_mapping, "answers.answer_start"))
|
481 |
+
if config_metadata is not None
|
482 |
+
else 0,
|
483 |
+
)
|
484 |
+
col_mapping[context_col] = "context"
|
485 |
+
col_mapping[question_col] = "question"
|
486 |
+
col_mapping[answers_text_col] = "answers.text"
|
487 |
+
col_mapping[answers_start_col] = "answers.answer_start"
|
488 |
+
elif selected_task in ["image_binary_classification", "image_multi_class_classification"]:
|
489 |
+
with col1:
|
490 |
+
st.markdown("`image` column")
|
491 |
+
st.text("")
|
492 |
+
st.text("")
|
493 |
+
st.text("")
|
494 |
+
st.text("")
|
495 |
+
st.markdown("`target` column")
|
496 |
+
with col2:
|
497 |
+
image_col = st.selectbox(
|
498 |
+
"This column should contain the images to be classified",
|
499 |
+
col_names,
|
500 |
+
index=col_names.index(get_key(config_metadata["col_mapping"], "image"))
|
501 |
+
if config_metadata is not None
|
502 |
+
else 0,
|
503 |
+
)
|
504 |
+
target_col = st.selectbox(
|
505 |
+
"This column should contain the labels associated with the images",
|
506 |
+
col_names,
|
507 |
+
index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
|
508 |
+
if config_metadata is not None
|
509 |
+
else 0,
|
510 |
+
)
|
511 |
+
col_mapping[image_col] = "image"
|
512 |
+
col_mapping[target_col] = "target"
|
513 |
+
|
514 |
+
# Select metrics
|
515 |
+
st.markdown("**Select metrics**")
|
516 |
+
st.markdown("The following metrics will be computed")
|
517 |
+
html_string = " ".join(
|
518 |
+
[
|
519 |
+
'<div style="padding-right:5px;padding-left:5px;padding-top:5px;padding-bottom:5px;float:left">'
|
520 |
+
+ '<div style="background-color:#D3D3D3;border-radius:5px;display:inline-block;padding-right:5px;'
|
521 |
+
+ 'padding-left:5px;color:white">'
|
522 |
+
+ metric
|
523 |
+
+ "</div></div>"
|
524 |
+
for metric in TASK_TO_DEFAULT_METRICS[selected_task]
|
525 |
+
]
|
526 |
+
)
|
527 |
+
st.markdown(html_string, unsafe_allow_html=True)
|
528 |
+
selected_metrics = st.multiselect(
|
529 |
+
"(Optional) Select additional metrics",
|
530 |
+
sorted(list(set(SUPPORTED_METRICS) - set(TASK_TO_DEFAULT_METRICS[selected_task]))),
|
531 |
+
help="""User-selected metrics will be computed with their default arguments. \
|
532 |
+
For example, `f1` will report results for binary labels. \
|
533 |
+
Check out the [available metrics](https://huggingface.co/metrics) for more details.""",
|
534 |
+
)
|
535 |
+
|
536 |
+
with st.form(key="form"):
|
537 |
+
compatible_models = get_compatible_models(selected_task, [selected_dataset])
|
538 |
+
selected_models = st.multiselect(
|
539 |
+
"Select the models you wish to evaluate",
|
540 |
+
compatible_models,
|
541 |
+
help="""Don't see your favourite model in this list? Add the dataset and task it was trained on to the \
|
542 |
+
[model card metadata.](https://huggingface.co/docs/hub/models-cards#model-card-metadata)""",
|
543 |
+
)
|
544 |
+
print("INFO -- Selected models before filter:", selected_models)
|
545 |
+
|
546 |
+
hf_username = st.text_input("Enter your 🤗 Hub username to be notified when the evaluation is finished")
|
547 |
+
|
548 |
+
submit_button = st.form_submit_button("Evaluate models 🚀")
|
549 |
+
|
550 |
+
if submit_button:
|
551 |
+
if len(hf_username) == 0:
|
552 |
+
st.warning("No 🤗 Hub username provided! Please enter your username and try again.")
|
553 |
+
elif len(selected_models) == 0:
|
554 |
+
st.warning("⚠️ No models were selected for evaluation! Please select at least one model and try again.")
|
555 |
+
elif len(selected_models) > 10:
|
556 |
+
st.warning("Only 10 models can be evaluated at once. Please select fewer models and try again.")
|
557 |
+
else:
|
558 |
+
# Filter out previously evaluated models
|
559 |
+
selected_models = filter_evaluated_models(
|
560 |
+
selected_models,
|
561 |
+
selected_task,
|
562 |
+
selected_dataset,
|
563 |
+
selected_config,
|
564 |
+
selected_split,
|
565 |
+
selected_metrics,
|
566 |
+
)
|
567 |
+
print("INFO -- Selected models after filter:", selected_models)
|
568 |
+
if len(selected_models) > 0:
|
569 |
+
project_payload = {
|
570 |
+
"username": AUTOTRAIN_USERNAME,
|
571 |
+
"proj_name": create_autotrain_project_name(selected_dataset, selected_config),
|
572 |
+
"task": TASK_TO_ID[selected_task],
|
573 |
+
"config": {
|
574 |
+
"language": AUTOTRAIN_TASK_TO_LANG[selected_task]
|
575 |
+
if selected_task in AUTOTRAIN_TASK_TO_LANG
|
576 |
+
else "en",
|
577 |
+
"max_models": 5,
|
578 |
+
"instance": {
|
579 |
+
"provider": "sagemaker" if selected_task in AUTOTRAIN_MACHINE.keys() else "ovh",
|
580 |
+
"instance_type": AUTOTRAIN_MACHINE[selected_task]
|
581 |
+
if selected_task in AUTOTRAIN_MACHINE.keys()
|
582 |
+
else "p3",
|
583 |
+
"max_runtime_seconds": 172800,
|
584 |
+
"num_instances": 1,
|
585 |
+
"disk_size_gb": 200,
|
586 |
+
},
|
587 |
+
"evaluation": {
|
588 |
+
"metrics": selected_metrics,
|
589 |
+
"models": selected_models,
|
590 |
+
"hf_username": hf_username,
|
591 |
+
},
|
592 |
+
},
|
593 |
+
}
|
594 |
+
print(f"INFO -- Payload: {project_payload}")
|
595 |
+
+                project_json_resp = http_post(
+                    path="/projects/create",
+                    payload=project_payload,
+                    token=HF_TOKEN,
+                    domain=AUTOTRAIN_BACKEND_API,
+                ).json()
+                print(f"INFO -- Project creation response: {project_json_resp}")
+
+                if project_json_resp["created"]:
+                    data_payload = {
+                        "split": 4,  # use "auto" split choice in AutoTrain
+                        "col_mapping": col_mapping,
+                        "load_config": {"max_size_bytes": 0, "shuffle": False},
+                        "dataset_id": selected_dataset,
+                        "dataset_config": selected_config,
+                        "dataset_split": selected_split,
+                    }
+                    data_json_resp = http_post(
+                        path=f"/projects/{project_json_resp['id']}/data/dataset",
+                        payload=data_payload,
+                        token=HF_TOKEN,
+                        domain=AUTOTRAIN_BACKEND_API,
+                    ).json()
+                    print(f"INFO -- Dataset creation response: {data_json_resp}")
+                    if data_json_resp["download_status"] == 1:
+                        train_json_resp = http_post(
+                            path=f"/projects/{project_json_resp['id']}/data/start_processing",
+                            token=HF_TOKEN,
+                            domain=AUTOTRAIN_BACKEND_API,
+                        ).json()
+                        # For local development we process and approve projects on-the-fly
+                        if "localhost" in AUTOTRAIN_BACKEND_API:
+                            with st.spinner("⏳ Waiting for data processing to complete ..."):
+                                is_data_processing_success = False
+                                while is_data_processing_success is not True:
+                                    project_status = http_get(
+                                        path=f"/projects/{project_json_resp['id']}",
+                                        token=HF_TOKEN,
+                                        domain=AUTOTRAIN_BACKEND_API,
+                                    ).json()
+                                    if project_status["status"] == 3:
+                                        is_data_processing_success = True
+                                    time.sleep(10)
+
+                                # Approve training job
+                                train_job_resp = http_post(
+                                    path=f"/projects/{project_json_resp['id']}/start_training",
+                                    token=HF_TOKEN,
+                                    domain=AUTOTRAIN_BACKEND_API,
+                                ).json()
+                                st.success("✅ Data processing and project approval complete - go forth and evaluate!")
+                        else:
+                            # Prod/staging submissions are evaluated in a cron job via run_evaluation_jobs.py
+                            print(f"INFO -- AutoTrain job response: {train_json_resp}")
+                            if train_json_resp["success"]:
+                                train_eval_index = {
+                                    "train-eval-index": [
+                                        {
+                                            "config": selected_config,
+                                            "task": AUTOTRAIN_TASK_TO_HUB_TASK[selected_task],
+                                            "task_id": selected_task,
+                                            "splits": {"eval_split": selected_split},
+                                            "col_mapping": col_mapping,
+                                        }
+                                    ]
+                                }
+                                selected_metadata = yaml.dump(train_eval_index, sort_keys=False)
+                                dataset_card_url = get_dataset_card_url(selected_dataset)
+                                st.success("✅ Successfully submitted evaluation job!")
+                                st.markdown(
+                                    f"""
+                                Evaluation can take up to 1 hour to complete, so grab a ☕️ or 🍵 while you wait:
+
+                                * 🔔 A [Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions) with the evaluation results will be opened for each model you selected. Check your email for notifications.
+                                * 📊 Click [here](https://hf.co/spaces/autoevaluate/leaderboards?dataset={selected_dataset}) to view the results from your submission once the Hub pull request is merged.
+                                * 🥱 Tired of configuring evaluations? Add the following metadata to the [dataset card]({dataset_card_url}) to enable 1-click evaluations:
+                                """  # noqa
+                                )
+                                st.markdown(
+                                    f"""
+                                ```yaml
+                                {selected_metadata}
+                                """
+                                )
+                                print("INFO -- Pushing evaluation job logs to the Hub")
+                                evaluation_log = {}
+                                evaluation_log["project_id"] = project_json_resp["id"]
+                                evaluation_log["autotrain_env"] = (
+                                    "staging" if "staging" in AUTOTRAIN_BACKEND_API else "prod"
+                                )
+                                evaluation_log["payload"] = project_payload
+                                evaluation_log["project_creation_response"] = project_json_resp
+                                evaluation_log["dataset_creation_response"] = data_json_resp
+                                evaluation_log["autotrain_job_response"] = train_json_resp
+                                commit_evaluation_log(evaluation_log, hf_access_token=HF_TOKEN)
+                            else:
+                                st.error("🙈 Oh no, there was an error submitting your evaluation job!")
+            else:
+                st.warning("⚠️ No models left to evaluate! Please select other models and try again.")
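For reference, the submission flow above boils down to three sequential POST calls against the AutoTrain backend. A minimal sketch, reusing the `http_post` helper from `utils.py`; the payload fields here are illustrative placeholders, not the full `project_payload` and `data_payload` that `app.py` constructs:

import os

from utils import http_post

HF_TOKEN = os.getenv("HF_TOKEN")
AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")

# 1. Create the evaluation project (payload fields are placeholders).
project = http_post(
    path="/projects/create",
    payload={"username": "autoevaluator", "proj_name": "eval-demo-1a2b3c"},
    token=HF_TOKEN,
    domain=AUTOTRAIN_BACKEND_API,
).json()

if project["created"]:
    # 2. Attach the dataset to the project ("split": 4 is AutoTrain's "auto" choice).
    http_post(
        path=f"/projects/{project['id']}/data/dataset",
        payload={"split": 4, "dataset_id": "imdb", "dataset_split": "test"},
        token=HF_TOKEN,
        domain=AUTOTRAIN_BACKEND_API,
    )
    # 3. Kick off data processing; approval happens later (locally or via the cron job).
    http_post(
        path=f"/projects/{project['id']}/data/start_processing",
        token=HF_TOKEN,
        domain=AUTOTRAIN_BACKEND_API,
    )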
evaluation.py
ADDED
@@ -0,0 +1,57 @@
+import copy
+from dataclasses import dataclass
+
+import streamlit as st
+from huggingface_hub import DatasetFilter, HfApi
+from huggingface_hub.hf_api import DatasetInfo
+
+
+@dataclass(frozen=True, eq=True)
+class EvaluationInfo:
+    task: str
+    model: str
+    dataset_name: str
+    dataset_config: str
+    dataset_split: str
+    metrics: set
+
+
+def create_evaluation_info(dataset_info: DatasetInfo) -> EvaluationInfo:
+    if dataset_info.cardData is not None:
+        metadata = dataset_info.cardData["eval_info"]
+        metadata.pop("col_mapping", None)
+        # TODO(lewtun): populate dataset cards with metric info
+        if "metrics" not in metadata:
+            metadata["metrics"] = frozenset()
+        else:
+            metadata["metrics"] = frozenset(metadata["metrics"])
+        return EvaluationInfo(**metadata)
+
+
+def get_evaluation_infos():
+    filt = DatasetFilter(author="autoevaluate")
+    evaluation_datasets = HfApi().list_datasets(filter=filt, full=True)
+    return [create_evaluation_info(dset) for dset in evaluation_datasets]
+
+
+def filter_evaluated_models(models, task, dataset_name, dataset_config, dataset_split, metrics):
+    evaluation_infos = get_evaluation_infos()
+    models_to_filter = copy.copy(models)
+
+    for model in models_to_filter:
+        evaluation_info = EvaluationInfo(
+            task=task,
+            model=model,
+            dataset_name=dataset_name,
+            dataset_config=dataset_config,
+            dataset_split=dataset_split,
+            metrics=frozenset(metrics),
+        )
+        if evaluation_info in evaluation_infos:
+            st.info(
+                f"Model [`{model}`](https://huggingface.co/{model}) has already been evaluated on this configuration. \
+                This model will be excluded from the evaluation job..."
+            )
+            models.remove(model)
+
+    return models
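A rough usage sketch for `filter_evaluated_models`; the model and dataset values below are hypothetical stand-ins for what the Streamlit selectors provide in `app.py`:

from evaluation import filter_evaluated_models

# Hypothetical inputs; in app.py these come from the Streamlit selectors.
models = ["distilbert-base-uncased-finetuned-sst-2-english", "some-user/some-model"]
remaining = filter_evaluated_models(
    models,
    task="binary_classification",
    dataset_name="glue",
    dataset_config="sst2",
    dataset_split="validation",
    metrics=["f1"],
)
print(remaining)  # only the models not yet evaluated on this exact configuration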
images/autotrain_job.png
ADDED
images/autotrain_projects.png
ADDED
notebooks/flush-prediction-repos.ipynb
ADDED
@@ -0,0 +1,177 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "c8093b9e-ca6a-423d-96c3-5fe21f7109a1",
+   "metadata": {},
+   "source": [
+    "## Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "efe8cda7-a687-4867-b1f0-8efbcd428681",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from pathlib import Path\n",
+    "\n",
+    "from dotenv import load_dotenv\n",
+    "from huggingface_hub import DatasetFilter, delete_repo, list_datasets\n",
+    "from tqdm.auto import tqdm\n",
+    "\n",
+    "if Path(\".env\").is_file():\n",
+    "    load_dotenv(\".env\")\n",
+    "\n",
+    "HF_TOKEN = os.getenv(\"HF_TOKEN\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8f6e01f0-b658-451f-999c-e08d9f4bbbd3",
+   "metadata": {},
+   "source": [
+    "## Get all prediction repos from autoevaluate org"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "2e369478-66d3-498d-a8fd-95bc9180f362",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_prediction_repos():\n",
+    "    all_repos = list_datasets(author=\"autoevaluate\")\n",
+    "    prediction_repos = [\n",
+    "        repo for repo in all_repos if repo.id.split(\"/\")[1].startswith(\"autoeval-\")\n",
+    "    ]\n",
+    "    return prediction_repos"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "542db019-d01f-42f5-bef4-888dae8eeadb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "66"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "prediction_repos = get_prediction_repos()\n",
+    "len(prediction_repos)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "331cfabf-4b73-490f-8d6a-86b5bc162666",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "DatasetInfo: {\n",
+       "\tid: autoevaluate/autoeval-staging-eval-project-9dcc51b5-6464670\n",
+       "\tsha: d3bb02be592d167f7a217ac9341d187142d9a90a\n",
+       "\tlastModified: 2022-06-13T14:54:34.000Z\n",
+       "\ttags: ['type:predictions', 'tags:autotrain', 'tags:evaluation', 'datasets:glue']\n",
+       "\tprivate: False\n",
+       "\tauthor: autoevaluate\n",
+       "\tdescription: None\n",
+       "\tcitation: None\n",
+       "\tcardData: None\n",
+       "\tsiblings: None\n",
+       "\tgated: False\n",
+       "\tdownloads: 12\n",
+       "}"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "prediction_repos[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "57a86b69-ffe8-4035-8f3d-5c917d8ce7bf",
+   "metadata": {},
+   "source": [
+    "## Delete all prediction repos"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "6c8e23e7-2a6d-437b-9742-17f37684d9eb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "06fa304dcc6d44e39205b20a5e488052",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/66 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "for repo in tqdm(prediction_repos):\n",
+    "    delete_repo(\n",
+    "        repo_id=repo.id,\n",
+    "        repo_type=\"dataset\",\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7d64b0aa-d05f-4497-9bd2-eb2fc0d8bd7a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "autoevaluate",
+   "language": "python",
+   "name": "autoevaluate"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
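The deletion loop in this notebook is destructive. A cautious variant (a sketch reusing the notebook's own `get_prediction_repos` logic) can first print what would be removed before committing to deletion:

from huggingface_hub import delete_repo, list_datasets


def get_prediction_repos():
    # Same filter as the notebook: prediction repos are prefixed with "autoeval-".
    all_repos = list_datasets(author="autoevaluate")
    return [repo for repo in all_repos if repo.id.split("/")[1].startswith("autoeval-")]


dry_run = True  # flip to False to actually delete
for repo in get_prediction_repos():
    if dry_run:
        print(f"Would delete: {repo.id}")
    else:
        delete_repo(repo_id=repo.id, repo_type="dataset")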
pyproject.toml
ADDED
@@ -0,0 +1,2 @@
+[tool.isort]
+profile = "black"
requirements.txt
ADDED
@@ -0,0 +1,12 @@
+huggingface-hub<0.8
+python-dotenv
+streamlit==1.10.0
+datasets<2.3
+evaluate<0.2
+jsonlines
+typer
+# Dataset specific deps
+py7zr<0.19
+openpyxl<3.1
+# Dirty bug from Google
+protobuf<=3.20.1
run_evaluation_jobs.py
ADDED
@@ -0,0 +1,64 @@
+import os
+from pathlib import Path
+
+import typer
+from datasets import load_dataset
+from dotenv import load_dotenv
+
+from utils import http_get, http_post
+
+if Path(".env").is_file():
+    load_dotenv(".env")
+
+HF_TOKEN = os.getenv("HF_TOKEN")
+AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME")
+AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")
+
+if "staging" in AUTOTRAIN_BACKEND_API:
+    AUTOTRAIN_ENV = "staging"
+else:
+    AUTOTRAIN_ENV = "prod"
+
+
+def main():
+    print(f"💡 Starting jobs on {AUTOTRAIN_ENV} environment")
+    logs_df = load_dataset("autoevaluate/evaluation-job-logs", use_auth_token=HF_TOKEN, split="train").to_pandas()
+    # Filter out legacy AutoTrain submissions prior to project approvals requirement
+    projects_df = logs_df.copy()[(~logs_df["project_id"].isnull())]
+    # Filter IDs for appropriate AutoTrain env (staging vs prod)
+    projects_df = projects_df.copy().query(f"autotrain_env == '{AUTOTRAIN_ENV}'")
+    projects_to_approve = projects_df["project_id"].astype(int).tolist()
+    failed_approvals = []
+    print(f"🚀 Found {len(projects_to_approve)} evaluation projects to approve!")
+
+    for project_id in projects_to_approve:
+        print(f"Attempting to evaluate project ID {project_id} ...")
+        try:
+            project_info = http_get(
+                path=f"/projects/{project_id}",
+                token=HF_TOKEN,
+                domain=AUTOTRAIN_BACKEND_API,
+            ).json()
+            print(project_info)
+            # Only start evaluation for projects with completed data processing (status=3)
+            if project_info["status"] == 3 and project_info["training_status"] == "not_started":
+                train_job_resp = http_post(
+                    path=f"/projects/{project_id}/start_training",
+                    token=HF_TOKEN,
+                    domain=AUTOTRAIN_BACKEND_API,
+                ).json()
+                print(f"🤖 Project {project_id} approval response: {train_job_resp}")
+            else:
+                print(f"💪 Project {project_id} either not ready or has already been evaluated. Skipping ...")
+        except Exception as e:
+            print(f"There was a problem obtaining the project info for project ID {project_id}")
+            print(f"Error message: {e}")
+            failed_approvals.append(project_id)
+
+    if len(failed_approvals) > 0:
+        print(f"🚨 Failed to approve {len(failed_approvals)} projects: {failed_approvals}")
+
+
+if __name__ == "__main__":
+    typer.run(main)
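The approval gate in this script can be restated as a small pure function, which makes the condition easy to sanity-check in isolation; status code 3 meaning "data processing complete" is taken from the comment in the script above, not from a documented enum:

def should_start_training(project_info: dict) -> bool:
    # A project is started only when data processing is done (status == 3)
    # and training has not already been kicked off.
    return project_info["status"] == 3 and project_info["training_status"] == "not_started"


assert should_start_training({"status": 3, "training_status": "not_started"})
assert not should_start_training({"status": 2, "training_status": "not_started"})
assert not should_start_training({"status": 3, "training_status": "started"})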
utils.py
ADDED
@@ -0,0 +1,215 @@
+import inspect
+import uuid
+from typing import Dict, List, Union
+
+import jsonlines
+import requests
+import streamlit as st
+from evaluate import load
+from huggingface_hub import HfApi, ModelFilter, Repository, dataset_info, list_metrics
+from tqdm import tqdm
+
+AUTOTRAIN_TASK_TO_HUB_TASK = {
+    "binary_classification": "text-classification",
+    "multi_class_classification": "text-classification",
+    "natural_language_inference": "text-classification",
+    "entity_extraction": "token-classification",
+    "extractive_question_answering": "question-answering",
+    "translation": "translation",
+    "summarization": "summarization",
+    "image_binary_classification": "image-classification",
+    "image_multi_class_classification": "image-classification",
+    "text_zero_shot_classification": "text-generation",
+}
+
+
+HUB_TASK_TO_AUTOTRAIN_TASK = {v: k for k, v in AUTOTRAIN_TASK_TO_HUB_TASK.items()}
+LOGS_REPO = "evaluation-job-logs"
+
+
+def get_auth_headers(token: str, prefix: str = "Bearer"):
+    return {"Authorization": f"{prefix} {token}"}
+
+
+def http_post(path: str, token: str, payload=None, domain: str = None, params=None) -> requests.Response:
+    """HTTP POST request to the AutoNLP API, raises UnreachableAPIError if the API cannot be reached"""
+    try:
+        response = requests.post(
+            url=domain + path,
+            json=payload,
+            headers=get_auth_headers(token=token),
+            allow_redirects=True,
+            params=params,
+        )
+    except requests.exceptions.ConnectionError:
+        print("❌ Failed to reach AutoNLP API, check your internet connection")
+        raise  # re-raise so we don't fall through to an unbound `response` below
+    response.raise_for_status()
+    return response
+
+
+def http_get(path: str, domain: str, token: str = None, params: dict = None) -> requests.Response:
+    """HTTP GET request to `path`, raises UnreachableAPIError if the API cannot be reached"""
+    try:
+        response = requests.get(
+            url=domain + path,
+            headers=get_auth_headers(token=token),
+            allow_redirects=True,
+            params=params,
+        )
+    except requests.exceptions.ConnectionError:
+        print(f"❌ Failed to reach {path}, check your internet connection")
+        raise  # re-raise so we don't fall through to an unbound `response` below
+    response.raise_for_status()
+    return response
+
+
+def get_metadata(dataset_name: str, token: str) -> Union[Dict, None]:
+    data = dataset_info(dataset_name, token=token)
+    if data.cardData is not None and "train-eval-index" in data.cardData.keys():
+        return data.cardData["train-eval-index"]
+    else:
+        return None
+
+
+def get_compatible_models(task: str, dataset_ids: List[str]) -> List[str]:
+    """
+    Returns all model IDs that are compatible with the given task and dataset names.
+
+    Args:
+        task (`str`): The task to search for.
+        dataset_ids (`List[str]`): A list of dataset IDs to search for.
+
+    Returns:
+        A list of model IDs, sorted alphabetically.
+    """
+    compatible_models = []
+    # Allow any summarization model to be used for summarization tasks
+    # and allow any text-generation model to be used for text_zero_shot_classification
+    if task in ("summarization", "text_zero_shot_classification"):
+        model_filter = ModelFilter(
+            task=AUTOTRAIN_TASK_TO_HUB_TASK[task],
+            library=["transformers", "pytorch"],
+        )
+        compatible_models.extend(HfApi().list_models(filter=model_filter))
+    # Include models trained on SQuAD datasets, since these can be evaluated on
+    # other SQuAD-like datasets
+    if task == "extractive_question_answering":
+        dataset_ids.extend(["squad", "squad_v2"])
+
+    # TODO: relax filter on PyTorch models if TensorFlow supported in AutoTrain
+    for dataset_id in dataset_ids:
+        model_filter = ModelFilter(
+            task=AUTOTRAIN_TASK_TO_HUB_TASK[task],
+            trained_dataset=dataset_id,
+            library=["transformers", "pytorch"],
+        )
+        compatible_models.extend(HfApi().list_models(filter=model_filter))
+    return sorted(set([model.modelId for model in compatible_models]))
+
+
+def get_key(col_mapping, val):
+    for key, value in col_mapping.items():
+        if val == value:
+            return key
+
+    return "key doesn't exist"
+
+
+def format_col_mapping(col_mapping: dict) -> dict:
+    for k, v in col_mapping["answers"].items():
+        col_mapping[f"answers.{k}"] = f"answers.{v}"
+    del col_mapping["answers"]
+    return col_mapping
+
+
+def commit_evaluation_log(evaluation_log, hf_access_token=None):
+    logs_repo_url = f"https://huggingface.co/datasets/autoevaluate/{LOGS_REPO}"
+    logs_repo = Repository(
+        local_dir=LOGS_REPO,
+        clone_from=logs_repo_url,
+        repo_type="dataset",
+        private=True,
+        use_auth_token=hf_access_token,
+    )
+    logs_repo.git_pull()
+    with jsonlines.open(f"{LOGS_REPO}/logs.jsonl") as r:
+        lines = []
+        for obj in r:
+            lines.append(obj)
+
+    lines.append(evaluation_log)
+    with jsonlines.open(f"{LOGS_REPO}/logs.jsonl", mode="w") as writer:
+        for job in lines:
+            writer.write(job)
+    logs_repo.push_to_hub(
+        commit_message=f"Evaluation submitted with project name {evaluation_log['payload']['proj_name']}"
+    )
+    print("INFO -- Pushed evaluation logs to the Hub")
+
+
+@st.experimental_memo
+def get_supported_metrics():
+    """Helper function to get all metrics compatible with evaluation service.
+
+    Requires all metric dependencies installed in the same environment, so wait until
+    https://github.com/huggingface/evaluate/issues/138 is resolved before using this.
+    """
+    metrics = [metric.id for metric in list_metrics()]
+    supported_metrics = []
+    for metric in tqdm(metrics):
+        # TODO: this currently requires all metric dependencies to be installed
+        # in the same environment. Refactor to avoid needing to actually load
+        # the metric.
+        try:
+            print(f"INFO -- Attempting to load metric: {metric}")
+            metric_func = load(metric)
+        except Exception as e:
+            print(e)
+            print("WARNING -- Skipping the following metric, which cannot load:", metric)
+            continue
+
+        argspec = inspect.getfullargspec(metric_func.compute)
+        if "references" in argspec.kwonlyargs and "predictions" in argspec.kwonlyargs:
+            # We require that "references" and "predictions" are arguments
+            # to the metric function. We also require that the other arguments
+            # besides "references" and "predictions" have defaults and so do not
+            # need to be specified explicitly.
+            defaults = True
+            for key, value in argspec.kwonlydefaults.items():
+                if key not in ("references", "predictions"):
+                    if value is None:
+                        defaults = False
+                        break
+
+            if defaults:
+                supported_metrics.append(metric)
+    return supported_metrics
+
+
+def get_dataset_card_url(dataset_id: str) -> str:
+    """Gets the URL to edit the dataset card for the given dataset ID."""
+    if "/" in dataset_id:
+        return f"https://huggingface.co/datasets/{dataset_id}/edit/main/README.md"
+    else:
+        return f"https://github.com/huggingface/datasets/edit/master/datasets/{dataset_id}/README.md"
+
+
+def create_autotrain_project_name(dataset_id: str, dataset_config: str) -> str:
+    """Creates an AutoTrain project name for the given dataset ID."""
+    # Project names cannot have "/", so we need to format community datasets accordingly
+    dataset_id_formatted = dataset_id.replace("/", "__")
+    dataset_config_formatted = dataset_config.replace("--", "__")
+    # Project names need to be unique, so we append a random string to guarantee this while adhering to naming rules
+    basename = f"eval-{dataset_id_formatted}-{dataset_config_formatted}"
+    basename = basename[:60] if len(basename) > 60 else basename  # Hub naming limitation
+    return f"{basename}-{str(uuid.uuid4())[:6]}"
+
+
+def get_config_metadata(config: str, metadata: List[Dict] = None) -> Union[Dict, None]:
+    """Gets the dataset card metadata for the given config."""
+    if metadata is None:
+        return None
+    config_metadata = [m for m in metadata if m["config"] == config]
+    if len(config_metadata) >= 1:
+        return config_metadata[0]
+    else:
+        return None
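A quick check of `create_autotrain_project_name` (the dataset ID below is hypothetical): the `/` in community dataset IDs is replaced with `__`, the base name is capped at 60 characters, and a random 6-character suffix keeps project names unique:

from utils import create_autotrain_project_name

name = create_autotrain_project_name("some-user/some-dataset", "default")
print(name)  # e.g. eval-some-user__some-dataset-default-1a2b3c (suffix is random)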