HadrienByr and lewtun committed on
Commit
3ce0948
·
0 Parent(s):

Duplicate from autoevaluate/model-evaluator


Co-authored-by: Lewis Tunstall <[email protected]>

.env.template ADDED
@@ -0,0 +1,4 @@
+ AUTOTRAIN_USERNAME=autoevaluator # The bot or user that authors evaluation jobs
+ HF_TOKEN=hf_xxx # An API token of the `autoevaluator` user
+ AUTOTRAIN_BACKEND_API=https://api-staging.autotrain.huggingface.co # The AutoTrain backend to send jobs to. Use https://api.autotrain.huggingface.co for prod or http://localhost:8000 for local development
+ DATASETS_PREVIEW_API=https://datasets-server.huggingface.co # The API to grab dataset information from
.github/workflows/check_filesize.yml ADDED
@@ -0,0 +1,16 @@
+ name: Check file size
+ on: # or directly `on: [push]` to run the action on every push on any branch
+   pull_request:
+     branches: [main]
+
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Check large files
+         uses: ActionsDesk/[email protected]
+         with:
+           filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
.github/workflows/quality.yml ADDED
@@ -0,0 +1,29 @@
+ name: Code quality
+
+ on:
+   push:
+     branches:
+       - main
+   pull_request:
+     branches:
+       - main
+
+ jobs:
+
+   check_code_quality:
+     name: Check code quality
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout code
+         uses: actions/checkout@v2
+       - name: Setup Python environment
+         uses: actions/setup-python@v2
+         with:
+           python-version: 3.9
+       - name: Install dependencies
+         run: |
+           python -m pip install --upgrade pip
+           python -m pip install black isort flake8
+       - name: Code quality
+         run: |
+           make quality
.github/workflows/run_evaluation_jobs.yml ADDED
@@ -0,0 +1,30 @@
+ name: Start evaluation jobs
+
+ on:
+   schedule:
+     - cron: '*/15 * * * *' # Start evaluations every 15th minute
+
+ jobs:
+
+   build:
+     runs-on: ubuntu-latest
+
+     steps:
+       - name: Checkout code
+         uses: actions/checkout@v2
+
+       - name: Setup Python Environment
+         uses: actions/setup-python@v2
+         with:
+           python-version: 3.8
+
+       - name: Install requirements
+         run: pip install -r requirements.txt
+
+       - name: Execute scoring script
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+           AUTOTRAIN_USERNAME: ${{ secrets.AUTOTRAIN_USERNAME }}
+           AUTOTRAIN_BACKEND_API: ${{ secrets.AUTOTRAIN_BACKEND_API }}
+         run: |
+           HF_TOKEN=$HF_TOKEN AUTOTRAIN_USERNAME=$AUTOTRAIN_USERNAME AUTOTRAIN_BACKEND_API=$AUTOTRAIN_BACKEND_API python run_evaluation_jobs.py
.github/workflows/sync_with_spaces.yml ADDED
@@ -0,0 +1,20 @@
+ name: Sync to Hugging Face hub
+ on:
+   push:
+     branches: [main]
+
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v2
+         with:
+           fetch-depth: 0
+       - name: Push to hub
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: |
+           git push https://lewtun:[email protected]/spaces/autoevaluate/model-evaluator main
.gitignore ADDED
@@ -0,0 +1,134 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ scratch/
+
+ # Evaluation job logs
+ evaluation-job-logs/
LICENSE ADDED
@@ -0,0 +1,201 @@
+                                  Apache License
+                            Version 2.0, January 2004
+                         http://www.apache.org/licenses/
+
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+    1. Definitions.
+
+       "License" shall mean the terms and conditions for use, reproduction,
+       and distribution as defined by Sections 1 through 9 of this document.
+
+       "Licensor" shall mean the copyright owner or entity authorized by
+       the copyright owner that is granting the License.
+
+       "Legal Entity" shall mean the union of the acting entity and all
+       other entities that control, are controlled by, or are under common
+       control with that entity. For the purposes of this definition,
+       "control" means (i) the power, direct or indirect, to cause the
+       direction or management of such entity, whether by contract or
+       otherwise, or (ii) ownership of fifty percent (50%) or more of the
+       outstanding shares, or (iii) beneficial ownership of such entity.
+
+       "You" (or "Your") shall mean an individual or Legal Entity
+       exercising permissions granted by this License.
+
+       "Source" form shall mean the preferred form for making modifications,
+       including but not limited to software source code, documentation
+       source, and configuration files.
+
+       "Object" form shall mean any form resulting from mechanical
+       transformation or translation of a Source form, including but
+       not limited to compiled object code, generated documentation,
+       and conversions to other media types.
+
+       "Work" shall mean the work of authorship, whether in Source or
+       Object form, made available under the License, as indicated by a
+       copyright notice that is included in or attached to the work
+       (an example is provided in the Appendix below).
+
+       "Derivative Works" shall mean any work, whether in Source or Object
+       form, that is based on (or derived from) the Work and for which the
+       editorial revisions, annotations, elaborations, or other modifications
+       represent, as a whole, an original work of authorship. For the purposes
+       of this License, Derivative Works shall not include works that remain
+       separable from, or merely link (or bind by name) to the interfaces of,
+       the Work and Derivative Works thereof.
+
+       "Contribution" shall mean any work of authorship, including
+       the original version of the Work and any modifications or additions
+       to that Work or Derivative Works thereof, that is intentionally
+       submitted to Licensor for inclusion in the Work by the copyright owner
+       or by an individual or Legal Entity authorized to submit on behalf of
+       the copyright owner. For the purposes of this definition, "submitted"
+       means any form of electronic, verbal, or written communication sent
+       to the Licensor or its representatives, including but not limited to
+       communication on electronic mailing lists, source code control systems,
+       and issue tracking systems that are managed by, or on behalf of, the
+       Licensor for the purpose of discussing and improving the Work, but
+       excluding communication that is conspicuously marked or otherwise
+       designated in writing by the copyright owner as "Not a Contribution."
+
+       "Contributor" shall mean Licensor and any individual or Legal Entity
+       on behalf of whom a Contribution has been received by Licensor and
+       subsequently incorporated within the Work.
+
+    2. Grant of Copyright License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       copyright license to reproduce, prepare Derivative Works of,
+       publicly display, publicly perform, sublicense, and distribute the
+       Work and such Derivative Works in Source or Object form.
+
+    3. Grant of Patent License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       (except as stated in this section) patent license to make, have made,
+       use, offer to sell, sell, import, and otherwise transfer the Work,
+       where such license applies only to those patent claims licensable
+       by such Contributor that are necessarily infringed by their
+       Contribution(s) alone or by combination of their Contribution(s)
+       with the Work to which such Contribution(s) was submitted. If You
+       institute patent litigation against any entity (including a
+       cross-claim or counterclaim in a lawsuit) alleging that the Work
+       or a Contribution incorporated within the Work constitutes direct
+       or contributory patent infringement, then any patent licenses
+       granted to You under this License for that Work shall terminate
+       as of the date such litigation is filed.
+
+    4. Redistribution. You may reproduce and distribute copies of the
+       Work or Derivative Works thereof in any medium, with or without
+       modifications, and in Source or Object form, provided that You
+       meet the following conditions:
+
+       (a) You must give any other recipients of the Work or
+           Derivative Works a copy of this License; and
+
+       (b) You must cause any modified files to carry prominent notices
+           stating that You changed the files; and
+
+       (c) You must retain, in the Source form of any Derivative Works
+           that You distribute, all copyright, patent, trademark, and
+           attribution notices from the Source form of the Work,
+           excluding those notices that do not pertain to any part of
+           the Derivative Works; and
+
+       (d) If the Work includes a "NOTICE" text file as part of its
+           distribution, then any Derivative Works that You distribute must
+           include a readable copy of the attribution notices contained
+           within such NOTICE file, excluding those notices that do not
+           pertain to any part of the Derivative Works, in at least one
+           of the following places: within a NOTICE text file distributed
+           as part of the Derivative Works; within the Source form or
+           documentation, if provided along with the Derivative Works; or,
+           within a display generated by the Derivative Works, if and
+           wherever such third-party notices normally appear. The contents
+           of the NOTICE file are for informational purposes only and
+           do not modify the License. You may add Your own attribution
+           notices within Derivative Works that You distribute, alongside
+           or as an addendum to the NOTICE text from the Work, provided
+           that such additional attribution notices cannot be construed
+           as modifying the License.
+
+       You may add Your own copyright statement to Your modifications and
+       may provide additional or different license terms and conditions
+       for use, reproduction, or distribution of Your modifications, or
+       for any such Derivative Works as a whole, provided Your use,
+       reproduction, and distribution of the Work otherwise complies with
+       the conditions stated in this License.
+
+    5. Submission of Contributions. Unless You explicitly state otherwise,
+       any Contribution intentionally submitted for inclusion in the Work
+       by You to the Licensor shall be under the terms and conditions of
+       this License, without any additional terms or conditions.
+       Notwithstanding the above, nothing herein shall supersede or modify
+       the terms of any separate license agreement you may have executed
+       with Licensor regarding such Contributions.
+
+    6. Trademarks. This License does not grant permission to use the trade
+       names, trademarks, service marks, or product names of the Licensor,
+       except as required for reasonable and customary use in describing the
+       origin of the Work and reproducing the content of the NOTICE file.
+
+    7. Disclaimer of Warranty. Unless required by applicable law or
+       agreed to in writing, Licensor provides the Work (and each
+       Contributor provides its Contributions) on an "AS IS" BASIS,
+       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+       implied, including, without limitation, any warranties or conditions
+       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+       PARTICULAR PURPOSE. You are solely responsible for determining the
+       appropriateness of using or redistributing the Work and assume any
+       risks associated with Your exercise of permissions under this License.
+
+    8. Limitation of Liability. In no event and under no legal theory,
+       whether in tort (including negligence), contract, or otherwise,
+       unless required by applicable law (such as deliberate and grossly
+       negligent acts) or agreed to in writing, shall any Contributor be
+       liable to You for damages, including any direct, indirect, special,
+       incidental, or consequential damages of any character arising as a
+       result of this License or out of the use or inability to use the
+       Work (including but not limited to damages for loss of goodwill,
+       work stoppage, computer failure or malfunction, or any and all
+       other commercial damages or losses), even if such Contributor
+       has been advised of the possibility of such damages.
+
+    9. Accepting Warranty or Additional Liability. While redistributing
+       the Work or Derivative Works thereof, You may choose to offer,
+       and charge a fee for, acceptance of support, warranty, indemnity,
+       or other liability obligations and/or rights consistent with this
+       License. However, in accepting such obligations, You may act only
+       on Your own behalf and on Your sole responsibility, not on behalf
+       of any other Contributor, and only if You agree to indemnify,
+       defend, and hold each Contributor harmless for any liability
+       incurred by, or claims asserted against, such Contributor by reason
+       of your accepting any such warranty or additional liability.
+
+    END OF TERMS AND CONDITIONS
+
+    APPENDIX: How to apply the Apache License to your work.
+
+       To apply the Apache License to your work, attach the following
+       boilerplate notice, with the fields enclosed by brackets "[]"
+       replaced with your own identifying information. (Don't include
+       the brackets!) The text should be enclosed in the appropriate
+       comment syntax for the file format. We also recommend that a
+       file or class name and description of purpose be included on the
+       same "printed page" as the copyright notice for easier
+       identification within third-party archives.
+
+    Copyright [yyyy] [name of copyright owner]
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
Makefile ADDED
@@ -0,0 +1,8 @@
+ style:
+ 	python -m black --line-length 119 --target-version py39 .
+ 	python -m isort .
+
+ quality:
+ 	python -m black --check --line-length 119 --target-version py39 .
+ 	python -m isort --check-only .
+ 	python -m flake8 --max-line-length 119
README.md ADDED
@@ -0,0 +1,114 @@
+ ---
+ title: Model Evaluator
+ emoji: 📊
+ colorFrom: red
+ colorTo: red
+ sdk: streamlit
+ sdk_version: 1.10.0
+ app_file: app.py
+ duplicated_from: autoevaluate/model-evaluator
+ ---
+
+ # Model Evaluator
+
+ > Submit evaluation jobs to AutoTrain from the Hugging Face Hub
+
+ ## Supported tasks
+
+ The table below shows which tasks are currently supported for evaluation in the AutoTrain backend:
+
+ | Task                               | Supported |
+ |:-----------------------------------|:---------:|
+ | `binary_classification`            | ✅ |
+ | `multi_class_classification`       | ✅ |
+ | `multi_label_classification`       | ❌ |
+ | `entity_extraction`                | ✅ |
+ | `extractive_question_answering`    | ✅ |
+ | `translation`                      | ✅ |
+ | `summarization`                    | ✅ |
+ | `image_binary_classification`      | ✅ |
+ | `image_multi_class_classification` | ✅ |
+ | `text_zero_shot_classification`    | ✅ |
+
+
+ ## Installation
+
+ To run the application locally, first clone this repository and install the dependencies as follows:
+
+ ```
+ pip install -r requirements.txt
+ ```
+
+ Next, copy the example file of environment variables:
+
+ ```
+ cp .env.template .env
+ ```
+
+ and set the `HF_TOKEN` variable with a valid API token from the [`autoevaluator`](https://huggingface.co/autoevaluator) bot user. Finally, spin up the application by running:
+
+ ```
+ streamlit run app.py
+ ```
+
+ ## Usage
+
+ Evaluation on the Hub involves two main steps:
+
+ 1. Submitting an evaluation job via the UI. This creates an AutoTrain project with `N` models for evaluation. At this stage, the dataset is also processed and prepared for evaluation.
+ 2. Triggering the evaluation itself once the dataset is processed.
+
+ From the user perspective, only step (1) is needed since step (2) is handled by a cron job on GitHub Actions that executes the `run_evaluation_jobs.py` script every 15 minutes.
+
+ See below for details on manually triggering evaluation jobs.
+
+ ### Triggering an evaluation
+
+ To evaluate the models in an AutoTrain project, run:
+
+ ```
+ python run_evaluation_jobs.py
+ ```
+
+ This will download the [`autoevaluate/evaluation-job-logs`](https://huggingface.co/datasets/autoevaluate/evaluation-job-logs) dataset from the Hub and check which evaluation projects are ready for evaluation (i.e. those whose dataset has been processed).
+
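+ Under the hood, a project counts as ready once AutoTrain reports that its data processing has finished. A minimal sketch of the readiness check (mirroring the logic in `run_evaluation_jobs.py`, where status code `3` means data processing is complete; `http_get` and `http_post` are the helpers from `utils.py`):
+
+ ```python
+ project_info = http_get(
+     path=f"/projects/{project_id}", token=HF_TOKEN, domain=AUTOTRAIN_BACKEND_API
+ ).json()
+ # Only approve projects whose data is processed and that haven't been evaluated yet
+ if project_info["status"] == 3 and project_info["training_status"] == "not_started":
+     http_post(
+         path=f"/projects/{project_id}/start_training", token=HF_TOKEN, domain=AUTOTRAIN_BACKEND_API
+     )
+ ```
+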
+ ## AutoTrain configuration details
+
+ Models are evaluated by the [`autoevaluator`](https://huggingface.co/autoevaluator) bot user in AutoTrain, with the payload sent to the endpoint defined by the `AUTOTRAIN_BACKEND_API` environment variable. Evaluation projects are created and run on either the `prod` or `staging` environments. You can view the status of projects in the AutoTrain UI by navigating to one of the links below (ask internally for access to the staging UI):
+
+ | AutoTrain environment | AutoTrain UI URL | `AUTOTRAIN_BACKEND_API` |
+ |:---------------------:|:----------------:|:-----------------------:|
+ | `prod`    | [`https://ui.autotrain.huggingface.co/projects`](https://ui.autotrain.huggingface.co/projects) | https://api.autotrain.huggingface.co |
+ | `staging` | [`https://ui-staging.autotrain.huggingface.co/projects`](https://ui-staging.autotrain.huggingface.co/projects) | https://api-staging.autotrain.huggingface.co |
+
+
+ The current configuration for evaluation jobs running on [Spaces](https://huggingface.co/spaces/autoevaluate/model-evaluator) is:
+
+ ```
+ AUTOTRAIN_BACKEND_API=https://api.autotrain.huggingface.co
+ ```
+
+ To evaluate models with a _local_ instance of AutoTrain, change the environment to:
+
+ ```
+ AUTOTRAIN_BACKEND_API=http://localhost:8000
+ ```
+
+ ### Migrating from staging to production (and vice versa)
+
+ In general, evaluation jobs should run in AutoTrain's `prod` environment, which is defined by the following environment variable:
+
+ ```
+ AUTOTRAIN_BACKEND_API=https://api.autotrain.huggingface.co
+ ```
+
+ However, there are times when it is necessary to run evaluation jobs in AutoTrain's `staging` environment (e.g. because a new evaluation pipeline is being deployed). In these cases the corresponding environment variable is:
+
+ ```
+ AUTOTRAIN_BACKEND_API=https://api-staging.autotrain.huggingface.co
+ ```
+
+ To migrate between these two environments, update the `AUTOTRAIN_BACKEND_API` variable in two places:
+
+ * In the [repo secrets](https://huggingface.co/spaces/autoevaluate/model-evaluator/settings) associated with the `model-evaluator` Space. This will ensure evaluation projects are created in the desired environment.
+ * In the [GitHub Actions secrets](https://github.com/huggingface/model-evaluator/settings/secrets/actions) associated with this repo. This will ensure that the correct evaluation jobs are approved and launched via the `run_evaluation_jobs.py` script.
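+
+ ### Dataset card metadata for 1-click evaluations
+
+ Datasets whose cards carry `train-eval-index` metadata can be evaluated without manually configuring the column mapping. The sketch below shows the rough shape of that metadata as written by `app.py`; the config, split, and column names are illustrative and should match your dataset:
+
+ ```yaml
+ train-eval-index:
+   - config: default
+     task: text-classification
+     task_id: binary_classification
+     splits:
+       eval_split: test
+     col_mapping:
+       sentence: text
+       label: target
+ ```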
app.py ADDED
@@ -0,0 +1,693 @@
+ import os
+ import time
+ from pathlib import Path
+
+ import pandas as pd
+ import streamlit as st
+ import yaml
+ from datasets import get_dataset_config_names
+ from dotenv import load_dotenv
+ from huggingface_hub import list_datasets
+
+ from evaluation import filter_evaluated_models
+ from utils import (
+     AUTOTRAIN_TASK_TO_HUB_TASK,
+     commit_evaluation_log,
+     create_autotrain_project_name,
+     format_col_mapping,
+     get_compatible_models,
+     get_config_metadata,
+     get_dataset_card_url,
+     get_key,
+     get_metadata,
+     http_get,
+     http_post,
+ )
+
+ if Path(".env").is_file():
+     load_dotenv(".env")
+
+ HF_TOKEN = os.getenv("HF_TOKEN")
+ AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME")
+ AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")
+ DATASETS_PREVIEW_API = os.getenv("DATASETS_PREVIEW_API")
+
+ # Put image tasks on top
+ TASK_TO_ID = {
+     "image_binary_classification": 17,
+     "image_multi_class_classification": 18,
+     "binary_classification": 1,
+     "multi_class_classification": 2,
+     "natural_language_inference": 22,
+     "entity_extraction": 4,
+     "extractive_question_answering": 5,
+     "translation": 6,
+     "summarization": 8,
+     "text_zero_shot_classification": 23,
+ }
+
+ TASK_TO_DEFAULT_METRICS = {
+     "binary_classification": ["f1", "precision", "recall", "auc", "accuracy"],
+     "multi_class_classification": [
+         "f1",
+         "precision",
+         "recall",
+         "accuracy",
+     ],
+     "natural_language_inference": ["f1", "precision", "recall", "auc", "accuracy"],
+     "entity_extraction": ["precision", "recall", "f1", "accuracy"],
+     "extractive_question_answering": ["f1", "exact_match"],
+     "translation": ["sacrebleu"],
+     "summarization": ["rouge1", "rouge2", "rougeL", "rougeLsum"],
+     "image_binary_classification": ["f1", "precision", "recall", "auc", "accuracy"],
+     "image_multi_class_classification": [
+         "f1",
+         "precision",
+         "recall",
+         "accuracy",
+     ],
+     "text_zero_shot_classification": ["accuracy", "loss"],
+ }
+
+ AUTOTRAIN_TASK_TO_LANG = {
+     "translation": "en2de",
+     "image_binary_classification": "unk",
+     "image_multi_class_classification": "unk",
+ }
+
+ AUTOTRAIN_MACHINE = {"text_zero_shot_classification": "r5.16x"}
+
+
+ SUPPORTED_TASKS = list(TASK_TO_ID.keys())
+
+ # Extracted from utils.get_supported_metrics
+ # Hardcoded for now due to speed / caching constraints
+ SUPPORTED_METRICS = [
+     "accuracy",
+     "bertscore",
+     "bleu",
+     "cer",
+     "chrf",
+     "code_eval",
+     "comet",
+     "competition_math",
+     "coval",
+     "cuad",
+     "exact_match",
+     "f1",
+     "frugalscore",
+     "google_bleu",
+     "mae",
+     "mahalanobis",
+     "matthews_correlation",
+     "mean_iou",
+     "meteor",
+     "mse",
+     "pearsonr",
+     "perplexity",
+     "precision",
+     "recall",
+     "roc_auc",
+     "rouge",
+     "sacrebleu",
+     "sari",
+     "seqeval",
+     "spearmanr",
+     "squad",
+     "squad_v2",
+     "ter",
+     "trec_eval",
+     "wer",
+     "wiki_split",
+     "xnli",
+     "angelina-wang/directional_bias_amplification",
+     "jordyvl/ece",
+     "lvwerra/ai4code",
+     "lvwerra/amex",
+ ]
+
+
+ #######
+ # APP #
+ #######
+ st.title("Evaluation on the Hub")
+ st.markdown(
+     """
+     Welcome to Hugging Face's automatic model evaluator 👋!
+
+     This application allows you to evaluate 🤗 Transformers
+     [models](https://huggingface.co/models?library=transformers&sort=downloads)
+     across a wide variety of [datasets](https://huggingface.co/datasets) on the
+     Hub. Please select the dataset and configuration below. The results of your
+     evaluation will be displayed on the [public
+     leaderboards](https://huggingface.co/spaces/autoevaluate/leaderboards). For
+     more details, check out our [blog
+     post](https://huggingface.co/blog/eval-on-the-hub).
+     """
+ )
+
+ all_datasets = [d.id for d in list_datasets()]
+ query_params = st.experimental_get_query_params()
+ if "first_query_params" not in st.session_state:
+     st.session_state.first_query_params = query_params
+ first_query_params = st.session_state.first_query_params
+ default_dataset = all_datasets[0]
+ if "dataset" in first_query_params:
+     if len(first_query_params["dataset"]) > 0 and first_query_params["dataset"][0] in all_datasets:
+         default_dataset = first_query_params["dataset"][0]
+
+ selected_dataset = st.selectbox(
+     "Select a dataset",
+     all_datasets,
+     index=all_datasets.index(default_dataset),
+     help="""Datasets with metadata can be evaluated with 1-click. Configure an evaluation job to add \
+     new metadata to a dataset card.""",
+ )
+ st.experimental_set_query_params(**{"dataset": [selected_dataset]})
+
+ # Check if selected dataset can be streamed
+ is_valid_dataset = http_get(
+     path="/is-valid",
+     domain=DATASETS_PREVIEW_API,
+     params={"dataset": selected_dataset},
+ ).json()
+ if is_valid_dataset["valid"] is False:
+     st.error(
+         """The dataset you selected is not currently supported. Open a \
+         [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) for support."""
+     )
+
+ metadata = get_metadata(selected_dataset, token=HF_TOKEN)
+ print(f"INFO -- Dataset metadata: {metadata}")
+ if metadata is None:
+     st.warning("No evaluation metadata found. Please configure the evaluation job below.")
+
+ with st.expander("Advanced configuration"):
+     # Select task
+     selected_task = st.selectbox(
+         "Select a task",
+         SUPPORTED_TASKS,
+         index=SUPPORTED_TASKS.index(metadata[0]["task_id"]) if metadata is not None else 0,
+         help="""Don't see your favourite task here? Open a \
+         [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) to request it!""",
+     )
+     # Select config
+     configs = get_dataset_config_names(selected_dataset)
+     selected_config = st.selectbox(
+         "Select a config",
+         configs,
+         help="""Some datasets contain several sub-datasets, known as _configurations_. \
+         Select one to evaluate your models on. \
+         See the [docs](https://huggingface.co/docs/datasets/master/en/load_hub#configurations) for more details.
+         """,
+     )
+     # Some datasets have multiple metadata (one per config), so we grab the one associated with the selected config
+     config_metadata = get_config_metadata(selected_config, metadata)
+     print(f"INFO -- Config metadata: {config_metadata}")
+
+     # Select splits
+     splits_resp = http_get(
+         path="/splits",
+         domain=DATASETS_PREVIEW_API,
+         params={"dataset": selected_dataset},
+     )
+     if splits_resp.status_code == 200:
+         split_names = []
+         all_splits = splits_resp.json()
+         for split in all_splits["splits"]:
+             if split["config"] == selected_config:
+                 split_names.append(split["split"])
+
+         if config_metadata is not None:
+             eval_split = config_metadata["splits"].get("eval_split", None)
+         else:
+             eval_split = None
+         selected_split = st.selectbox(
+             "Select a split",
+             split_names,
+             index=split_names.index(eval_split) if eval_split is not None else 0,
+             help="Be wary when evaluating models on the `train` split.",
+         )
+
+     # Select columns
+     rows_resp = http_get(
+         path="/first-rows",
+         domain=DATASETS_PREVIEW_API,
+         params={
+             "dataset": selected_dataset,
+             "config": selected_config,
+             "split": selected_split,
+         },
+     ).json()
+     col_names = list(pd.json_normalize(rows_resp["rows"][0]["row"]).columns)
+
+     st.markdown("**Map your dataset columns**")
+     st.markdown(
+         """The model evaluator uses a standardised set of column names for the input examples and labels. \
+         Please define the mapping between your dataset columns (right) and the standardised column names (left)."""
+     )
+     col1, col2 = st.columns(2)
+
+     # TODO: find a better way to layout these items
+     # TODO: need graceful way of handling dataset <--> task mismatch for datasets with metadata
+     col_mapping = {}
+     if selected_task in ["binary_classification", "multi_class_classification"]:
+         with col1:
+             st.markdown("`text` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`target` column")
+         with col2:
+             text_col = st.selectbox(
+                 "This column should contain the text to be classified",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             target_col = st.selectbox(
+                 "This column should contain the labels associated with the text",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             col_mapping[text_col] = "text"
+             col_mapping[target_col] = "target"
+
+     elif selected_task == "text_zero_shot_classification":
+         with col1:
+             st.markdown("`text` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`classes` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`target` column")
+         with col2:
+             text_col = st.selectbox(
+                 "This column should contain the text to be classified",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             classes_col = st.selectbox(
+                 "This column should contain the classes associated with the text",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "classes"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             target_col = st.selectbox(
+                 "This column should contain the index of the correct class",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             col_mapping[text_col] = "text"
+             col_mapping[classes_col] = "classes"
+             col_mapping[target_col] = "target"
+
+     if selected_task in ["natural_language_inference"]:
+         config_metadata = get_config_metadata(selected_config, metadata)
+         with col1:
+             st.markdown("`text1` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`text2` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`target` column")
+         with col2:
+             text1_col = st.selectbox(
+                 "This column should contain the first text passage to be classified",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "text1"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             text2_col = st.selectbox(
+                 "This column should contain the second text passage to be classified",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "text2"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             target_col = st.selectbox(
+                 "This column should contain the labels associated with the text",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             col_mapping[text1_col] = "text1"
+             col_mapping[text2_col] = "text2"
+             col_mapping[target_col] = "target"
+
+     elif selected_task == "entity_extraction":
+         with col1:
+             st.markdown("`tokens` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`tags` column")
+         with col2:
+             tokens_col = st.selectbox(
+                 "This column should contain the array of tokens to be classified",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "tokens"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             tags_col = st.selectbox(
+                 "This column should contain the labels associated with each part of the text",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "tags"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             col_mapping[tokens_col] = "tokens"
+             col_mapping[tags_col] = "tags"
+
+     elif selected_task == "translation":
+         with col1:
+             st.markdown("`source` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`target` column")
+         with col2:
+             text_col = st.selectbox(
+                 "This column should contain the text to be translated",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "source"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             target_col = st.selectbox(
+                 "This column should contain the target translation",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             col_mapping[text_col] = "source"
+             col_mapping[target_col] = "target"
+
+     elif selected_task == "summarization":
+         with col1:
+             st.markdown("`text` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`target` column")
+         with col2:
+             text_col = st.selectbox(
+                 "This column should contain the text to be summarized",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             target_col = st.selectbox(
+                 "This column should contain the target summary",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             col_mapping[text_col] = "text"
+             col_mapping[target_col] = "target"
+
+     elif selected_task == "extractive_question_answering":
+         if config_metadata is not None:
+             col_mapping = config_metadata["col_mapping"]
+             # Hub YAML parser converts periods to hyphens, so we remap them here
+             col_mapping = format_col_mapping(col_mapping)
+         with col1:
+             st.markdown("`context` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`question` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`answers.text` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`answers.answer_start` column")
+         with col2:
+             context_col = st.selectbox(
+                 "This column should contain the question's context",
+                 col_names,
+                 index=col_names.index(get_key(col_mapping, "context")) if config_metadata is not None else 0,
+             )
+             question_col = st.selectbox(
+                 "This column should contain the question to be answered, given the context",
+                 col_names,
+                 index=col_names.index(get_key(col_mapping, "question")) if config_metadata is not None else 0,
+             )
+             answers_text_col = st.selectbox(
+                 "This column should contain example answers to the question, extracted from the context",
+                 col_names,
+                 index=col_names.index(get_key(col_mapping, "answers.text")) if config_metadata is not None else 0,
+             )
+             answers_start_col = st.selectbox(
+                 "This column should contain the indices in the context of the first character of each `answers.text`",
+                 col_names,
+                 index=col_names.index(get_key(col_mapping, "answers.answer_start"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             col_mapping[context_col] = "context"
+             col_mapping[question_col] = "question"
+             col_mapping[answers_text_col] = "answers.text"
+             col_mapping[answers_start_col] = "answers.answer_start"
+     elif selected_task in ["image_binary_classification", "image_multi_class_classification"]:
+         with col1:
+             st.markdown("`image` column")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.text("")
+             st.markdown("`target` column")
+         with col2:
+             image_col = st.selectbox(
+                 "This column should contain the images to be classified",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "image"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             target_col = st.selectbox(
+                 "This column should contain the labels associated with the images",
+                 col_names,
+                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
+                 if config_metadata is not None
+                 else 0,
+             )
+             col_mapping[image_col] = "image"
+             col_mapping[target_col] = "target"
+
+     # Select metrics
+     st.markdown("**Select metrics**")
+     st.markdown("The following metrics will be computed")
+     html_string = " ".join(
+         [
+             '<div style="padding-right:5px;padding-left:5px;padding-top:5px;padding-bottom:5px;float:left">'
+             + '<div style="background-color:#D3D3D3;border-radius:5px;display:inline-block;padding-right:5px;'
+             + 'padding-left:5px;color:white">'
+             + metric
+             + "</div></div>"
+             for metric in TASK_TO_DEFAULT_METRICS[selected_task]
+         ]
+     )
+     st.markdown(html_string, unsafe_allow_html=True)
+     selected_metrics = st.multiselect(
+         "(Optional) Select additional metrics",
+         sorted(list(set(SUPPORTED_METRICS) - set(TASK_TO_DEFAULT_METRICS[selected_task]))),
+         help="""User-selected metrics will be computed with their default arguments. \
+         For example, `f1` will report results for binary labels. \
+         Check out the [available metrics](https://huggingface.co/metrics) for more details.""",
+     )
+
+ with st.form(key="form"):
+     compatible_models = get_compatible_models(selected_task, [selected_dataset])
+     selected_models = st.multiselect(
+         "Select the models you wish to evaluate",
+         compatible_models,
+         help="""Don't see your favourite model in this list? Add the dataset and task it was trained on to the \
+         [model card metadata.](https://huggingface.co/docs/hub/models-cards#model-card-metadata)""",
+     )
+     print("INFO -- Selected models before filter:", selected_models)
+
+     hf_username = st.text_input("Enter your 🤗 Hub username to be notified when the evaluation is finished")
+
+     submit_button = st.form_submit_button("Evaluate models 🚀")
+
+     if submit_button:
+         if len(hf_username) == 0:
+             st.warning("No 🤗 Hub username provided! Please enter your username and try again.")
+         elif len(selected_models) == 0:
+             st.warning("⚠️ No models were selected for evaluation! Please select at least one model and try again.")
+         elif len(selected_models) > 10:
+             st.warning("Only 10 models can be evaluated at once. Please select fewer models and try again.")
+         else:
+             # Filter out previously evaluated models
+             selected_models = filter_evaluated_models(
+                 selected_models,
+                 selected_task,
+                 selected_dataset,
+                 selected_config,
+                 selected_split,
+                 selected_metrics,
+             )
+             print("INFO -- Selected models after filter:", selected_models)
+             if len(selected_models) > 0:
+                 project_payload = {
+                     "username": AUTOTRAIN_USERNAME,
+                     "proj_name": create_autotrain_project_name(selected_dataset, selected_config),
+                     "task": TASK_TO_ID[selected_task],
+                     "config": {
+                         "language": AUTOTRAIN_TASK_TO_LANG[selected_task]
+                         if selected_task in AUTOTRAIN_TASK_TO_LANG
+                         else "en",
+                         "max_models": 5,
+                         "instance": {
+                             "provider": "sagemaker" if selected_task in AUTOTRAIN_MACHINE.keys() else "ovh",
+                             "instance_type": AUTOTRAIN_MACHINE[selected_task]
+                             if selected_task in AUTOTRAIN_MACHINE.keys()
+                             else "p3",
+                             "max_runtime_seconds": 172800,
+                             "num_instances": 1,
+                             "disk_size_gb": 200,
+                         },
+                         "evaluation": {
+                             "metrics": selected_metrics,
+                             "models": selected_models,
+                             "hf_username": hf_username,
+                         },
+                     },
+                 }
+                 print(f"INFO -- Payload: {project_payload}")
+                 project_json_resp = http_post(
+                     path="/projects/create",
+                     payload=project_payload,
+                     token=HF_TOKEN,
+                     domain=AUTOTRAIN_BACKEND_API,
+                 ).json()
+                 print(f"INFO -- Project creation response: {project_json_resp}")
+
+                 if project_json_resp["created"]:
+                     data_payload = {
+                         "split": 4,  # use "auto" split choice in AutoTrain
+                         "col_mapping": col_mapping,
+                         "load_config": {"max_size_bytes": 0, "shuffle": False},
+                         "dataset_id": selected_dataset,
+                         "dataset_config": selected_config,
+                         "dataset_split": selected_split,
+                     }
+                     data_json_resp = http_post(
+                         path=f"/projects/{project_json_resp['id']}/data/dataset",
+                         payload=data_payload,
+                         token=HF_TOKEN,
+                         domain=AUTOTRAIN_BACKEND_API,
+                     ).json()
+                     print(f"INFO -- Dataset creation response: {data_json_resp}")
+                     if data_json_resp["download_status"] == 1:
+                         train_json_resp = http_post(
+                             path=f"/projects/{project_json_resp['id']}/data/start_processing",
+                             token=HF_TOKEN,
+                             domain=AUTOTRAIN_BACKEND_API,
+                         ).json()
+                         # For local development we process and approve projects on-the-fly
+                         if "localhost" in AUTOTRAIN_BACKEND_API:
+                             with st.spinner("⏳ Waiting for data processing to complete ..."):
+                                 is_data_processing_success = False
+                                 while is_data_processing_success is not True:
+                                     project_status = http_get(
+                                         path=f"/projects/{project_json_resp['id']}",
+                                         token=HF_TOKEN,
+                                         domain=AUTOTRAIN_BACKEND_API,
+                                     ).json()
+                                     if project_status["status"] == 3:
+                                         is_data_processing_success = True
+                                     time.sleep(10)
+
+                                 # Approve training job
+                                 train_job_resp = http_post(
+                                     path=f"/projects/{project_json_resp['id']}/start_training",
+                                     token=HF_TOKEN,
+                                     domain=AUTOTRAIN_BACKEND_API,
+                                 ).json()
+                                 st.success("✅ Data processing and project approval complete - go forth and evaluate!")
+                         else:
+                             # Prod/staging submissions are evaluated in a cron job via run_evaluation_jobs.py
+                             print(f"INFO -- AutoTrain job response: {train_json_resp}")
+                             if train_json_resp["success"]:
+                                 train_eval_index = {
+                                     "train-eval-index": [
+                                         {
+                                             "config": selected_config,
+                                             "task": AUTOTRAIN_TASK_TO_HUB_TASK[selected_task],
+                                             "task_id": selected_task,
+                                             "splits": {"eval_split": selected_split},
+                                             "col_mapping": col_mapping,
+                                         }
+                                     ]
+                                 }
+                                 selected_metadata = yaml.dump(train_eval_index, sort_keys=False)
+                                 dataset_card_url = get_dataset_card_url(selected_dataset)
+                                 st.success("✅ Successfully submitted evaluation job!")
+                                 st.markdown(
+                                     f"""
+                                     Evaluation can take up to 1 hour to complete, so grab a ☕️ or 🍵 while you wait:
+
+                                     * 🔔 A [Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions) with the evaluation results will be opened for each model you selected. Check your email for notifications.
+                                     * 📊 Click [here](https://hf.co/spaces/autoevaluate/leaderboards?dataset={selected_dataset}) to view the results from your submission once the Hub pull request is merged.
+                                     * 🥱 Tired of configuring evaluations? Add the following metadata to the [dataset card]({dataset_card_url}) to enable 1-click evaluations:
+                                     """  # noqa
+                                 )
+                                 st.markdown(
+                                     f"""
+                                     ```yaml
+                                     {selected_metadata}
+                                     """
+                                 )
+                                 print("INFO -- Pushing evaluation job logs to the Hub")
+                                 evaluation_log = {}
+                                 evaluation_log["project_id"] = project_json_resp["id"]
+                                 evaluation_log["autotrain_env"] = (
+                                     "staging" if "staging" in AUTOTRAIN_BACKEND_API else "prod"
+                                 )
+                                 evaluation_log["payload"] = project_payload
+                                 evaluation_log["project_creation_response"] = project_json_resp
+                                 evaluation_log["dataset_creation_response"] = data_json_resp
+                                 evaluation_log["autotrain_job_response"] = train_json_resp
+                                 commit_evaluation_log(evaluation_log, hf_access_token=HF_TOKEN)
+                             else:
+                                 st.error("🙈 Oh no, there was an error submitting your evaluation job!")
+             else:
+                 st.warning("⚠️ No models left to evaluate! Please select other models and try again.")
evaluation.py ADDED
@@ -0,0 +1,57 @@
+ import copy
+ from dataclasses import dataclass
+
+ import streamlit as st
+ from huggingface_hub import DatasetFilter, HfApi
+ from huggingface_hub.hf_api import DatasetInfo
+
+
+ @dataclass(frozen=True, eq=True)
+ class EvaluationInfo:
+     task: str
+     model: str
+     dataset_name: str
+     dataset_config: str
+     dataset_split: str
+     metrics: set
+
+
+ def create_evaluation_info(dataset_info: DatasetInfo) -> EvaluationInfo:
+     if dataset_info.cardData is not None:
+         metadata = dataset_info.cardData["eval_info"]
+         metadata.pop("col_mapping", None)
+         # TODO(lewtun): populate dataset cards with metric info
+         if "metrics" not in metadata:
+             metadata["metrics"] = frozenset()
+         else:
+             metadata["metrics"] = frozenset(metadata["metrics"])
+         return EvaluationInfo(**metadata)
+
+
+ def get_evaluation_infos():
+     filt = DatasetFilter(author="autoevaluate")
+     evaluation_datasets = HfApi().list_datasets(filter=filt, full=True)
+     return [create_evaluation_info(dset) for dset in evaluation_datasets]
+
+
+ def filter_evaluated_models(models, task, dataset_name, dataset_config, dataset_split, metrics):
+     evaluation_infos = get_evaluation_infos()
+     models_to_filter = copy.copy(models)
+
+     for model in models_to_filter:
+         evaluation_info = EvaluationInfo(
+             task=task,
+             model=model,
+             dataset_name=dataset_name,
+             dataset_config=dataset_config,
+             dataset_split=dataset_split,
+             metrics=frozenset(metrics),
+         )
+         if evaluation_info in evaluation_infos:
+             st.info(
+                 f"Model [`{model}`](https://huggingface.co/{model}) has already been evaluated on this configuration. \
+                 This model will be excluded from the evaluation job..."
+             )
+             models.remove(model)
+
+     return models
images/autotrain_job.png ADDED
images/autotrain_projects.png ADDED
notebooks/flush-prediction-repos.ipynb ADDED
@@ -0,0 +1,177 @@
+ {
+  "cells": [
+   {
+    "cell_type": "markdown",
+    "id": "c8093b9e-ca6a-423d-96c3-5fe21f7109a1",
+    "metadata": {},
+    "source": [
+     "## Imports"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "id": "efe8cda7-a687-4867-b1f0-8efbcd428681",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import os\n",
+     "from pathlib import Path\n",
+     "\n",
+     "from dotenv import load_dotenv\n",
+     "from huggingface_hub import DatasetFilter, delete_repo, list_datasets\n",
+     "from tqdm.auto import tqdm\n",
+     "\n",
+     "if Path(\".env\").is_file():\n",
+     "    load_dotenv(\".env\")\n",
+     "\n",
+     "HF_TOKEN = os.getenv(\"HF_TOKEN\")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "8f6e01f0-b658-451f-999c-e08d9f4bbbd3",
+    "metadata": {},
+    "source": [
+     "## Get all prediction repos from autoevaluate org"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "id": "2e369478-66d3-498d-a8fd-95bc9180f362",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def get_prediction_repos():\n",
+     "    all_repos = list_datasets(author=\"autoevaluate\")\n",
+     "    prediction_repos = [\n",
+     "        repo for repo in all_repos if repo.id.split(\"/\")[1].startswith(\"autoeval-\")\n",
+     "    ]\n",
+     "    return prediction_repos"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 3,
+    "id": "542db019-d01f-42f5-bef4-888dae8eeadb",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "66"
+       ]
+      },
+      "execution_count": 3,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "prediction_repos = get_prediction_repos()\n",
+     "len(prediction_repos)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 4,
+    "id": "331cfabf-4b73-490f-8d6a-86b5bc162666",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "DatasetInfo: {\n",
+        "\tid: autoevaluate/autoeval-staging-eval-project-9dcc51b5-6464670\n",
+        "\tsha: d3bb02be592d167f7a217ac9341d187142d9a90a\n",
+        "\tlastModified: 2022-06-13T14:54:34.000Z\n",
+        "\ttags: ['type:predictions', 'tags:autotrain', 'tags:evaluation', 'datasets:glue']\n",
+        "\tprivate: False\n",
+        "\tauthor: autoevaluate\n",
+        "\tdescription: None\n",
+        "\tcitation: None\n",
+        "\tcardData: None\n",
+        "\tsiblings: None\n",
+        "\tgated: False\n",
+        "\tdownloads: 12\n",
+        "}"
+       ]
+      },
+      "execution_count": 4,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "prediction_repos[0]"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "57a86b69-ffe8-4035-8f3d-5c917d8ce7bf",
+    "metadata": {},
+    "source": [
+     "## Delete all prediction repos"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 5,
+    "id": "6c8e23e7-2a6d-437b-9742-17f37684d9eb",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "06fa304dcc6d44e39205b20a5e488052",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "  0%|          | 0/66 [00:00<?, ?it/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     }
+    ],
+    "source": [
+     "for repo in tqdm(prediction_repos):\n",
+     "    delete_repo(\n",
+     "        repo_id=repo.id,\n",
+     "        repo_type=\"dataset\",\n",
+     "    )"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "7d64b0aa-d05f-4497-9bd2-eb2fc0d8bd7a",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "autoevaluate",
+    "language": "python",
+    "name": "autoevaluate"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.8.13"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
pyproject.toml ADDED
@@ -0,0 +1,2 @@
+ [tool.isort]
+ profile = "black"
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ huggingface-hub<0.8
2
+ python-dotenv
3
+ streamlit==1.10.0
4
+ datasets<2.3
5
+ evaluate<0.2
6
+ jsonlines
7
+ typer
8
+ # Dataset specific deps
9
+ py7zr<0.19
10
+ openpyxl<3.1
11
+ # Pin protobuf: newer releases break downstream packages
12
+ protobuf<=3.20.1
run_evaluation_jobs.py ADDED
@@ -0,0 +1,64 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ import typer
5
+ from datasets import load_dataset
6
+ from dotenv import load_dotenv
7
+
8
+ from utils import http_get, http_post
9
+
10
+ if Path(".env").is_file():
11
+ load_dotenv(".env")
12
+
13
+ HF_TOKEN = os.getenv("HF_TOKEN")
14
+ AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME")
15
+ AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")
16
+
17
+ if "staging" in AUTOTRAIN_BACKEND_API:
18
+ AUTOTRAIN_ENV = "staging"
19
+ else:
20
+ AUTOTRAIN_ENV = "prod"
21
+
22
+
23
+ def main():
24
+ print(f"💡 Starting jobs on {AUTOTRAIN_ENV} environment")
25
+ logs_df = load_dataset("autoevaluate/evaluation-job-logs", use_auth_token=HF_TOKEN, split="train").to_pandas()
26
+ # Filter out legacy AutoTrain submissions from before the project-approval requirement
27
+ projects_df = logs_df.copy()[(~logs_df["project_id"].isnull())]
28
+ # Filter IDs for appropriate AutoTrain env (staging vs prod)
29
+ projects_df = projects_df.copy().query(f"autotrain_env == '{AUTOTRAIN_ENV}'")
30
+ projects_to_approve = projects_df["project_id"].astype(int).tolist()
31
+ failed_approvals = []
32
+ print(f"🚀 Found {len(projects_to_approve)} evaluation projects to approve!")
33
+
34
+ for project_id in projects_to_approve:
35
+ print(f"Attempting to evaluate project ID {project_id} ...")
36
+ try:
37
+ project_info = http_get(
38
+ path=f"/projects/{project_id}",
39
+ token=HF_TOKEN,
40
+ domain=AUTOTRAIN_BACKEND_API,
41
+ ).json()
42
+ print(project_info)
43
+ # Only start evaluation for projects with completed data processing (status=3)
44
+ if project_info["status"] == 3 and project_info["training_status"] == "not_started":
45
+ train_job_resp = http_post(
46
+ path=f"/projects/{project_id}/start_training",
47
+ token=HF_TOKEN,
48
+ domain=AUTOTRAIN_BACKEND_API,
49
+ ).json()
50
+ print(f"🤖 Project {project_id} approval response: {train_job_resp}")
51
+ else:
52
+ print(f"💪 Project {project_id} either not ready or has already been evaluated. Skipping ...")
53
+ except Exception as e:
54
+ print(f"There was a problem obtaining the project info for project ID {project_id}")
55
+ print(f"Error message: {e}")
56
+ failed_approvals.append(project_id)
58
+
59
+ if len(failed_approvals) > 0:
60
+ print(f"🚨 Failed to approve {len(failed_approvals)} projects: {failed_approvals}")
61
+
62
+
63
+ if __name__ == "__main__":
64
+ typer.run(main)
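+ # Usage sketch (values are illustrative): the script is configured entirely via
+ # environment variables, so a manual run looks like
+ #   HF_TOKEN=hf_xxx AUTOTRAIN_USERNAME=autoevaluator \
+ #   AUTOTRAIN_BACKEND_API=https://api-staging.autotrain.huggingface.co \
+ #   python run_evaluation_jobs.py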
utils.py ADDED
@@ -0,0 +1,215 @@
1
+ import inspect
2
+ import uuid
3
+ from typing import Dict, List, Union
4
+
5
+ import jsonlines
6
+ import requests
7
+ import streamlit as st
8
+ from evaluate import load
9
+ from huggingface_hub import HfApi, ModelFilter, Repository, dataset_info, list_metrics
10
+ from tqdm import tqdm
11
+
12
+ AUTOTRAIN_TASK_TO_HUB_TASK = {
13
+ "binary_classification": "text-classification",
14
+ "multi_class_classification": "text-classification",
15
+ "natural_language_inference": "text-classification",
16
+ "entity_extraction": "token-classification",
17
+ "extractive_question_answering": "question-answering",
18
+ "translation": "translation",
19
+ "summarization": "summarization",
20
+ "image_binary_classification": "image-classification",
21
+ "image_multi_class_classification": "image-classification",
22
+ "text_zero_shot_classification": "text-generation",
23
+ }
24
+
25
+
26
+ HUB_TASK_TO_AUTOTRAIN_TASK = {v: k for k, v in AUTOTRAIN_TASK_TO_HUB_TASK.items()}
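+ # NB: the task mapping above is many-to-one, so this inversion keeps only the last
+ # AutoTrain task listed for each Hub task (e.g. "text-classification" maps back to
+ # "natural_language_inference").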
27
+ LOGS_REPO = "evaluation-job-logs"
28
+
29
+
30
+ def get_auth_headers(token: str, prefix: str = "Bearer"):
31
+ return {"Authorization": f"{prefix} {token}"}
32
+
33
+
34
+ def http_post(path: str, token: str, payload=None, domain: str = None, params=None) -> requests.Response:
35
+ """HTTP POST request to the AutoNLP API, raises UnreachableAPIError if the API cannot be reached"""
36
+ try:
37
+ response = requests.post(
38
+ url=domain + path,
39
+ json=payload,
40
+ headers=get_auth_headers(token=token),
41
+ allow_redirects=True,
42
+ params=params,
43
+ )
44
+ except requests.exceptions.ConnectionError:
45
+ print("❌ Failed to reach AutoNLP API, check your internet connection")
46
+ response.raise_for_status()
47
+ return response
48
+
49
+
50
+ def http_get(path: str, domain: str, token: str = None, params: dict = None) -> requests.Response:
51
+ """HTTP POST request to `path`, raises UnreachableAPIError if the API cannot be reached"""
52
+ try:
53
+ response = requests.get(
54
+ url=domain + path,
55
+ headers=get_auth_headers(token=token),
56
+ allow_redirects=True,
57
+ params=params,
58
+ )
59
+ except requests.exceptions.ConnectionError:
60
+ print(f"❌ Failed to reach {path}, check your internet connection")
61
+ response.raise_for_status()
62
+ return response
63
+
64
+
65
+ def get_metadata(dataset_name: str, token: str) -> Union[Dict, None]:
66
+ data = dataset_info(dataset_name, token=token)
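+ # "train-eval-index" entries in the dataset card describe per-config evaluation
+ # setups; get_config_metadata below selects one by its "config" key.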
67
+ if data.cardData is not None and "train-eval-index" in data.cardData.keys():
68
+ return data.cardData["train-eval-index"]
69
+ else:
70
+ return None
71
+
72
+
73
+ def get_compatible_models(task: str, dataset_ids: List[str]) -> List[str]:
74
+ """
75
+ Returns all model IDs that are compatible with the given task and dataset names.
76
+
77
+ Args:
78
+ task (`str`): The task to search for.
79
+ dataset_ids (`List[str]`): A list of dataset IDs to search for.
80
+
81
+ Returns:
82
+ A list of model IDs, sorted alphabetically.
83
+ """
84
+ compatible_models = []
85
+ # Allow any summarization model to be used for summarization tasks
86
+ # and allow any text-generation model to be used for text_zero_shot_classification
87
+ if task in ("summarization", "text_zero_shot_classification"):
88
+ model_filter = ModelFilter(
89
+ task=AUTOTRAIN_TASK_TO_HUB_TASK[task],
90
+ library=["transformers", "pytorch"],
91
+ )
92
+ compatible_models.extend(HfApi().list_models(filter=model_filter))
93
+ # Include models trained on SQuAD datasets, since these can be evaluated on
94
+ # other SQuAD-like datasets
95
+ if task == "extractive_question_answering":
96
+ dataset_ids.extend(["squad", "squad_v2"])
97
+
98
+ # TODO: relax filter on PyTorch models if TensorFlow supported in AutoTrain
99
+ for dataset_id in dataset_ids:
100
+ model_filter = ModelFilter(
101
+ task=AUTOTRAIN_TASK_TO_HUB_TASK[task],
102
+ trained_dataset=dataset_id,
103
+ library=["transformers", "pytorch"],
104
+ )
105
+ compatible_models.extend(HfApi().list_models(filter=model_filter))
106
+ return sorted(set([model.modelId for model in compatible_models]))
107
+
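+ # Usage sketch (hypothetical inputs): list text-classification models trained on imdb,
+ # e.g. get_compatible_models("binary_classification", ["imdb"]).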
108
+
109
+ def get_key(col_mapping, val):
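+ # reverse lookup: return the dataset column that was mapped to `val`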
110
+ for key, value in col_mapping.items():
111
+ if val == value:
112
+ return key
113
+
114
+ return "key doesn't exist"
115
+
116
+
117
+ def format_col_mapping(col_mapping: dict) -> dict:
118
+ for k, v in col_mapping["answers"].items():
119
+ col_mapping[f"answers.{k}"] = f"answers.{v}"
120
+ del col_mapping["answers"]
121
+ return col_mapping
122
+
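+ # Example: a SQuAD-style mapping {"answers": {"text": "text", "answer_start": "answer_start"}}
+ # is flattened to {"answers.text": "answers.text", "answers.answer_start": "answers.answer_start"}.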
123
+
124
+ def commit_evaluation_log(evaluation_log, hf_access_token=None):
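+ # Clone/pull the private logs repo, append the new entry to logs.jsonl, rewrite the
+ # file, and push the result back to the Hub.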
125
+ logs_repo_url = f"https://huggingface.co/datasets/autoevaluate/{LOGS_REPO}"
126
+ logs_repo = Repository(
127
+ local_dir=LOGS_REPO,
128
+ clone_from=logs_repo_url,
129
+ repo_type="dataset",
130
+ private=True,
131
+ use_auth_token=hf_access_token,
132
+ )
133
+ logs_repo.git_pull()
134
+ with jsonlines.open(f"{LOGS_REPO}/logs.jsonl") as r:
135
+ lines = []
136
+ for obj in r:
137
+ lines.append(obj)
138
+
139
+ lines.append(evaluation_log)
140
+ with jsonlines.open(f"{LOGS_REPO}/logs.jsonl", mode="w") as writer:
141
+ for job in lines:
142
+ writer.write(job)
143
+ logs_repo.push_to_hub(
144
+ commit_message=f"Evaluation submitted with project name {evaluation_log['payload']['proj_name']}"
145
+ )
146
+ print("INFO -- Pushed evaluation logs to the Hub")
147
+
148
+
149
+ @st.experimental_memo
150
+ def get_supported_metrics():
151
+ """Helper function to get all metrics compatible with evaluation service.
152
+
153
+ Requires all metric dependencies installed in the same environment, so wait until
154
+ https://github.com/huggingface/evaluate/issues/138 is resolved before using this.
155
+ """
156
+ metrics = [metric.id for metric in list_metrics()]
157
+ supported_metrics = []
158
+ for metric in tqdm(metrics):
159
+ # TODO: this currently requires all metric dependencies to be installed
160
+ # in the same environment. Refactor to avoid needing to actually load
161
+ # the metric.
162
+ try:
163
+ print(f"INFO -- Attempting to load metric: {metric}")
164
+ metric_func = load(metric)
165
+ except Exception as e:
166
+ print(e)
167
+ print("WARNING -- Skipping the following metric, which cannot load:", metric)
168
+ continue
169
+
170
+ argspec = inspect.getfullargspec(metric_func.compute)
171
+ if "references" in argspec.kwonlyargs and "predictions" in argspec.kwonlyargs:
172
+ # We require that "references" and "predictions" are arguments
173
+ # to the metric function. We also require that the other arguments
174
+ # besides "references" and "predictions" have defaults and so do not
175
+ # need to be specified explicitly.
176
+ defaults = True
177
+ # kwonlydefaults is None when no keyword-only argument has a default
+ for key, value in (argspec.kwonlydefaults or {}).items():
178
+ if key not in ("references", "predictions"):
179
+ if value is None:
180
+ defaults = False
181
+ break
182
+
183
+ if defaults:
184
+ supported_metrics.append(metric)
185
+ return supported_metrics
186
+
187
+
188
+ def get_dataset_card_url(dataset_id: str) -> str:
189
+ """Gets the URL to edit the dataset card for the given dataset ID."""
190
+ if "/" in dataset_id:
191
+ return f"https://huggingface.co/datasets/{dataset_id}/edit/main/README.md"
192
+ else:
193
+ return f"https://github.com/huggingface/datasets/edit/master/datasets/{dataset_id}/README.md"
194
+
195
+
196
+ def create_autotrain_project_name(dataset_id: str, dataset_config: str) -> str:
197
+ """Creates an AutoTrain project name for the given dataset ID."""
198
+ # Project names cannot have "/", so we need to format community datasets accordingly
199
+ dataset_id_formatted = dataset_id.replace("/", "__")
200
+ dataset_config_formatted = dataset_config.replace("--", "__")
201
+ # Project names need to be unique, so we append a random string to guarantee this while adhering to naming rules
202
+ basename = f"eval-{dataset_id_formatted}-{dataset_config_formatted}"
203
+ basename = basename[:60] if len(basename) > 60 else basename # Hub naming limitation
204
+ return f"{basename}-{str(uuid.uuid4())[:6]}"
205
+
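+ # Example (hypothetical inputs): create_autotrain_project_name("user/my-data", "my--config")
+ # -> "eval-user__my-data-my__config-1a2b3c", where the 6-char suffix is random.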
206
+
207
+ def get_config_metadata(config: str, metadata: List[Dict] = None) -> Union[Dict, None]:
208
+ """Gets the dataset card metadata for the given config."""
209
+ if metadata is None:
210
+ return None
211
+ config_metadata = [m for m in metadata if m["config"] == config]
212
+ if len(config_metadata) >= 1:
213
+ return config_metadata[0]
214
+ else:
215
+ return None