File size: 9,358 Bytes
803812e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 |
# coding=utf-8
# Copyright 2022-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains utilities to validate argument values in `huggingface_hub`."""
import inspect
import re
import warnings
from functools import wraps
from itertools import chain
from typing import Any, Dict
from ._typing import CallableT
# Syntactic check for a repo id: an optional "namespace/" prefix followed by a
# repo name of 1 to 96 characters taken from `\w`, "-" and ".".
# The `\b` anchors force both the namespace and the repo name to start and end
# with a word character, which rejects ids such as ".foo" or "foo-".
# The pattern is anchored with `\Z` rather than `$`: in Python, `$` also matches
# right before a trailing newline, which would let "foo\n" pass as a valid id.
# NOTE: for `str` patterns `\w` is Unicode-aware, so non-ASCII letters and digits
# are accepted by this regex as well.
REPO_ID_REGEX = re.compile(
    r"""
    ^
    (\b[\w\-.]+\b/)?  # optional namespace (username or organization)
    \b                # starts with a word boundary
    [\w\-.]{1,96}     # repo_name: alphanumeric + . _ -
    \b                # ends with a word boundary
    \Z                # strict end of string (no trailing newline allowed)
    """,
    flags=re.VERBOSE,
)
class HFValidationError(ValueError):
    """Error raised by `huggingface_hub` validators when an argument value is invalid.

    Subclasses [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError),
    so code that already catches `ValueError` also catches this exception.
    """
def validate_hf_hub_args(fn: CallableT) -> CallableT:
    """Decorator validating well-known arguments of a `huggingface_hub` public method.

    The decorator centralizes the validation of arguments that are reused all over
    the library. Every validator listed below is applied on each call.

    Validators:
        - [`~utils.validate_repo_id`]: values passed as `repo_id`, `from_id` or
          `to_id` must be `"repo_name"` or `"namespace/repo_name"`. Namespace is a
          username or an organization.
        - [`~utils.smoothly_deprecate_use_auth_token`]: `use_auth_token` is mapped
          to `token`. Only applied when the decorated function declares a `token`
          parameter and does not declare a `use_auth_token` parameter.

    Example:
    ```py
    >>> from huggingface_hub.utils import validate_hf_hub_args

    >>> @validate_hf_hub_args
    ... def my_cool_method(repo_id: str):
    ...     print(repo_id)

    >>> my_cool_method(repo_id="valid_repo_id")
    valid_repo_id

    >>> my_cool_method("other..repo..id")
    huggingface_hub.utils._validators.HFValidationError: Cannot have -- or .. in repo_id: 'other..repo..id'.

    >>> my_cool_method(repo_id="other..repo..id")
    huggingface_hub.utils._validators.HFValidationError: Cannot have -- or .. in repo_id: 'other..repo..id'.

    >>> @validate_hf_hub_args
    ... def my_cool_auth_method(token: str):
    ...     print(token)

    >>> my_cool_auth_method(token="a token")
    a token

    >>> my_cool_auth_method(use_auth_token="a use_auth_token")
    a use_auth_token

    >>> my_cool_auth_method(token="a token", use_auth_token="a use_auth_token")
    UserWarning: Both `token` and `use_auth_token` are passed (...)
    a token
    ```

    Raises:
        [`~utils.HFValidationError`]:
            If an input is not valid.
    """
    # TODO: add an argument to opt-out validation for specific argument?
    fn_signature = inspect.signature(fn)
    declared_params = fn_signature.parameters

    # Argument names whose value must be a valid repo id.
    repo_id_arg_names = ("repo_id", "from_id", "to_id")

    # `use_auth_token` is only rewritten into `token` when the decorated function
    # accepts `token` but not `use_auth_token`. In practice that is always the case
    # in `huggingface_hub`; a downstream library might still declare `use_auth_token`.
    check_use_auth_token = "token" in declared_params and "use_auth_token" not in declared_params

    @wraps(fn)
    def _inner_fn(*args, **kwargs):
        # Positional values are paired with parameter names in declaration order,
        # keyword values already carry their name.
        named_values = chain(zip(declared_params, args), kwargs.items())

        has_token = False
        for name, value in named_values:
            if name == "token":
                # Only a non-None token counts as "token was provided".
                has_token = has_token or value is not None
            elif name in repo_id_arg_names:
                validate_repo_id(value)

        if check_use_auth_token:
            kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=has_token, kwargs=kwargs)

        return fn(*args, **kwargs)

    return _inner_fn  # type: ignore
def validate_repo_id(repo_id: str) -> None:
    """Check that `repo_id` is a well-formed repo id and raise otherwise.

    This local check does not replace the validation performed by the Hub. Its
    purpose is to catch obvious inconsistencies early (example: passing the
    `repo_type` inside the `repo_id` is forbidden).

    Rules:
    - Between 1 and 96 characters.
    - Either "repo_name" or "namespace/repo_name"
    - [a-zA-Z0-9] or "-", "_", "."
    - "--" and ".." are forbidden

    Valid: `"foo"`, `"foo/bar"`, `"123"`, `"Foo-BAR_foo.bar123"`

    Not valid: `"datasets/foo/bar"`, `".repo_id"`, `"foo--bar"`, `"foo.git"`

    Example:
    ```py
    >>> from huggingface_hub.utils import validate_repo_id
    >>> validate_repo_id(repo_id="valid_repo_id")
    >>> validate_repo_id(repo_id="other..repo..id")
    huggingface_hub.utils._validators.HFValidationError: Cannot have -- or .. in repo_id: 'other..repo..id'.
    ```

    Discussed in https://github.com/huggingface/huggingface_hub/issues/1008.
    In moon-landing (internal repository):
    - https://github.com/huggingface/moon-landing/blob/main/server/lib/Names.ts#L27
    - https://github.com/huggingface/moon-landing/blob/main/server/views/components/NewRepoForm/NewRepoForm.svelte#L138

    Raises:
        [`~utils.HFValidationError`]:
            If `repo_id` is not a string or breaks one of the rules above.
    """
    # The checks run in a fixed order: the first failing rule decides the message.

    # Typically, a Path is not a repo_id
    if not isinstance(repo_id, str):
        raise HFValidationError(f"Repo id must be a string, not {type(repo_id)}: '{repo_id}'.")

    # At most one "/" is allowed: it separates the namespace from the repo name.
    slash_count = repo_id.count("/")
    if slash_count > 1:
        raise HFValidationError(
            f"Repo id must be in the form 'repo_name' or 'namespace/repo_name': '{repo_id}'."
            " Use `repo_type` argument if needed."
        )

    # Allowed characters, word boundaries and maximum length are enforced by the regex.
    if REPO_ID_REGEX.match(repo_id) is None:
        raise HFValidationError(
            "Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden,"
            f" '-' and '.' cannot start or end the name, max length is 96: '{repo_id}'."
        )

    # Doubled separators pass the regex, so they are rejected explicitly.
    if any(doubled in repo_id for doubled in ("--", "..")):
        raise HFValidationError(f"Cannot have -- or .. in repo_id: '{repo_id}'.")

    if repo_id.endswith(".git"):
        raise HFValidationError(f"Repo_id cannot end by '.git': '{repo_id}'.")
def smoothly_deprecate_use_auth_token(fn_name: str, has_token: bool, kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """Translate the legacy `use_auth_token` keyword argument into `token`.

    The long-term goal is to remove every mention of `use_auth_token` from the
    codebase in favor of a single, less verbose `token` argument. The migration is
    planned in several steps:

    0. Step 0: methods that require a read-access to the Hub use the `use_auth_token`
       argument (`str`, `bool` or `None`). Methods requiring write-access have a
       `token` argument (`str`, `None`). This implicit rule exists so that the token
       is not sent when not necessary (`use_auth_token=False`), even if logged in.
    1. Step 1: harmonize everything and use `token` everywhere (supporting
       `token=False` for read-only methods). In order not to break existing code,
       if `use_auth_token` is passed to a function, its value is passed as `token`
       instead, without any warning.
        a. Corner case: if both `use_auth_token` and `token` values are passed, a
           warning is thrown and the `use_auth_token` value is ignored.
    2. Step 2: once it is released, push downstream libraries to switch from
       `use_auth_token` to `token` as much as possible, but without throwing a
       warning (e.g. manually create issues on the corresponding repos).
    3. Step 3: after a transitional period (6 months e.g. until April 2023?), update
       `huggingface_hub` to throw a warning on `use_auth_token`. Hopefully, very few
       users will be impacted as it would have already been fixed.
       In addition, unit tests in `huggingface_hub` must be adapted to expect
       warnings to be thrown (but still use `use_auth_token` as before).
    4. Step 4: after a normal deprecation cycle (3 releases ?), remove this
       validator. `use_auth_token` will definitely not be supported.
       In addition, unit tests in `huggingface_hub` are updated to use `token`
       everywhere.

    This has been discussed in:
    - https://github.com/huggingface/huggingface_hub/issues/1094.
    - https://github.com/huggingface/huggingface_hub/pull/928
    - (related) https://github.com/huggingface/huggingface_hub/pull/1064

    Args:
        fn_name: Name of the called function, only used in the warning message.
        has_token: Whether a non-None `token` value was passed by the caller.
        kwargs: Keyword arguments received by the called function. Not mutated.

    Returns:
        A copy of `kwargs` without the `use_auth_token` key. When a non-None
        `use_auth_token` was passed and `has_token` is False, the copy holds that
        value under the `token` key.
    """
    # Work on a copy: the caller's dict must stay untouched.
    forwarded_kwargs = kwargs.copy()
    legacy_token = forwarded_kwargs.pop("use_auth_token", None)

    # Nothing to translate: `use_auth_token` is absent or explicitly None.
    if legacy_token is None:
        return forwarded_kwargs

    # Only `use_auth_token` carries a value => forward it as the `token` kwarg.
    if not has_token:
        forwarded_kwargs["token"] = legacy_token
        return forwarded_kwargs

    # Both values were provided: `token` wins and the legacy value is dropped.
    warnings.warn(
        f"Both `token` and `use_auth_token` are passed to `{fn_name}` with non-None values."
        " `token` is now the preferred argument to pass a User Access Token."
        " `use_auth_token` value will be ignored."
    )
    return forwarded_kwargs
|