Upload folder using huggingface_hub

d1ceb73 verified 11 months ago

21.8 kB

	# Copyright 2015 Google Inc. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import math
	import re
	from typing import (
	Any,
	Callable,
	IO,
	Iterable,
	Mapping,
	Optional,
	Set,
	Tuple,
	Union,
	)
	import unicodedata

	from .parser import Parser


	def load(
	    fp: IO,
	    *,
	    encoding: Optional[str] = None,
	    cls: None = None,
	    object_hook: Optional[Callable[[Mapping[str, Any]], Any]] = None,
	    parse_float: Optional[Callable[[str], Any]] = None,
	    parse_int: Optional[Callable[[str], Any]] = None,
	    parse_constant: Optional[Callable[[str], Any]] = None,
	    object_pairs_hook: Optional[
	        Callable[[Iterable[Tuple[str, Any]]], Any]
	    ] = None,
	    allow_duplicate_keys: bool = True,
	) -> Any:
	    """Read a JSON5 document from ``fp`` and return the decoded object.

	    ``fp`` only needs a ``.read()`` method; its entire contents are read
	    in one call and handed to ``loads()`` together with every keyword
	    argument, unchanged.

	    The keyword arguments mirror those of ``json.load()``, except that:
	    - `cls` must be left as None (``loads()`` asserts this).
	    - `allow_duplicate_keys` is an extra flag. It defaults to True for
	      compatibility with ``json.load()``; when False, an object that
	      repeats a key causes a ValueError.
	    """
	    options = {
	        'encoding': encoding,
	        'cls': cls,
	        'object_hook': object_hook,
	        'parse_float': parse_float,
	        'parse_int': parse_int,
	        'parse_constant': parse_constant,
	        'object_pairs_hook': object_pairs_hook,
	        'allow_duplicate_keys': allow_duplicate_keys,
	    }
	    text = fp.read()
	    return loads(text, **options)


	def loads(
	    s: str,
	    *,
	    encoding: Optional[str] = None,
	    cls: None = None,
	    object_hook: Optional[Callable[[Mapping[str, Any]], Any]] = None,
	    parse_float: Optional[Callable[[str], Any]] = None,
	    parse_int: Optional[Callable[[str], Any]] = None,
	    parse_constant: Optional[Callable[[str], Any]] = None,
	    object_pairs_hook: Optional[
	        Callable[[Iterable[Tuple[str, Any]]], Any]
	    ] = None,
	    allow_duplicate_keys: bool = True,
	):
	    """Decode ``s``, a JSON5 document, into a Python object.

	    ``s`` may be a ``str`` or ``bytes``; bytes are decoded with
	    ``encoding`` (UTF-8 when none is given) before parsing.

	    The keyword arguments mirror those of ``json.loads()``, except that:
	    - `cls` must be None; custom decoders are rejected by an assertion.
	    - `allow_duplicate_keys` is an extra flag. It defaults to True for
	      compatibility with ``json.loads()``; when False, an object that
	      repeats a key causes a ValueError.

	    When both hooks are given, `object_pairs_hook` takes precedence over
	    `object_hook`.

	    Raises:
	        ValueError: if ``s`` is empty, if the parser reports an error, or
	            if a duplicate key is found while duplicates are disallowed.
	    """
	    assert cls is None, 'Custom decoders are not supported'

	    if isinstance(s, bytes):
	        s = s.decode(encoding or 'utf-8')

	    if not s:
	        raise ValueError('Empty strings are not legal JSON5')

	    ast, err, _ = Parser(s, '<string>').parse()
	    if err:
	        raise ValueError(err)

	    def _fp_constant_parser(text):
	        # float() spells the constants 'inf' / 'nan' rather than the
	        # JSON5 'Infinity' / 'NaN'; any sign prefix is left in place.
	        return float(text.replace('Infinity', 'inf').replace('NaN', 'nan'))

	    # Pick the callable that turns a list of (key, value) pairs into the
	    # object handed back to the caller.
	    if object_pairs_hook:
	        build_object = object_pairs_hook
	    elif object_hook:

	        def build_object(pairs):
	            return object_hook(dict(pairs))
	    else:
	        build_object = dict

	    if not allow_duplicate_keys:
	        unchecked_build = build_object

	        def build_object(pairs):  # pylint: disable=function-redefined
	            return _reject_duplicate_keys(pairs, unchecked_build)

	    return _walk_ast(
	        ast,
	        build_object,
	        parse_float or float,
	        parse_int or int,
	        parse_constant or _fp_constant_parser,
	    )


	def _reject_duplicate_keys(pairs, dictify):
	keys = set()
	for key, _ in pairs:
	if key in keys:
	raise ValueError(f'Duplicate key "{key}" found in object')
	keys.add(key)
	return dictify(pairs)


	def _walk_ast(
	el,
	dictify: Callable[[Iterable[Tuple[str, Any]]], Any],
	parse_float,
	parse_int,
	parse_constant,
	):
	if el == 'None':
	return None
	if el == 'True':
	return True
	if el == 'False':
	return False
	ty, v = el
	if ty == 'number':
	if v.startswith('0x') or v.startswith('0X'):
	return parse_int(v, base=16)
	if '.' in v or 'e' in v or 'E' in v:
	return parse_float(v)
	if 'Infinity' in v or 'NaN' in v:
	return parse_constant(v)
	return parse_int(v)
	if ty == 'string':
	return v
	if ty == 'object':
	pairs = []
	for key, val_expr in v:
	val = _walk_ast(
	val_expr, dictify, parse_float, parse_int, parse_constant
	)
	pairs.append((key, val))
	return dictify(pairs)
	if ty == 'array':
	return [
	_walk_ast(el, dictify, parse_float, parse_int, parse_constant)
	for el in v
	]
	raise ValueError('unknown el: ' + el) # pragma: no cover


	def dump(
	    obj: Any,
	    fp: IO,
	    *,
	    skipkeys: bool = False,
	    ensure_ascii: bool = True,
	    check_circular: bool = True,
	    allow_nan: bool = True,
	    cls: None = None,
	    indent: Optional[Union[int, str]] = None,
	    separators: Optional[Tuple[str, str]] = None,
	    default: Optional[Callable[[Any], Any]] = None,
	    sort_keys: bool = False,
	    quote_keys: bool = False,
	    trailing_commas: bool = True,
	    allow_duplicate_keys: bool = True,
	    **kwargs,
	):
	    """Serialize ``obj`` as JSON5 text and write it to ``fp``.

	    ``fp`` only needs a ``.write()`` method. The text is produced by
	    ``dumps()`` with the same arguments and written in a single call.

	    Supports the same arguments as ``json.dump()``, except that:

	    - The ``cls`` keyword is not supported.
	    - Any other extra keyword (e.g. ``encoding``) is accepted and
	      discarded; Unicode strings are always written.
	    - `quote_keys`: by default, string keys that are legal identifiers
	      and not reserved words are written without quotes; pass
	      ``quote_keys=True`` to quote every key, as in regular JSON.
	    - `trailing_commas`: by default, when ``indent`` is not None the
	      last item of each list and object is followed by a comma; pass
	      ``trailing_commas=False`` to omit it, as in regular JSON.
	    - `allow_duplicate_keys`: a number, a boolean, or ``None`` used as
	      a dict key is converted to the corresponding quoted string, e.g.
	      "1", "true", or "null". By default this matches the `json`
	      module's behavior, so ``{1: "foo", "1": "bar"}`` produces
	      '{"1": "foo", "1": "bar"}', an object with duplicated keys. With
	      ``allow_duplicate_keys=False`` a ValueError is raised instead.

	    Calling ``dump(obj, fp, quote_keys=True, trailing_commas=False,
	    allow_duplicate_keys=True)`` should produce exactly the same output
	    as ``json.dump(obj, fp)``.
	    """
	    del kwargs
	    text = dumps(
	        obj,
	        skipkeys=skipkeys,
	        ensure_ascii=ensure_ascii,
	        check_circular=check_circular,
	        allow_nan=allow_nan,
	        cls=cls,
	        indent=indent,
	        separators=separators,
	        default=default,
	        sort_keys=sort_keys,
	        quote_keys=quote_keys,
	        trailing_commas=trailing_commas,
	        allow_duplicate_keys=allow_duplicate_keys,
	    )
	    fp.write(text)


	def dumps(
	    obj: Any,
	    *,
	    skipkeys: bool = False,
	    ensure_ascii: bool = True,
	    check_circular: bool = True,
	    allow_nan: bool = True,
	    cls: None = None,
	    indent: Optional[Union[int, str]] = None,
	    separators: Optional[Tuple[str, str]] = None,
	    default: Optional[Callable[[Any], Any]] = None,
	    sort_keys: bool = False,
	    quote_keys: bool = False,
	    trailing_commas: bool = True,
	    allow_duplicate_keys: bool = True,
	    **kwargs,
	):
	    """Serialize ``obj`` to a JSON5-formatted string and return it.

	    Supports the same arguments as ``json.dumps()``, except that:

	    - The ``cls`` keyword is not supported.
	    - Any other extra keyword (e.g. ``encoding``) is accepted and
	      ignored; Unicode strings are always produced.
	    - `quote_keys`: by default, string keys that are legal identifiers
	      and not reserved words are written without quotes; pass
	      ``quote_keys=True`` to quote every key, as in regular JSON.
	    - `trailing_commas`: by default, when ``indent`` is not None the
	      last item of each list and object is followed by a comma; pass
	      ``trailing_commas=False`` to omit it, as in regular JSON.
	    - `allow_duplicate_keys`: a number, a boolean, or ``None`` used as
	      a dict key is converted to the corresponding quoted string, e.g.
	      "1", "true", or "null". By default this matches the `json`
	      module's behavior, so ``{1: "foo", "1": "bar"}`` produces
	      '{"1": "foo", "1": "bar"}', an object with duplicated keys. With
	      ``allow_duplicate_keys=False`` a ValueError is raised instead.

	    When ``separators`` is None it defaults to ``(', ', ': ')`` without
	    ``indent`` and to ``(',', ': ')`` with it. When ``default`` is None,
	    objects that cannot be serialized raise TypeError.

	    Calling ``dumps(obj, quote_keys=True, trailing_commas=False,
	    allow_duplicate_keys=True)`` should produce exactly the same output
	    as ``json.dumps(obj)``.
	    """
	    # NOTE(review): `cls` is a named parameter, so it never shows up in
	    # kwargs and this assertion cannot fire; a non-None `cls` is simply
	    # discarded below. Confirm whether it was meant to be rejected.
	    assert kwargs.get('cls', None) is None, 'Custom encoders are not supported'
	    del cls

	    if separators is None:
	        separators = (', ', ': ') if indent is None else (',', ': ')

	    # ids of the containers currently being serialized, or None when
	    # circular-reference detection is switched off.
	    seen: Optional[Set[int]] = set() if check_circular else None

	    _, text = _dumps(
	        obj,
	        skipkeys,
	        ensure_ascii,
	        check_circular,
	        allow_nan,
	        indent,
	        separators,
	        default or _raise_type_error,
	        sort_keys,
	        quote_keys,
	        trailing_commas,
	        allow_duplicate_keys,
	        seen,
	        1,  # nesting level of the outermost value
	        False,  # the outermost value is not an object key
	    )
	    return text


	def _dumps(
	obj,
	skipkeys,
	ensure_ascii,
	check_circular,
	allow_nan,
	indent,
	separators,
	default,
	sort_keys,
	quote_keys,
	trailing_commas,
	allow_duplicate_keys,
	seen: Optional[Set[int]],
	level: int,
	is_key: bool,
	):
	# pylint: disable=too-many-statements
	if obj is True:
	s = 'true'
	elif obj is False:
	s = 'false'
	elif obj is None:
	s = 'null'
	elif obj == float('inf'):
	if allow_nan:
	s = 'Infinity'
	else:
	raise ValueError()
	elif obj == float('-inf'):
	if allow_nan:
	s = '-Infinity'
	else:
	raise ValueError()
	elif isinstance(obj, float) and math.isnan(obj):
	if allow_nan:
	s = 'NaN'
	else:
	raise ValueError()
	elif isinstance(obj, str):
	if (
	is_key
	and _is_ident(obj)
	and not quote_keys
	and not _is_reserved_word(obj)
	):
	return True, obj
	return True, _dump_str(obj, ensure_ascii)
	elif isinstance(obj, int):
	# Subclasses of `int` and `float` may have custom
	# __repr__ or __str__ methods, but the `JSON` library
	# ignores them in order to ensure that the representation
	# are just bare numbers. In order to match JSON's behavior
	# we call the methods of the `float` and `int` class directly.
	s = int.__repr__(obj)
	elif isinstance(obj, float):
	# See comment above for int
	s = float.__repr__(obj)
	else:
	s = None

	if is_key:
	if s is not None:
	return True, f'"{s}"'
	if skipkeys:
	return False, None
	raise TypeError(f'invalid key {repr(obj)}')

	if s is not None:
	return True, s

	if indent is not None:
	end_str = ''
	if trailing_commas:
	end_str = ','
	if isinstance(indent, int):
	if indent > 0:
	indent_str = '\n' + ' ' * indent * level
	end_str += '\n' + ' ' * indent * (level - 1)
	else:
	indent_str = '\n'
	end_str += '\n'
	else:
	indent_str = '\n' + indent * level
	end_str += '\n' + indent * (level - 1)
	else:
	indent_str = ''
	end_str = ''

	item_sep, kv_sep = separators
	item_sep += indent_str

	if seen is not None:
	i = id(obj)
	if i in seen:
	raise ValueError('Circular reference detected.')
	seen.add(i)

	# Ideally we'd use collections.abc.Mapping and collections.abc.Sequence
	# here, but for backwards-compatibility with potential old callers,
	# we only check for the two attributes we need in each case.
	if hasattr(obj, 'keys') and hasattr(obj, '__getitem__'):
	s = _dump_dict(
	obj,
	skipkeys,
	ensure_ascii,
	check_circular,
	allow_nan,
	indent,
	separators,
	default,
	sort_keys,
	quote_keys,
	trailing_commas,
	allow_duplicate_keys,
	seen,
	level + 1,
	item_sep,
	kv_sep,
	indent_str,
	end_str,
	)
	elif hasattr(obj, '__getitem__') and hasattr(obj, '__iter__'):
	s = _dump_array(
	obj,
	skipkeys,
	ensure_ascii,
	check_circular,
	allow_nan,
	indent,
	separators,
	default,
	sort_keys,
	quote_keys,
	trailing_commas,
	allow_duplicate_keys,
	seen,
	level + 1,
	item_sep,
	indent_str,
	end_str,
	)
	else:
	s = _dumps(
	default(obj),
	skipkeys,
	ensure_ascii,
	check_circular,
	allow_nan,
	indent,
	separators,
	default,
	sort_keys,
	quote_keys,
	trailing_commas,
	allow_duplicate_keys,
	seen,
	level,
	is_key,
	)[1]

	if seen is not None:
	seen.remove(i)
	return False, s


	def _dump_dict(
	obj,
	skipkeys,
	ensure_ascii,
	check_circular,
	allow_nan,
	indent,
	separators,
	default,
	sort_keys,
	quote_keys,
	trailing_commas,
	allow_duplicate_keys,
	seen,
	level,
	item_sep,
	kv_sep,
	indent_str,
	end_str,
	):
	if not obj:
	return '{}'

	if sort_keys:
	keys = sorted(obj.keys())
	else:
	keys = obj.keys()

	s = '{' + indent_str

	num_items_added = 0
	new_keys = set()
	for key in keys:
	valid_key, key_str = _dumps(
	key,
	skipkeys,
	ensure_ascii,
	check_circular,
	allow_nan,
	indent,
	separators,
	default,
	sort_keys,
	quote_keys,
	trailing_commas,
	allow_duplicate_keys,
	seen,
	level,
	is_key=True,
	)

	if skipkeys and not valid_key:
	continue

	if not allow_duplicate_keys:
	if key_str in new_keys:
	raise ValueError(f'duplicate key {repr(key)}')
	new_keys.add(key_str)

	if num_items_added:
	s += item_sep

	s += (
	key_str
	+ kv_sep
	+ _dumps(
	obj[key],
	skipkeys,
	ensure_ascii,
	check_circular,
	allow_nan,
	indent,
	separators,
	default,
	sort_keys,
	quote_keys,
	trailing_commas,
	allow_duplicate_keys,
	seen,
	level,
	is_key=False,
	)[1]
	)
	num_items_added += 1

	s += end_str + '}'
	return s


	def _dump_array(
	obj,
	skipkeys,
	ensure_ascii,
	check_circular,
	allow_nan,
	indent,
	separators,
	default,
	sort_keys,
	quote_keys,
	trailing_commas,
	allow_duplicate_keys,
	seen,
	level,
	item_sep,
	indent_str,
	end_str,
	):
	if not obj:
	return '[]'
	return (
	'['
	+ indent_str
	+ item_sep.join(
	[
	_dumps(
	el,
	skipkeys,
	ensure_ascii,
	check_circular,
	allow_nan,
	indent,
	separators,
	default,
	sort_keys,
	quote_keys,
	trailing_commas,
	allow_duplicate_keys,
	seen,
	level,
	False,
	)[1]
	for el in obj
	]
	)
	+ end_str
	+ ']'
	)


	def _dump_str(obj, ensure_ascii):
	ret = ['"']
	for ch in obj:
	if ch == '\\':
	ret.append('\\\\')
	elif ch == '"':
	ret.append('\\"')
	elif ch == '\u2028':
	ret.append('\\u2028')
	elif ch == '\u2029':
	ret.append('\\u2029')
	elif ch == '\n':
	ret.append('\\n')
	elif ch == '\r':
	ret.append('\\r')
	elif ch == '\b':
	ret.append('\\b')
	elif ch == '\f':
	ret.append('\\f')
	elif ch == '\t':
	ret.append('\\t')
	elif ch == '\v':
	ret.append('\\v')
	elif ch == '\0':
	ret.append('\\0')
	elif not ensure_ascii:
	ret.append(ch)
	else:
	o = ord(ch)
	if 32 <= o < 128:
	ret.append(ch)
	elif o < 65536:
	ret.append(f'\\u{o:04x}')
	else:
	val = o - 0x10000
	high = 0xD800 + (val >> 10)
	low = 0xDC00 + (val & 0x3FF)
	ret.append(f'\\u{high:04x}\\u{low:04x}')
	return ''.join(ret) + '"'


	def _is_ident(k):
	if not k or not _is_id_start(k[0]) and k[0] not in ('$', '_'):
	return False
	for ch in k[1:]:
	if not _is_id_continue(ch) and ch not in ('$', '_'):
	return False
	return True


	def _is_id_start(ch):
	return unicodedata.category(ch) in (
	'Lu',
	'Ll',
	'Li',
	'Lt',
	'Lm',
	'Lo',
	'Nl',
	)


	def _is_id_continue(ch):
	return unicodedata.category(ch) in (
	'Lu',
	'Ll',
	'Li',
	'Lt',
	'Lm',
	'Lo',
	'Nl',
	'Nd',
	'Mn',
	'Mc',
	'Pc',
	)


	_reserved_word_re = None


	def _is_reserved_word(k):
	global _reserved_word_re

	if _reserved_word_re is None:
	# List taken from section 7.6.1 of ECMA-262.
	_reserved_word_re = re.compile(
	'('
	+ '\|'.join(
	[
	'break',
	'case',
	'catch',
	'class',
	'const',
	'continue',
	'debugger',
	'default',
	'delete',
	'do',
	'else',
	'enum',
	'export',
	'extends',
	'false',
	'finally',
	'for',
	'function',
	'if',
	'import',
	'in',
	'instanceof',
	'new',
	'null',
	'return',
	'super',
	'switch',
	'this',
	'throw',
	'true',
	'try',
	'typeof',
	'var',
	'void',
	'while',
	'with',
	]
	)
	+ ')$'
	)
	return _reserved_word_re.match(k) is not None


	def _raise_type_error(obj):
	raise TypeError(f'{repr(obj)} is not JSON5 serializable')