Upload folder using huggingface_hub

d1ceb73 verified 11 months ago

10.6 kB

	# coding: utf-8
	"""

	webencodings
	~~~~~~~~~~~~

	This is a Python implementation of the `WHATWG Encoding standard
	<http://encoding.spec.whatwg.org/>`_. See README for details.

	:copyright: Copyright 2012 by Simon Sapin
	:license: BSD, see LICENSE for details.

	"""

	from __future__ import unicode_literals

	import codecs

	from .labels import LABELS


	VERSION = '0.5.1'


	# A few names used by the Encoding standard are not accepted by Python's
	# codec registry; map each of them to a name that Python does know.
	PYTHON_NAMES = {
	    'iso-8859-8-i': 'iso-8859-8',
	    'x-mac-cyrillic': 'mac-cyrillic',
	    'macintosh': 'mac-roman',
	    'windows-874': 'cp874',
	}

	# Encoding objects already built by lookup(), keyed by encoding name.
	CACHE = {}


	def ascii_lower(string):
	    r"""Lower-case the ASCII letters of a string, leaving the rest untouched.

	    Only A-Z is mapped to a-z; every other character is returned as is.

	    :param string: A Unicode string.
	    :returns: A new Unicode string.

	    This implements the `ASCII case-insensitive
	    <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
	    matching used for encoding labels.
	    The same matching is also used, among other things,
	    for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.

	    It differs from the :meth:`~py:str.lower` method of Unicode strings,
	    which also affects non-ASCII characters
	    and sometimes maps them into the ASCII range:

	        >>> keyword = u'Bac\N{KELVIN SIGN}ground'
	        >>> assert keyword.lower() == u'background'
	        >>> assert ascii_lower(keyword) != keyword.lower()
	        >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'

	    """
	    # Lower-casing the UTF-8 bytes only touches the ASCII letters.
	    # This turns out to be faster than unicode.translate().
	    utf8_bytes = string.encode('utf8')
	    return utf8_bytes.lower().decode('utf8')


	def lookup(label):
	    """
	    Find the encoding that a label refers to.

	    This is the spec's `get an encoding
	    <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
	    Supported labels are listed there.

	    :param label: A string.
	    :returns:
	        An :class:`Encoding` object, or :obj:`None` for an unknown label.

	    """
	    # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020.
	    normalized = ascii_lower(label.strip('\t\n\f\r '))
	    name = LABELS.get(normalized)
	    if name is None:
	        return None

	    cached = CACHE.get(name)
	    if cached is not None:
	        return cached

	    if name == 'x-user-defined':
	        # This codec is provided by a sibling module, not by the stdlib.
	        from .x_user_defined import codec_info
	    else:
	        # Any name that gets to here should be valid once remapped.
	        codec_info = codecs.lookup(PYTHON_NAMES.get(name, name))

	    # Build the object once and remember it for later lookups.
	    encoding = CACHE[name] = Encoding(name, codec_info)
	    return encoding


	def _get_encoding(encoding_or_label):
	    """
	    Normalize an argument that may be an encoding object or a label.

	    :param encoding_or_label:
	        An :class:`Encoding` object or a label string.
	    :returns: An :class:`Encoding` object.
	    :raises: :exc:`~exceptions.LookupError` for an unknown label.

	    """
	    # Anything without a ``codec_info`` attribute is treated as a label.
	    if not hasattr(encoding_or_label, 'codec_info'):
	        resolved = lookup(encoding_or_label)
	        if resolved is None:
	            raise LookupError('Unknown encoding label: %r' % encoding_or_label)
	        return resolved

	    return encoding_or_label


	class Encoding(object):
	    """Represents a character encoding such as UTF-8,
	    that can be used for decoding or encoding.

	    .. attribute:: name

	        Canonical name of the encoding

	    .. attribute:: codec_info

	        The actual implementation of the encoding,
	        a stdlib :class:`~codecs.CodecInfo` object.
	        See :func:`codecs.register`.

	    """

	    def __init__(self, name, codec_info):
	        # Both values are stored as given; nothing is validated here.
	        self.codec_info = codec_info
	        self.name = name

	    def __repr__(self):
	        template = '<Encoding %s>'
	        return template % self.name


	#: The UTF-8 encoding. Should be used for new content and formats.
	UTF8 = lookup('utf-8')

	# Private UTF-16 encodings, looked up once at import time. _detect_bom()
	# returns them when the input starts with the matching byte order mark.
	_UTF16LE = lookup('utf-16le')
	_UTF16BE = lookup('utf-16be')


	def decode(input, fallback_encoding, errors='replace'):
	    """
	    Decode a single byte string, honouring a leading BOM if there is one.

	    :param input: A byte string
	    :param fallback_encoding:
	        An :class:`Encoding` object or a label string.
	        The encoding to use if :obj:`input` does not have a BOM.
	    :param errors: Type of error handling. See :func:`codecs.register`.
	    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
	    :return:
	        A ``(output, encoding)`` tuple of a Unicode string
	        and an :obj:`Encoding`.

	    """
	    # Resolve the fallback first so that an invalid label fails early,
	    # even when the input carries a BOM and the fallback goes unused.
	    fallback = _get_encoding(fallback_encoding)
	    detected, payload = _detect_bom(input)
	    encoding = detected if detected else fallback
	    decoded = encoding.codec_info.decode(payload, errors)
	    # The codec returns more than the text; only the first item is needed.
	    return decoded[0], encoding


	def _detect_bom(input):
	    """Return (bom_encoding, input), with any BOM removed from the input.

	    ``bom_encoding`` is :obj:`None` and the input is returned unchanged
	    when the input does not start with one of the recognized BOMs.
	    """
	    known_boms = (
	        (b'\xFF\xFE', _UTF16LE),
	        (b'\xFE\xFF', _UTF16BE),
	        (b'\xEF\xBB\xBF', UTF8),
	    )
	    for bom, encoding in known_boms:
	        if input.startswith(bom):
	            return encoding, input[len(bom):]
	    return None, input


	def encode(input, encoding=UTF8, errors='strict'):
	    """
	    Encode a single Unicode string into bytes.

	    :param input: A Unicode string.
	    :param encoding: An :class:`Encoding` object or a label string.
	    :param errors: Type of error handling. See :func:`codecs.register`.
	    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
	    :return: A byte string.

	    """
	    codec_info = _get_encoding(encoding).codec_info
	    encoded = codec_info.encode(input, errors)
	    # The codec returns more than the bytes; only the first item is needed.
	    return encoded[0]


	def iter_decode(input, fallback_encoding, errors='replace'):
	    """
	    "Pull"-based decoder.

	    :param input:
	        An iterable of byte strings.

	        The input is first consumed just enough to determine the encoding
	        based on the presence of a BOM,
	        then consumed on demand when the return value is.
	    :param fallback_encoding:
	        An :class:`Encoding` object or a label string.
	        The encoding to use if :obj:`input` does not have a BOM.
	    :param errors: Type of error handling. See :func:`codecs.register`.
	    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
	    :returns:
	        An ``(output, encoding)`` tuple.
	        :obj:`output` is an iterable of Unicode strings,
	        :obj:`encoding` is the :obj:`Encoding` that is being used.

	    """
	    chunks = _iter_decode_generator(
	        input, IncrementalDecoder(fallback_encoding, errors))
	    # The generator's first item is the Encoding rather than decoded text;
	    # take it off here so that the caller only iterates over strings.
	    return chunks, next(chunks)


	def _iter_decode_generator(input, decoder):
	"""Return a generator that first yields the :obj:`Encoding`,
	then yields output chukns as Unicode strings.

	"""
	decode = decoder.decode
	input = iter(input)
	for chunck in input:
	output = decode(chunck)
	if output:
	assert decoder.encoding is not None
	yield decoder.encoding
	yield output
	break
	else:
	# Input exhausted without determining the encoding
	output = decode(b'', final=True)
	assert decoder.encoding is not None
	yield decoder.encoding
	if output:
	yield output
	return

	for chunck in input:
	output = decode(chunck)
	if output:
	yield output
	output = decode(b'', final=True)
	if output:
	yield output


	def iter_encode(input, encoding=UTF8, errors='strict'):
	    """
	    "Pull"-based encoder.

	    :param input: An iterable of Unicode strings.
	    :param encoding: An :class:`Encoding` object or a label string.
	    :param errors: Type of error handling. See :func:`codecs.register`.
	    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
	    :returns: An iterable of byte strings.

	    """
	    # Build the encoder here rather than inside the generator, so that
	    # an invalid label fails early instead of on first iteration.
	    encoder = IncrementalEncoder(encoding, errors)
	    return _iter_encode_generator(input, encoder.encode)


	def _iter_encode_generator(input, encode):
	for chunck in input:
	output = encode(chunck)
	if output:
	yield output
	output = encode('', final=True)
	if output:
	yield output


	class IncrementalDecoder(object):
	    """
	    "Push"-based decoder.

	    :param fallback_encoding:
	        An :class:`Encoding` object or a label string.
	        The encoding to use if :obj:`input` does not have a BOM.
	    :param errors: Type of error handling. See :func:`codecs.register`.
	    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

	    """

	    def __init__(self, fallback_encoding, errors='replace'):
	        # Resolve the label right away so that an invalid one fails early.
	        self._fallback_encoding = _get_encoding(fallback_encoding)
	        self._errors = errors
	        # Bytes received so far, kept while the encoding is still unknown.
	        self._buffer = b''
	        # Bound ``decode`` of the codec's decoder, set once the encoding
	        # has been determined.
	        self._decoder = None
	        #: The actual :class:`Encoding` that is being used,
	        #: or :obj:`None` if that is not determined yet.
	        #: (I.e. if there is not enough input yet to determine
	        #: if there is a BOM.)
	        self.encoding = None  # Not known yet.

	    def decode(self, input, final=False):
	        """Decode one chunk of the input.

	        :param input: A byte string.
	        :param final:
	            Indicate that no more input is available.
	            Must be :obj:`True` if this is the last call.
	        :returns: A Unicode string.

	        """
	        # Once the encoding is settled, hand the input straight through.
	        if self._decoder is not None:
	            return self._decoder(input, final)

	        pending = self._buffer + input
	        encoding, pending = _detect_bom(pending)
	        if encoding is None:
	            if len(pending) < 3 and not final:
	                # Not enough data yet to rule out a BOM; keep waiting.
	                self._buffer = pending
	                return ''
	            # No BOM
	            encoding = self._fallback_encoding

	        incremental = encoding.codec_info.incrementaldecoder(self._errors)
	        decode_chunk = incremental.decode
	        self._decoder = decode_chunk
	        self.encoding = encoding
	        return decode_chunk(pending, final)


	class IncrementalEncoder(object):
	    """
	    "Push"-based encoder.

	    :param encoding: An :class:`Encoding` object or a label string.
	    :param errors: Type of error handling. See :func:`codecs.register`.
	    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

	    .. method:: encode(input, final=False)

	        :param input: A Unicode string.
	        :param final:
	            Indicate that no more input is available.
	            Must be :obj:`True` if this is the last call.
	        :returns: A byte string.

	    """

	    def __init__(self, encoding=UTF8, errors='strict'):
	        codec_info = _get_encoding(encoding).codec_info
	        # ``encode`` is the bound method of the codec's incremental
	        # encoder, stored on the instance rather than defined on the class.
	        self.encode = codec_info.incrementalencoder(errors).encode