|
|
|
""" |
|
|
|
webencodings |
|
~~~~~~~~~~~~ |
|
|
|
This is a Python implementation of the `WHATWG Encoding standard
<http://encoding.spec.whatwg.org/>`_. See README for details.
|
|
|
:copyright: Copyright 2012 by Simon Sapin |
|
:license: BSD, see LICENSE for details. |
|
|
|
""" |
|
|
|
from __future__ import unicode_literals |
|
|
|
import codecs |
|
|
|
from .labels import LABELS |
|
|
|
|
|
VERSION = '0.5.1'


# Canonical encoding names that must be translated before being handed to
# ``codecs.lookup()``; lookup() falls back to the canonical name itself
# for anything not listed here.
PYTHON_NAMES = {
    'iso-8859-8-i': 'iso-8859-8',
    'x-mac-cyrillic': 'mac-cyrillic',
    'macintosh': 'mac-roman',
    'windows-874': 'cp874',
}

# Encoding objects already built by lookup(), keyed by canonical name.
CACHE = {}
|
|
|
|
|
def ascii_lower(string):
    r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.

    :param string: A Unicode string.
    :returns: A new Unicode string.

    This is used for `ASCII case-insensitive
    <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
    matching of encoding labels.
    The same matching is also used, among other things,
    for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.

    This is different from the :meth:`~py:str.lower` method of Unicode strings
    which also affect non-ASCII characters,
    sometimes mapping them into the ASCII range:

        >>> keyword = u'Bac\N{KELVIN SIGN}ground'
        >>> assert keyword.lower() == u'background'
        >>> assert ascii_lower(keyword) != keyword.lower()
        >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'

    Strings that cannot be encoded as UTF-8 (lone surrogates) are still
    handled instead of raising :exc:`UnicodeEncodeError`:

        >>> assert ascii_lower(u'A\ud800Z') == u'a\ud800z'

    """
    try:
        # Fast path: in UTF-8 only ASCII characters are encoded as ASCII
        # bytes, so lower-casing the encoded form leaves every non-ASCII
        # character untouched.
        return string.encode('utf8').lower().decode('utf8')
    except UnicodeEncodeError:
        # Python 3 refuses to encode lone surrogates as UTF-8. Fall back
        # to a per-character mapping so that such input is lower-cased
        # like any other instead of crashing the caller.
        return ''.join(
            chr(ord(char) + 0x20) if 'A' <= char <= 'Z' else char
            for char in string)
|
|
|
|
|
def lookup(label):
    """
    Look for an encoding by its label.
    This is the spec's `get an encoding
    <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
    Supported labels are listed there.

    :param label: A string.
    :returns:
        An :class:`Encoding` object, or :obj:`None` for an unknown label.

    """
    # Only ASCII whitespace is stripped: U+0009, U+000A, U+000C, U+000D
    # and U+0020. A bare ``strip()`` would remove more than that.
    normalized = ascii_lower(label.strip('\t\n\f\r '))
    name = LABELS.get(normalized)
    if name is None:
        return None

    cached = CACHE.get(name)
    if cached is not None:
        return cached

    if name == 'x-user-defined':
        # Implemented by a sibling module; imported only when requested.
        from .x_user_defined import codec_info
    else:
        # Translate the canonical name where PYTHON_NAMES lists one.
        codec_info = codecs.lookup(PYTHON_NAMES.get(name, name))

    created = Encoding(name, codec_info)
    CACHE[name] = created
    return created
|
|
|
|
|
def _get_encoding(encoding_or_label):
    """
    Accept either an encoding object or label.

    :param encoding_or_label:
        An :class:`Encoding` object or a label string.
    :returns: An :class:`Encoding` object.
    :raises: :exc:`~exceptions.LookupError` for an unknown label.

    """
    # Anything exposing ``codec_info`` is treated as an Encoding already
    # and returned untouched; everything else is resolved as a label.
    if not hasattr(encoding_or_label, 'codec_info'):
        resolved = lookup(encoding_or_label)
        if resolved is None:
            raise LookupError(
                'Unknown encoding label: %r' % encoding_or_label)
        return resolved
    return encoding_or_label
|
|
|
|
|
class Encoding(object):
    """Represents a character encoding such as UTF-8,
    that can be used for decoding or encoding.

    .. attribute:: name

        Canonical name of the encoding

    .. attribute:: codec_info

        The actual implementation of the encoding,
        a stdlib :class:`~codecs.CodecInfo` object.
        See :func:`codecs.register`.

    """

    def __init__(self, name, codec_info):
        # Both values are stored as given; no validation is performed.
        self.codec_info = codec_info
        self.name = name

    def __repr__(self):
        return '<Encoding %s>' % self.name
|
|
|
|
|
|
|
#: The UTF-8 encoding; the default ``encoding`` of the encoding helpers
#: in this module.
UTF8 = lookup('utf-8')

# UTF-16 encodings reported by _detect_bom(); private to this module.
_UTF16LE, _UTF16BE = lookup('utf-16le'), lookup('utf-16be')
|
|
|
|
|
def decode(input, fallback_encoding, errors='replace'):
    """
    Decode a single string.

    :param input: A byte string
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return:
        A ``(output, encoding)`` tuple of a Unicode string
        and an :obj:`Encoding`.

    """
    # Resolve the fallback first so that an invalid label fails early,
    # even when a BOM would have made the fallback unnecessary.
    fallback = _get_encoding(fallback_encoding)
    detected, payload = _detect_bom(input)
    # A BOM, when present, takes precedence over the fallback.
    chosen = detected or fallback
    text = chosen.codec_info.decode(payload, errors)[0]
    return text, chosen
|
|
|
|
|
def _detect_bom(input):
    """Return (bom_encoding, input), with any BOM removed from the input.

    ``bom_encoding`` is :obj:`None`, and the input is returned unchanged,
    when the input starts with none of the recognised BOMs.

    """
    # Checked in order; the first matching mark wins.
    known_marks = (
        (b'\xFF\xFE', _UTF16LE),
        (b'\xFE\xFF', _UTF16BE),
        (b'\xEF\xBB\xBF', UTF8),
    )
    for mark, encoding in known_marks:
        if input.startswith(mark):
            return encoding, input[len(mark):]
    return None, input
|
|
|
|
|
def encode(input, encoding=UTF8, errors='strict'):
    """
    Encode a single string.

    :param input: A Unicode string.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return: A byte string.

    """
    codec_info = _get_encoding(encoding).codec_info
    # The codec returns an ``(output, length)`` pair; only the output
    # is of interest here.
    return codec_info.encode(input, errors)[0]
|
|
|
|
|
def iter_decode(input, fallback_encoding, errors='replace'):
    """
    "Pull"-based decoder.

    :param input:
        An iterable of byte strings.

        The input is first consumed just enough to determine the encoding
        based on the presence of a BOM,
        then consumed on demand when the return value is.
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns:
        An ``(output, encoding)`` tuple.
        :obj:`output` is an iterable of Unicode strings,
        :obj:`encoding` is the :obj:`Encoding` that is being used.

    """
    chunks = _iter_decode_generator(
        input, IncrementalDecoder(fallback_encoding, errors))
    # The generator's first item is the Encoding, not text: pull it out
    # here so that only decoded strings remain for the caller.
    detected = next(chunks)
    return chunks, detected
|
|
|
|
|
def _iter_decode_generator(input, decoder):
    """Return a generator that first yields the :obj:`Encoding`,
    then yields output chunks as Unicode strings.

    :param input: An iterable of byte strings.
    :param decoder:
        An object with a ``decode(input, final=False)`` method and an
        ``encoding`` attribute, such as :class:`IncrementalDecoder`.

    """
    feed = decoder.decode
    chunks = iter(input)

    # Phase 1: feed chunks until the decoder produces some output.
    # The encoding is read from the decoder only after that point.
    head = None
    for piece in chunks:
        head = feed(piece)
        if head:
            break

    if not head:
        # Input exhausted without any output: flush the decoder, then
        # report the encoding followed by whatever the flush produced.
        tail = feed(b'', final=True)
        assert decoder.encoding is not None
        yield decoder.encoding
        if tail:
            yield tail
        return

    assert decoder.encoding is not None
    yield decoder.encoding
    yield head

    # Phase 2: stream the remaining chunks, skipping empty outputs.
    for piece in chunks:
        text = feed(piece)
        if text:
            yield text
    tail = feed(b'', final=True)
    if tail:
        yield tail
|
|
|
|
|
def iter_encode(input, encoding=UTF8, errors='strict'):
    """
    "Pull"-based encoder.

    :param input: An iterable of Unicode strings.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns: An iterable of byte strings.

    """
    # The encoder is built eagerly, before any input is consumed.
    encoder = IncrementalEncoder(encoding, errors)
    return _iter_encode_generator(input, encoder.encode)
|
|
|
|
|
def _iter_encode_generator(input, encode):
    """Yield the non-empty byte strings produced by encoding ``input``.

    :param input: An iterable of Unicode strings.
    :param encode:
        A callable with the signature ``encode(input, final=False)``,
        such as the ``encode`` method of an incremental encoder.

    """
    def raw_outputs():
        # One output per input chunk, then the final flush.
        for text in input:
            yield encode(text)
        yield encode('', final=True)

    for data in raw_outputs():
        # Empty results are dropped rather than passed on.
        if data:
            yield data
|
|
|
|
|
class IncrementalDecoder(object):
    """
    "Push"-based decoder.

    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    """

    def __init__(self, fallback_encoding, errors='replace'):
        # Resolving here makes an invalid label fail at construction.
        self._fallback_encoding = _get_encoding(fallback_encoding)
        self._errors = errors
        # Bytes held back while it is still undecided whether the input
        # starts with a BOM.
        self._buffer = b''
        # Bound ``decode`` of the codec's incremental decoder, once chosen.
        self._decoder = None
        #: The actual :class:`Encoding` that is being used,
        #: or :obj:`None` if that is not determined yet.
        #: (I.e. if there is not enough input yet to determine
        #: if there is a BOM.)
        self.encoding = None

    def decode(self, input, final=False):
        """Decode one chunk of the input.

        :param input: A byte string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A Unicode string.

        """
        if self._decoder is not None:
            # Encoding already settled: delegate directly.
            return self._decoder(input, final)

        pending = self._buffer + input
        encoding, pending = _detect_bom(pending)
        if encoding is None:
            if not final and len(pending) < 3:
                # The longest BOM that _detect_bom() recognises is three
                # bytes, so with less than that the decision is postponed.
                self._buffer = pending
                return ''
            # No BOM: use the fallback.
            encoding = self._fallback_encoding

        codec_decoder = encoding.codec_info.incrementaldecoder(self._errors)
        self._decoder = codec_decoder.decode
        self.encoding = encoding
        return self._decoder(pending, final)
|
|
|
|
|
class IncrementalEncoder(object):
    """
    "Push"-based encoder.

    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    .. method:: encode(input, final=False)

        :param input: A Unicode string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A byte string.

    """

    def __init__(self, encoding=UTF8, errors='strict'):
        codec_info = _get_encoding(encoding).codec_info
        # ``encode`` is not defined on the class: each instance exposes
        # the bound method of the codec's own incremental encoder.
        self.encode = codec_info.incrementalencoder(errors).encode
|
|