|
|
|
"""Beautiful Soup bonus library: Unicode, Dammit |
|
|
|
This library converts a bytestream to Unicode through any means |
|
necessary. It is heavily based on code from Mark Pilgrim's Universal |
|
Feed Parser. It works best on XML and HTML, but it does not rewrite the |
|
XML or HTML to reflect a new encoding; that's the tree builder's job. |
|
""" |
|
|
|
__license__ = "MIT" |
|
|
|
from html.entities import codepoint2name |
|
from collections import defaultdict |
|
import codecs |
|
import re |
|
import logging |
|
import string |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Select the first available character-set detection library. The
# candidates are tried in a fixed order of preference: cchardet, then
# chardet, then charset_normalizer. If none of them can be imported,
# chardet_module stays None and detection is disabled.
try:
    import cchardet as chardet_module
except ImportError:
    try:
        import chardet as chardet_module
    except ImportError:
        try:
            import charset_normalizer as chardet_module
        except ImportError:
            chardet_module = None


if chardet_module is None:
    def chardet_dammit(s):
        """Stand-in used when no detection library is installed.

        :param s: Ignored.
        :return: Always None.
        """
        return None
else:
    def chardet_dammit(s):
        """Ask the detection library which encoding `s` is probably in.

        :param s: The data to analyze. A `str` is already decoded, so
            no detection is attempted for it.
        :return: The 'encoding' entry of the library's `detect()`
            result, or None when `s` is a `str`.
        """
        return None if isinstance(s, str) else chardet_module.detect(s)['encoding']
|
|
|
|
|
|
|
# Source text of the regular expressions used to find an encoding
# declared inside a document: the first matches an XML declaration at
# the start of the document, the second matches a charset given in an
# HTML <meta> tag. In both, group 1 is the declared encoding.
xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'

# The same two expressions compiled case-insensitively, once for each
# input type, so callers can look up the right pair by whether the
# markup is `bytes` or `str`.
encoding_res = {
    bytes: {
        'html': re.compile(html_meta.encode("ascii"), re.I),
        'xml': re.compile(xml_encoding.encode("ascii"), re.I),
    },
    str: {
        'html': re.compile(html_meta, re.I),
        'xml': re.compile(xml_encoding, re.I),
    },
}
|
|
|
from html.entities import html5 |
|
|
|
class EntitySubstitution(object):
    """The ability to substitute XML or HTML entities for certain characters."""

    def _populate_class_variables():
        """Initialize variables used by this class to manage the plethora of
        HTML5 named entities.

        This function is called once, while the class body is being
        executed; it is not a method.

        This function returns a 3-tuple containing two dictionaries
        and a regular expression:

        unicode_to_name - A mapping of Unicode strings like "⦨" to
        entity names like "angmsdaa". When a single Unicode string has
        multiple entity names, the name listed in `codepoint2name`
        wins; otherwise the alphabetically last HTML5 name wins.

        name_to_unicode: A mapping of entity names like "angmsdaa" to
        Unicode strings like "⦨".

        named_entity_re: A regular expression matching (almost) any
        Unicode string that corresponds to an HTML5 named entity.
        """
        unicode_to_name = {}
        name_to_unicode = {}

        short_entities = set()
        long_entities_by_first_character = defaultdict(set)

        for name_with_semicolon, character in sorted(html5.items()):
            # Entity names are stored without the trailing semicolon,
            # whether or not the html5 table spelled it out.
            if name_with_semicolon.endswith(';'):
                name = name_with_semicolon[:-1]
            else:
                name = name_with_semicolon

            # The first definition seen for a name is the one kept.
            if name not in name_to_unicode:
                name_to_unicode[name] = character

            # For the reverse mapping the last name seen (in sorted
            # order) wins; codepoint2name may override it below.
            unicode_to_name[character] = name

            # A single ASCII character other than <, > and & never
            # needs to be turned into a named entity on output, so it
            # is left out of the regular expression.
            if (len(character) == 1 and ord(character) < 128
                    and character not in '<>&'):
                continue

            # Likewise for multi-character strings made up entirely of
            # ASCII characters.
            if len(character) > 1 and all(ord(x) < 128 for x in character):
                continue

            # Everything else goes into the regular expression. Single
            # characters and multi-character strings are tracked
            # separately so that a single character which is also the
            # start of a longer entity can be matched correctly.
            if len(character) == 1:
                short_entities.add(character)
            else:
                long_entities_by_first_character[character[0]].add(character)

        # Build the alternatives that make up the regular expression.
        particles = set()
        for short in short_entities:
            long_versions = long_entities_by_first_character[short]
            if not long_versions:
                particles.add(short)
            else:
                # Match the single character only when it is _not_
                # followed by a character that would make it one of
                # the longer entities.
                ignore = "".join([x[1] for x in long_versions])
                particles.add("%s(?![%s])" % (short, ignore))

        for long_entities in long_entities_by_first_character.values():
            particles.update(long_entities)

        re_definition = "(%s)" % "|".join(particles)

        # Names from codepoint2name take precedence over whatever name
        # the html5 pass above settled on for the same character.
        for codepoint, name in codepoint2name.items():
            unicode_to_name[chr(codepoint)] = name

        return unicode_to_name, name_to_unicode, re.compile(re_definition)

    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    # The five characters that have predefined entities in XML.
    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches an angle bracket, or an ampersand that is not the start
    # of a numeric, hexadecimal or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
                                           ")")

    # Matches any angle bracket or ampersand.
    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate HTML entity for a special character string.

        :param matchobj: A match produced by CHARACTER_TO_HTML_ENTITY_RE.
        :return: The entity reference, e.g. "&eacute;".
        """
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for a special character string.

        :param matchobj: A match whose text is a key of
            CHARACTER_TO_XML_ENTITY.
        :return: The entity reference, e.g. "&lt;".
        """
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        :param value: The attribute value, as a string.
        :return: The value wrapped in quotation marks.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # Both kinds of quote are present, so neither can be
                # used as-is to delimit the value. Escape the double
                # quotes and delimit with double quotes.
                value = value.replace('"', "&quot;")
            else:
                # Only double quotes are present: delimiting with
                # single quotes avoids any escaping.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
            cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Angle brackets are always escaped; an ampersand is escaped
        # only when it does not already begin an entity reference.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: A Unicode string.
        :return: The string with named entities substituted.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
|
|
|
|
|
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the known_definite_encodings argument to the constructor).

    2. An encoding determined by sniffing the document's byte-order mark.

    3. Encodings you specifically tell EncodingDetector to try if
    byte-order mark sniffing fails (the user_encodings argument to the
    constructor).

    4. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    5. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    6. UTF-8.

    7. Windows-1252.

    """
    def __init__(self, markup, known_definite_encodings=None,
                 is_html=False, exclude_encodings=None,
                 user_encodings=None, override_encodings=None):
        """Constructor.

        :param markup: Some markup in an unknown encoding.

        :param known_definite_encodings: When determining the encoding
            of `markup`, these encodings will be tried first, in
            order. In HTML terms, this corresponds to the "known
            definite encoding" step defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding

        :param user_encodings: These encodings will be tried after the
            `known_definite_encodings` have been tried and failed, and
            after an attempt to sniff the encoding by looking at a
            byte order mark has failed. In HTML terms, this
            corresponds to the step "user has explicitly instructed
            the user agent to override the document's character
            encoding", defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding

        :param override_encodings: A deprecated alias for
            known_definite_encodings. Any encodings here will be tried
            immediately after the encodings in
            known_definite_encodings.

        :param is_html: If True, this markup is considered to be
            HTML. Otherwise it's assumed to be XML.

        :param exclude_encodings: These encodings will not be tried,
            even if they otherwise would be.

        """
        # Copy the caller's list so that appending the deprecated
        # override_encodings does not modify it.
        self.known_definite_encodings = list(known_definite_encodings or [])
        if override_encodings:
            self.known_definite_encodings += override_encodings
        self.user_encodings = user_encodings or []
        # Exclusions are compared case-insensitively; see _usable().
        self.exclude_encodings = {x.lower() for x in exclude_encodings or []}
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Should we even bother to try this encoding?

        :param encoding: Name of an encoding.
        :param tried: Encodings that have already been tried. This will be modified
            as a side effect.
        :return: True if `encoding` is not None, not excluded, and has
            not been tried before; False otherwise.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup.

        :yield: A sequence of strings.
        """
        tried = set()

        # First, try the known definite encodings.
        for e in self.known_definite_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Sniffing the byte-order mark did nothing; try the user
        # encodings.
        for e in self.user_encodings:
            if self._usable(e, tried):
                yield e

        # Look within the document for an XML or HTML encoding
        # declaration. The result is cached on the instance.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding. The result is cached on the instance.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: Some markup.
        :return: A 2-tuple (modified data, implied encoding)
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The UTF-16 checks come first, so they must reject input whose
        # two bytes after the mark are both zero: FF FE 00 00 is the
        # UTF-32LE mark, not a UTF-16LE mark followed by a NUL. The
        # comparison has to be made against a bytes literal; comparing
        # a bytes slice against a str is always unequal.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed to be declared near the beginning
            of the document, most of the time it's only necessary to search a few kilobytes of data.
            Set this to True to force this method to search the entire document.
        :return: The declared encoding, lowercased, or None if no
            declaration was found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            # An XML declaration is only looked for in the first
            # kilobyte; a <meta> tag in the first 2048 characters or
            # the first 5% of the document, whichever is larger.
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        # Pick the regular expressions that match the markup's type.
        if isinstance(markup, bytes):
            res = encoding_res[bytes]
        else:
            res = encoding_res[str]

        xml_re = res['xml']
        html_re = res['html']
        declared_encoding = None
        # An XML declaration is honored even for HTML documents; the
        # <meta> tag is only consulted for HTML.
        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0]
        if declared_encoding:
            if isinstance(declared_encoding, bytes):
                declared_encoding = declared_encoding.decode('ascii', 'replace')
            return declared_encoding.lower()
        return None
|
|
|
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    # Encodings for which bytes 0x80-0x9f are rewritten by
    # _sub_ms_char() when smart_quotes_to is set.
    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
    ]

    # Matches a single byte in the range that _sub_ms_char() handles.
    _SMART_QUOTES_RE = re.compile(b"([\x80-\x9f])")

    def __init__(self, markup, known_definite_encodings=None,
                 smart_quotes_to=None, is_html=False, exclude_encodings=None,
                 user_encodings=None, override_encodings=None
    ):
        """Constructor.

        :param markup: A bytestring representing markup in an unknown encoding.

        :param known_definite_encodings: When determining the encoding
            of `markup`, these encodings will be tried first, in
            order. In HTML terms, this corresponds to the "known
            definite encoding" step defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding

        :param user_encodings: These encodings will be tried after the
            `known_definite_encodings` have been tried and failed, and
            after an attempt to sniff the encoding by looking at a
            byte order mark has failed. In HTML terms, this
            corresponds to the step "user has explicitly instructed
            the user agent to override the document's character
            encoding", defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding

        :param override_encodings: A deprecated alias for
            known_definite_encodings. Any encodings here will be tried
            immediately after the encodings in
            known_definite_encodings.

        :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
           to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
           Setting it to 'xml' will convert them to XML entity references, and setting it to 'html'
           will convert them to HTML entity references.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param exclude_encodings: These encodings will not be considered, even
            if the sniffing code thinks they might make sense.

        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html
        self.log = logging.getLogger(__name__)
        # EncodingDetector treats None the same as an empty list for
        # every one of its encoding arguments.
        self.detector = EncodingDetector(
            markup, known_definite_encodings, is_html, exclude_encodings,
            user_encodings, override_encodings
        )

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, str) or markup == '':
            self.markup = markup
            self.unicode_markup = str(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        # Only a None result means that every strict attempt failed. An
        # empty string is a successful conversion of empty markup and
        # must not be retried.
        if u is None:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.
            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    self.log.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER."
                    )
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that it's better
        # to leave unicode_markup as None.
        self.unicode_markup = u
        if not u:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        :param match: A match of _SMART_QUOTES_RE; group 1 is the
            single byte to replace.
        :return: The replacement, as a bytestring.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
        else:
            sub = self.MS_CHARS.get(orig)
            if isinstance(sub, tuple):
                # (HTML entity name, hexadecimal code point)
                if self.smart_quotes_to == 'xml':
                    sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
                else:
                    sub = '&'.encode() + sub[0].encode() + ';'.encode()
            else:
                sub = sub.encode()
        return sub

    def _convert_from(self, proposed, errors="strict"):
        """Attempt to convert the markup to the proposed encoding.

        On success, `self.markup` is replaced by the decoded string and
        `self.original_encoding` is set to the codec name.

        :param proposed: The name of a character encoding.
        :param errors: The error handling scheme passed to the decoder.
        :return: The decoded string, or None if this (encoding, errors)
            combination was already tried or the conversion failed.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            markup = self._SMART_QUOTES_RE.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately broad: any failure simply means this
            # encoding is not the right one, and the caller moves on
            # to the next candidate.
            return None
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        """Given a string and its encoding, decodes the string into Unicode.

        :param data: The bytestring to decode.
        :param encoding: The name of an encoding.
        :param errors: The error handling scheme passed to the decoder.
        :return: The decoded string.
        """
        return str(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """If the markup is an HTML document, returns the encoding declared _within_
        the document.
        """
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Convert the name of a character set to a codec name.

        :param charset: The name of a character set.
        :return: The name of a codec.
        """
        # Try, in order: the alias table (or the name itself), the
        # name without hyphens, the name with hyphens turned into
        # underscores, and finally the name as given.
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if Python has a codec by that name.

        :param charset: The name of a character set; may be empty or None.
        :return: `charset` itself when it is falsy or names a known
            codec; None otherwise.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric
    # entities. A tuple value is (HTML entity name, hexadecimal code
    # point); a plain string is used as the replacement itself.
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ". Every value is a
    # string, because _sub_ms_char() encodes the value it looks up.
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$',
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        b'\xb4' : "'",
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents: each Windows-1252 byte
    # value maps to the UTF-8 encoding of the character it stands for.
    # The table is derived from the codec so that every entry is
    # correct by construction. It covers 0x80 through 0xfe, minus the
    # five byte values that Windows-1252 leaves undefined.
    WINDOWS_1252_TO_UTF8 = {
        byte: bytes((byte,)).decode("windows-1252").encode("utf-8")
        for byte in range(0x80, 0xff)
        if byte not in (0x81, 0x8d, 0x8f, 0x90, 0x9d)
    }

    # (first lead byte, last lead byte, sequence length) for UTF-8
    # multi-byte sequences.
    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        :param in_bytes: A bytestring that you suspect contains
            characters from multiple encodings. Note that this _must_
            be a bytestring. If you've already converted the document
            to Unicode, you're too late.
        :param main_encoding: The primary encoding of `in_bytes`.
        :param embedded_encoding: The encoding that was used to embed characters
            in the main document.
        :return: A bytestring in which `embedded_encoding`
           characters have been converted to their `main_encoding`
           equivalents.
        :raise NotImplementedError: If either encoding is not one of
           the supported names.
        """
        if embedded_encoding.replace('_', '-').lower() not in (
            'windows-1252', 'windows_1252'):
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Indexing did not produce an integer; convert the
                # one-character value to its ordinal.
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another, one-byte chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)
|
|
|
|