|
|
|
"""Use the HTMLParser library to parse HTML files that aren't too bad.""" |
|
|
|
|
|
__license__ = "MIT" |
|
|
|
__all__ = [ |
|
'HTMLParserTreeBuilder', |
|
] |
|
|
|
from html.parser import HTMLParser |
|
|
|
import sys |
|
import warnings |
|
|
|
from bs4.element import ( |
|
CData, |
|
Comment, |
|
Declaration, |
|
Doctype, |
|
ProcessingInstruction, |
|
) |
|
from bs4.dammit import EntitySubstitution, UnicodeDammit |
|
|
|
from bs4.builder import ( |
|
DetectsXMLParsedAsHTML, |
|
ParserRejectedMarkup, |
|
HTML, |
|
HTMLTreeBuilder, |
|
STRICT, |
|
) |
|
|
|
|
|
HTMLPARSER = 'html.parser' |
|
|
|
class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML): |
|
"""A subclass of the Python standard library's HTMLParser class, which |
|
listens for HTMLParser events and translates them into calls |
|
to Beautiful Soup's tree construction API. |
|
""" |
|
|
|
|
|
IGNORE = 'ignore' |
|
REPLACE = 'replace' |
|
|
|
def __init__(self, *args, **kwargs): |
|
"""Constructor. |
|
|
|
:param on_duplicate_attribute: A strategy for what to do if a |
|
tag includes the same attribute more than once. Accepted |
|
values are: REPLACE (replace earlier values with later |
|
ones, the default), IGNORE (keep the earliest value |
|
encountered), or a callable. A callable must take three |
|
arguments: the dictionary of attributes already processed, |
|
the name of the duplicate attribute, and the most recent value |
|
encountered. |
|
""" |
|
self.on_duplicate_attribute = kwargs.pop( |
|
'on_duplicate_attribute', self.REPLACE |
|
) |
|
HTMLParser.__init__(self, *args, **kwargs) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
self.already_closed_empty_element = [] |
|
|
|
self._initialize_xml_detector() |
|
|
|
def error(self, message): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
raise ParserRejectedMarkup(message) |
|
|
|
def handle_startendtag(self, name, attrs): |
|
"""Handle an incoming empty-element tag. |
|
|
|
This is only called when the markup looks like <tag/>. |
|
|
|
:param name: Name of the tag. |
|
:param attrs: Dictionary of the tag's attributes. |
|
""" |
|
|
|
|
|
|
|
|
|
tag = self.handle_starttag(name, attrs, handle_empty_element=False) |
|
self.handle_endtag(name) |
|
|
|
def handle_starttag(self, name, attrs, handle_empty_element=True): |
|
"""Handle an opening tag, e.g. '<tag>' |
|
|
|
:param name: Name of the tag. |
|
:param attrs: Dictionary of the tag's attributes. |
|
:param handle_empty_element: True if this tag is known to be |
|
an empty-element tag (i.e. there is not expected to be any |
|
closing tag). |
|
""" |
|
|
|
attr_dict = {} |
|
for key, value in attrs: |
|
|
|
|
|
if value is None: |
|
value = '' |
|
if key in attr_dict: |
|
|
|
|
|
|
|
on_dupe = self.on_duplicate_attribute |
|
if on_dupe == self.IGNORE: |
|
pass |
|
elif on_dupe in (None, self.REPLACE): |
|
attr_dict[key] = value |
|
else: |
|
on_dupe(attr_dict, key, value) |
|
else: |
|
attr_dict[key] = value |
|
attrvalue = '""' |
|
|
|
sourceline, sourcepos = self.getpos() |
|
tag = self.soup.handle_starttag( |
|
name, None, None, attr_dict, sourceline=sourceline, |
|
sourcepos=sourcepos |
|
) |
|
if tag and tag.is_empty_element and handle_empty_element: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
self.handle_endtag(name, check_already_closed=False) |
|
|
|
|
|
|
|
self.already_closed_empty_element.append(name) |
|
|
|
if self._root_tag is None: |
|
self._root_tag_encountered(name) |
|
|
|
def handle_endtag(self, name, check_already_closed=True): |
|
"""Handle a closing tag, e.g. '</tag>' |
|
|
|
:param name: A tag name. |
|
:param check_already_closed: True if this tag is expected to |
|
be the closing portion of an empty-element tag, |
|
e.g. '<tag></tag>'. |
|
""" |
|
|
|
if check_already_closed and name in self.already_closed_empty_element: |
|
|
|
|
|
|
|
|
|
self.already_closed_empty_element.remove(name) |
|
else: |
|
self.soup.handle_endtag(name) |
|
|
|
def handle_data(self, data): |
|
"""Handle some textual data that shows up between tags.""" |
|
self.soup.handle_data(data) |
|
|
|
def handle_charref(self, name): |
|
"""Handle a numeric character reference by converting it to the |
|
corresponding Unicode character and treating it as textual |
|
data. |
|
|
|
:param name: Character number, possibly in hexadecimal. |
|
""" |
|
|
|
|
|
|
|
|
|
if name.startswith('x'): |
|
real_name = int(name.lstrip('x'), 16) |
|
elif name.startswith('X'): |
|
real_name = int(name.lstrip('X'), 16) |
|
else: |
|
real_name = int(name) |
|
|
|
data = None |
|
if real_name < 256: |
|
|
|
|
|
|
|
|
|
|
|
for encoding in (self.soup.original_encoding, 'windows-1252'): |
|
if not encoding: |
|
continue |
|
try: |
|
data = bytearray([real_name]).decode(encoding) |
|
except UnicodeDecodeError as e: |
|
pass |
|
if not data: |
|
try: |
|
data = chr(real_name) |
|
except (ValueError, OverflowError) as e: |
|
pass |
|
data = data or "\N{REPLACEMENT CHARACTER}" |
|
self.handle_data(data) |
|
|
|
def handle_entityref(self, name): |
|
"""Handle a named entity reference by converting it to the |
|
corresponding Unicode character(s) and treating it as textual |
|
data. |
|
|
|
:param name: Name of the entity reference. |
|
""" |
|
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) |
|
if character is not None: |
|
data = character |
|
else: |
|
|
|
|
|
|
|
|
|
|
|
data = "&%s" % name |
|
self.handle_data(data) |
|
|
|
def handle_comment(self, data): |
|
"""Handle an HTML comment. |
|
|
|
:param data: The text of the comment. |
|
""" |
|
self.soup.endData() |
|
self.soup.handle_data(data) |
|
self.soup.endData(Comment) |
|
|
|
def handle_decl(self, data): |
|
"""Handle a DOCTYPE declaration. |
|
|
|
:param data: The text of the declaration. |
|
""" |
|
self.soup.endData() |
|
data = data[len("DOCTYPE "):] |
|
self.soup.handle_data(data) |
|
self.soup.endData(Doctype) |
|
|
|
def unknown_decl(self, data): |
|
"""Handle a declaration of unknown type -- probably a CDATA block. |
|
|
|
:param data: The text of the declaration. |
|
""" |
|
if data.upper().startswith('CDATA['): |
|
cls = CData |
|
data = data[len('CDATA['):] |
|
else: |
|
cls = Declaration |
|
self.soup.endData() |
|
self.soup.handle_data(data) |
|
self.soup.endData(cls) |
|
|
|
def handle_pi(self, data): |
|
"""Handle a processing instruction. |
|
|
|
:param data: The text of the instruction. |
|
""" |
|
self.soup.endData() |
|
self.soup.handle_data(data) |
|
self._document_might_be_xml(data) |
|
self.soup.endData(ProcessingInstruction) |
|
|
|
|
|
class HTMLParserTreeBuilder(HTMLTreeBuilder): |
|
"""A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser, |
|
found in the Python standard library. |
|
""" |
|
is_xml = False |
|
picklable = True |
|
NAME = HTMLPARSER |
|
features = [NAME, HTML, STRICT] |
|
|
|
|
|
|
|
TRACKS_LINE_NUMBERS = True |
|
|
|
def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): |
|
"""Constructor. |
|
|
|
:param parser_args: Positional arguments to pass into |
|
the BeautifulSoupHTMLParser constructor, once it's |
|
invoked. |
|
:param parser_kwargs: Keyword arguments to pass into |
|
the BeautifulSoupHTMLParser constructor, once it's |
|
invoked. |
|
:param kwargs: Keyword arguments for the superclass constructor. |
|
""" |
|
|
|
|
|
extra_parser_kwargs = dict() |
|
for arg in ('on_duplicate_attribute',): |
|
if arg in kwargs: |
|
value = kwargs.pop(arg) |
|
extra_parser_kwargs[arg] = value |
|
super(HTMLParserTreeBuilder, self).__init__(**kwargs) |
|
parser_args = parser_args or [] |
|
parser_kwargs = parser_kwargs or {} |
|
parser_kwargs.update(extra_parser_kwargs) |
|
parser_kwargs['convert_charrefs'] = False |
|
self.parser_args = (parser_args, parser_kwargs) |
|
|
|
def prepare_markup(self, markup, user_specified_encoding=None, |
|
document_declared_encoding=None, exclude_encodings=None): |
|
|
|
"""Run any preliminary steps necessary to make incoming markup |
|
acceptable to the parser. |
|
|
|
:param markup: Some markup -- probably a bytestring. |
|
:param user_specified_encoding: The user asked to try this encoding. |
|
:param document_declared_encoding: The markup itself claims to be |
|
in this encoding. |
|
:param exclude_encodings: The user asked _not_ to try any of |
|
these encodings. |
|
|
|
:yield: A series of 4-tuples: |
|
(markup, encoding, declared encoding, |
|
has undergone character replacement) |
|
|
|
Each 4-tuple represents a strategy for converting the |
|
document to Unicode and parsing it. Each strategy will be tried |
|
in turn. |
|
""" |
|
if isinstance(markup, str): |
|
|
|
yield (markup, None, None, False) |
|
return |
|
|
|
|
|
|
|
|
|
|
|
|
|
known_definite_encodings = [user_specified_encoding] |
|
|
|
|
|
|
|
user_encodings = [document_declared_encoding] |
|
|
|
try_encodings = [user_specified_encoding, document_declared_encoding] |
|
dammit = UnicodeDammit( |
|
markup, |
|
known_definite_encodings=known_definite_encodings, |
|
user_encodings=user_encodings, |
|
is_html=True, |
|
exclude_encodings=exclude_encodings |
|
) |
|
yield (dammit.markup, dammit.original_encoding, |
|
dammit.declared_html_encoding, |
|
dammit.contains_replacement_characters) |
|
|
|
def feed(self, markup): |
|
"""Run some incoming markup through some parsing process, |
|
populating the `BeautifulSoup` object in self.soup. |
|
""" |
|
args, kwargs = self.parser_args |
|
parser = BeautifulSoupHTMLParser(*args, **kwargs) |
|
parser.soup = self.soup |
|
try: |
|
parser.feed(markup) |
|
parser.close() |
|
except AssertionError as e: |
|
|
|
|
|
|
|
raise ParserRejectedMarkup(e) |
|
parser.already_closed_empty_element = [] |
|
|