|
"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend". |
|
|
|
http://www.crummy.com/software/BeautifulSoup/ |
|
|
|
Beautiful Soup uses a pluggable XML or HTML parser to parse a |
|
(possibly invalid) document into a tree representation. Beautiful Soup |
|
provides methods and Pythonic idioms that make it easy to navigate, |
|
search, and modify the parse tree. |
|
|
|
Beautiful Soup works with Python 3.6 and up. It works better if lxml |
|
and/or html5lib is installed. |
|
|
|
For more than you ever wanted to know about Beautiful Soup, see the |
|
documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ |
|
""" |
|
|
|
__author__ = "Leonard Richardson ([email protected])" |
|
__version__ = "4.12.3" |
|
__copyright__ = "Copyright (c) 2004-2024 Leonard Richardson" |
|
|
|
__license__ = "MIT" |
|
|
|
__all__ = ['BeautifulSoup'] |
|
|
|
from collections import Counter |
|
import os |
|
import re |
|
import sys |
|
import traceback |
|
import warnings |
|
|
|
|
|
|
|
# Fail at import time, with an explanation, when run under Python 2.
if sys.version_info[0] < 3:
    raise ImportError(
        'You are trying to use a Python 3-specific version of Beautiful Soup '
        'under Python 2. This will not work. The final version of Beautiful '
        'Soup to support Python 2 was 4.9.3.'
    )
|
|
|
from .builder import ( |
|
builder_registry, |
|
ParserRejectedMarkup, |
|
XMLParsedAsHTMLWarning, |
|
HTMLParserTreeBuilder |
|
) |
|
from .dammit import UnicodeDammit |
|
from .element import ( |
|
CData, |
|
Comment, |
|
CSS, |
|
DEFAULT_OUTPUT_ENCODING, |
|
Declaration, |
|
Doctype, |
|
NavigableString, |
|
PageElement, |
|
ProcessingInstruction, |
|
PYTHON_SPECIFIC_ENCODINGS, |
|
ResultSet, |
|
Script, |
|
Stylesheet, |
|
SoupStrainer, |
|
Tag, |
|
TemplateString, |
|
) |
|
|
|
|
|
class GuessedAtParserWarning(UserWarning):
    """Warning category used when BeautifulSoup had to pick a parser itself.

    The BeautifulSoup constructor issues it when the caller did not name
    the parser that ended up being used.
    """
|
|
|
class MarkupResemblesLocatorWarning(UserWarning):
    """Warning category used when the 'markup' looks like a locator.

    That is, the value handed to BeautifulSoup resembles a URL or the
    path of a file on disk rather than an actual document.
    """
|
|
|
|
|
class BeautifulSoup(Tag): |
|
"""A data structure representing a parsed HTML or XML document. |
|
|
|
Most of the methods you'll call on a BeautifulSoup object are inherited from |
|
PageElement or Tag. |
|
|
|
Internally, this class defines the basic interface called by the |
|
tree builders when converting an HTML/XML document into a data |
|
structure. The interface abstracts away the differences between |
|
parsers. To write a new tree builder, you'll need to understand |
|
these methods as a whole. |
|
|
|
These methods will be called by the BeautifulSoup constructor: |
|
* reset() |
|
* feed(markup) |
|
|
|
The tree builder may call these methods from its feed() implementation: |
|
* handle_starttag(name, attrs) # See note about return value |
|
* handle_endtag(name) |
|
* handle_data(data) # Appends to the current data node |
|
* endData(containerClass) # Ends the current data node |
|
|
|
No matter how complicated the underlying parser is, you should be |
|
able to build a tree using 'start tag' events, 'end tag' events, |
|
'data' events, and "done with data" events. |
|
|
|
If you encounter an empty-element tag (aka a self-closing tag, |
|
like HTML's <br> tag), call handle_starttag and then |
|
handle_endtag. |
|
""" |
|
|
|
|
|
|
|
|
|
# Name given to the root of the tree: reset() passes it to Tag.__init__,
# and _feed(), pushTag() and _popToTag() compare tag names against it.
ROOT_TAG_NAME = "[document]"

# Features handed to builder_registry.lookup() by the constructor when
# the caller supplies neither `features` nor `builder`.
DEFAULT_BUILDER_FEATURES = ["html", "fast"]

# Space, line feed, tab, form feed and carriage return. endData() treats
# a string made only of these characters as collapsible whitespace.
ASCII_SPACES = " \n\t\f\r"

# Template for the GuessedAtParserWarning message; the constructor fills
# in markup_type, parser, line_number and filename.
NO_PARSER_SPECIFIED_WARNING = (
    "No parser was explicitly specified, so I'm using the best available "
    "%(markup_type)s parser for this system (\"%(parser)s\"). This usually "
    "isn't a problem, but if you run this code on another system, or in a "
    "different virtual environment, it may use a different parser and "
    "behave differently.\n\n"
    "The code that caused this warning is on line %(line_number)s of the "
    "file %(filename)s. To get rid of this warning, pass the additional "
    "argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
)
|
|
|
def __init__(self, markup="", features=None, builder=None,
             parse_only=None, from_encoding=None, exclude_encodings=None,
             element_classes=None, **kwargs):
    """Build a parse tree from `markup`.

    :param markup: The markup to parse: a string, a bytestring, or a
        file-like object (anything with a `read` method).

    :param features: Desirable features of the parser. Either the
        name of a specific parser ("lxml", "lxml-xml", "html.parser",
        or "html5lib") or a type of markup ("html", "html5", "xml").
        Naming a specific parser is recommended, so that results are
        the same across platforms and virtual environments.

    :param builder: A TreeBuilder subclass to instantiate, or a
        TreeBuilder instance to use as-is, instead of looking one up
        from `features`. Only needed for a custom TreeBuilder.

    :param parse_only: A SoupStrainer. Only the parts of the document
        matching it are considered, which helps when the whole
        document would be too large to fit into memory.

    :param from_encoding: The encoding of the document, as a string,
        for when Beautiful Soup guesses wrongly. It is discarded, with
        a warning, when `markup` is already a `str`.

    :param exclude_encodings: A list of encoding names known to be
        wrong, for when the real encoding is unknown but Beautiful
        Soup's guess is wrong.

    :param element_classes: A dictionary mapping BeautifulSoup classes
        such as Tag and NavigableString to the classes that should be
        instantiated in their place while the tree is built.

    :param kwargs: Passed to the TreeBuilder constructor when this
        constructor instantiates the builder. Before that, the
        Beautiful Soup 3 arguments convertEntities, markupMassage,
        smartQuotesTo, selfClosingTags and isHTML are removed with a
        warning and otherwise ignored; the renamed arguments
        parseOnlyThese and fromEncoding are removed with a
        DeprecationWarning and used as `parse_only` / `from_encoding`
        when those were not given.

    :raise FeatureNotFound: If no tree builder matches `features`.
    :raise ParserRejectedMarkup: If the builder rejects every version
        of the markup produced by its prepare_markup().
    """
    # Beautiful Soup 3 arguments that no longer do anything: drop each
    # one that was passed and say why.
    legacy_arguments = (
        ('convertEntities',
         "BS4 does not respect the convertEntities argument to the "
         "BeautifulSoup constructor. Entities are always converted "
         "to Unicode characters."),
        ('markupMassage',
         "BS4 does not respect the markupMassage argument to the "
         "BeautifulSoup constructor. The tree builder is responsible "
         "for any necessary markup massage."),
        ('smartQuotesTo',
         "BS4 does not respect the smartQuotesTo argument to the "
         "BeautifulSoup constructor. Smart quotes are always converted "
         "to Unicode characters."),
        ('selfClosingTags',
         "BS4 does not respect the selfClosingTags argument to the "
         "BeautifulSoup constructor. The tree builder is responsible "
         "for understanding self-closing tags."),
        ('isHTML',
         "BS4 does not respect the isHTML argument to the "
         "BeautifulSoup constructor. Suggest you use "
         "features='lxml' for HTML and features='lxml-xml' for "
         "XML."),
    )
    for legacy_name, explanation in legacy_arguments:
        if legacy_name in kwargs:
            del kwargs[legacy_name]
            warnings.warn(explanation)

    def deprecated_argument(old_name, new_name):
        # Pull a renamed argument out of kwargs, warning about it.
        if old_name not in kwargs:
            return None
        warnings.warn(
            'The "%s" argument to the BeautifulSoup constructor '
            'has been renamed to "%s."' % (old_name, new_name),
            DeprecationWarning, stacklevel=3
        )
        return kwargs.pop(old_name)

    # The old spelling is only consulted when the new one is falsy.
    parse_only = parse_only or deprecated_argument(
        "parseOnlyThese", "parse_only")
    from_encoding = from_encoding or deprecated_argument(
        "fromEncoding", "from_encoding")

    if from_encoding and isinstance(markup, str):
        warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
        from_encoding = None

    self.element_classes = element_classes or dict()

    # Remember exactly what the caller asked for, to decide below
    # whether the parser was named or had to be guessed.
    requested_builder = builder
    requested_features = features

    if isinstance(builder, type):
        # A builder class was given; it is instantiated below.
        builder_class, builder = builder, None
    elif builder is None:
        if isinstance(features, str):
            features = [features]
        if features is None or len(features) == 0:
            features = self.DEFAULT_BUILDER_FEATURES
        builder_class = builder_registry.lookup(*features)
        if builder_class is None:
            raise FeatureNotFound(
                "Couldn't find a tree builder with the features you "
                "requested: %s. Do you need to install a parser library?"
                % ",".join(features))

    if builder is not None:
        # A ready-made builder instance cannot take constructor arguments.
        if kwargs:
            warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
    else:
        builder = builder_class(**kwargs)
        if (not requested_builder
                and requested_features != builder.NAME
                and requested_features not in builder.ALTERNATE_NAMES
                and markup):
            # The caller did not name this parser, so warn that it
            # was guessed.
            markup_type = "XML" if builder.is_xml else "HTML"

            # Find the file and line of the code calling this
            # constructor, for inclusion in the message.
            try:
                caller = sys._getframe(1)
            except ValueError:
                caller = None
            if caller:
                caller_globals = caller.f_globals
                line_number = caller.f_lineno
            else:
                caller_globals = sys.__dict__
                line_number = 1
            filename = caller_globals.get('__file__')
            if filename and filename.lower().endswith((".pyc", ".pyo")):
                # Drop the last character to turn .pyc/.pyo into .py.
                filename = filename[:-1]
            if filename:
                # Without a filename no warning is issued at all.
                values = dict(
                    filename=filename,
                    line_number=line_number,
                    parser=builder.NAME,
                    markup_type=markup_type
                )
                warnings.warn(
                    self.NO_PARSER_SPECIFIED_WARNING % values,
                    GuessedAtParserWarning, stacklevel=2
                )

    self.builder = builder
    self.is_xml = builder.is_xml
    self.known_xml = self.is_xml
    self._namespaces = dict()
    self.parse_only = parse_only

    if hasattr(markup, 'read'):
        # A file-like object: parse what it contains.
        markup = markup.read()
    elif len(markup) <= 256 and (
            (isinstance(markup, bytes) and b'<' not in markup)
            or (isinstance(markup, str) and '<' not in markup)
    ):
        # Short input with no '<' in it may be a URL or a filename
        # passed by mistake. Only warnings are issued; the input is
        # still parsed as markup below.
        if not self._markup_is_url(markup):
            self._markup_resembles_filename(markup)

    # Try each candidate the builder proposes until one parses.
    rejections = []
    for (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) in (
             self.builder.prepare_markup(
                 markup, from_encoding, exclude_encodings=exclude_encodings)):
        self.reset()
        self.builder.initialize_soup(self)
        try:
            self._feed()
        except ParserRejectedMarkup as rejection:
            rejections.append(rejection)
        else:
            break
    else:
        # No candidate was accepted (or there were none at all).
        other_exceptions = [str(rejection) for rejection in rejections]
        raise ParserRejectedMarkup(
            "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
        )

    # Parsing is done: drop the markup and the builder's reference
    # back to this object.
    self.markup = None
    self.builder.soup = None
|
|
|
def _clone(self): |
|
"""Create a new BeautifulSoup object with the same TreeBuilder, |
|
but not associated with any markup. |
|
|
|
This is the first step of the deepcopy process. |
|
""" |
|
clone = type(self)("", None, self.builder) |
|
|
|
|
|
|
|
clone.original_encoding = self.original_encoding |
|
return clone |
|
|
|
def __getstate__(self): |
|
|
|
d = dict(self.__dict__) |
|
if 'builder' in d and d['builder'] is not None and not self.builder.picklable: |
|
d['builder'] = type(self.builder) |
|
|
|
d['contents'] = [] |
|
d['markup'] = self.decode() |
|
|
|
|
|
|
|
|
|
if '_most_recent_element' in d: |
|
del d['_most_recent_element'] |
|
return d |
|
|
|
def __setstate__(self, state): |
|
|
|
self.__dict__ = state |
|
if isinstance(self.builder, type): |
|
self.builder = self.builder() |
|
elif not self.builder: |
|
|
|
|
|
self.builder = HTMLParserTreeBuilder() |
|
self.builder.soup = self |
|
self.reset() |
|
self._feed() |
|
return state |
|
|
|
|
|
@classmethod |
|
def _decode_markup(cls, markup): |
|
"""Ensure `markup` is bytes so it's safe to send into warnings.warn. |
|
|
|
TODO: warnings.warn had this problem back in 2010 but it might not |
|
anymore. |
|
""" |
|
if isinstance(markup, bytes): |
|
decoded = markup.decode('utf-8', 'replace') |
|
else: |
|
decoded = markup |
|
return decoded |
|
|
|
@classmethod |
|
def _markup_is_url(cls, markup): |
|
"""Error-handling method to raise a warning if incoming markup looks |
|
like a URL. |
|
|
|
:param markup: A string. |
|
:return: Whether or not the markup resembles a URL |
|
closely enough to justify a warning. |
|
""" |
|
if isinstance(markup, bytes): |
|
space = b' ' |
|
cant_start_with = (b"http:", b"https:") |
|
elif isinstance(markup, str): |
|
space = ' ' |
|
cant_start_with = ("http:", "https:") |
|
else: |
|
return False |
|
|
|
if any(markup.startswith(prefix) for prefix in cant_start_with): |
|
if not space in markup: |
|
warnings.warn( |
|
'The input looks more like a URL than markup. You may want to use' |
|
' an HTTP client like requests to get the document behind' |
|
' the URL, and feed that document to Beautiful Soup.', |
|
MarkupResemblesLocatorWarning, |
|
stacklevel=3 |
|
) |
|
return True |
|
return False |
|
|
|
@classmethod |
|
def _markup_resembles_filename(cls, markup): |
|
"""Error-handling method to raise a warning if incoming markup |
|
resembles a filename. |
|
|
|
:param markup: A bytestring or string. |
|
:return: Whether or not the markup resembles a filename |
|
closely enough to justify a warning. |
|
""" |
|
path_characters = '/\\' |
|
extensions = ['.html', '.htm', '.xml', '.xhtml', '.txt'] |
|
if isinstance(markup, bytes): |
|
path_characters = path_characters.encode("utf8") |
|
extensions = [x.encode('utf8') for x in extensions] |
|
filelike = False |
|
if any(x in markup for x in path_characters): |
|
filelike = True |
|
else: |
|
lower = markup.lower() |
|
if any(lower.endswith(ext) for ext in extensions): |
|
filelike = True |
|
if filelike: |
|
warnings.warn( |
|
'The input looks more like a filename than markup. You may' |
|
' want to open this file and pass the filehandle into' |
|
' Beautiful Soup.', |
|
MarkupResemblesLocatorWarning, stacklevel=3 |
|
) |
|
return True |
|
return False |
|
|
|
def _feed(self): |
|
"""Internal method that parses previously set markup, creating a large |
|
number of Tag and NavigableString objects. |
|
""" |
|
|
|
self.builder.reset() |
|
|
|
self.builder.feed(self.markup) |
|
|
|
self.endData() |
|
while self.currentTag.name != self.ROOT_TAG_NAME: |
|
self.popTag() |
|
|
|
def reset(self):
    """Return this object to the state of never having parsed any markup."""
    # Re-initialize this object as a tag named ROOT_TAG_NAME.
    Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
    self.hidden = 1
    self.builder.reset()

    # Start with no pending text and no open tags.
    self.current_data = list()
    self.currentTag = None
    self.tagStack = list()
    self.open_tag_counter = Counter()
    self.preserve_whitespace_tag_stack = list()
    self.string_container_stack = list()
    self._most_recent_element = None

    # The root object is the first entry on the tag stack.
    self.pushTag(self)
|
|
|
def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, |
|
sourceline=None, sourcepos=None, **kwattrs): |
|
"""Create a new Tag associated with this BeautifulSoup object. |
|
|
|
:param name: The name of the new Tag. |
|
:param namespace: The URI of the new Tag's XML namespace, if any. |
|
:param prefix: The prefix for the new Tag's XML namespace, if any. |
|
:param attrs: A dictionary of this Tag's attribute values; can |
|
be used instead of `kwattrs` for attributes like 'class' |
|
that are reserved words in Python. |
|
:param sourceline: The line number where this tag was |
|
(purportedly) found in its source document. |
|
:param sourcepos: The character position within `sourceline` where this |
|
tag was (purportedly) found. |
|
:param kwattrs: Keyword arguments for the new Tag's attribute values. |
|
|
|
""" |
|
kwattrs.update(attrs) |
|
return self.element_classes.get(Tag, Tag)( |
|
None, self.builder, name, namespace, nsprefix, kwattrs, |
|
sourceline=sourceline, sourcepos=sourcepos |
|
) |
|
|
|
def string_container(self, base_class=None): |
|
container = base_class or NavigableString |
|
|
|
|
|
container = self.element_classes.get( |
|
container, container |
|
) |
|
|
|
|
|
|
|
if self.string_container_stack and container is NavigableString: |
|
container = self.builder.string_containers.get( |
|
self.string_container_stack[-1].name, container |
|
) |
|
return container |
|
|
|
def new_string(self, s, subclass=None): |
|
"""Create a new NavigableString associated with this BeautifulSoup |
|
object. |
|
""" |
|
container = self.string_container(subclass) |
|
return container(s) |
|
|
|
def insert_before(self, *args): |
|
"""This method is part of the PageElement API, but `BeautifulSoup` doesn't implement |
|
it because there is nothing before or after it in the parse tree. |
|
""" |
|
raise NotImplementedError("BeautifulSoup objects don't support insert_before().") |
|
|
|
def insert_after(self, *args): |
|
"""This method is part of the PageElement API, but `BeautifulSoup` doesn't implement |
|
it because there is nothing before or after it in the parse tree. |
|
""" |
|
raise NotImplementedError("BeautifulSoup objects don't support insert_after().") |
|
|
|
def popTag(self): |
|
"""Internal method called by _popToTag when a tag is closed.""" |
|
tag = self.tagStack.pop() |
|
if tag.name in self.open_tag_counter: |
|
self.open_tag_counter[tag.name] -= 1 |
|
if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: |
|
self.preserve_whitespace_tag_stack.pop() |
|
if self.string_container_stack and tag == self.string_container_stack[-1]: |
|
self.string_container_stack.pop() |
|
|
|
if self.tagStack: |
|
self.currentTag = self.tagStack[-1] |
|
return self.currentTag |
|
|
|
def pushTag(self, tag): |
|
"""Internal method called by handle_starttag when a tag is opened.""" |
|
|
|
if self.currentTag is not None: |
|
self.currentTag.contents.append(tag) |
|
self.tagStack.append(tag) |
|
self.currentTag = self.tagStack[-1] |
|
if tag.name != self.ROOT_TAG_NAME: |
|
self.open_tag_counter[tag.name] += 1 |
|
if tag.name in self.builder.preserve_whitespace_tags: |
|
self.preserve_whitespace_tag_stack.append(tag) |
|
if tag.name in self.builder.string_containers: |
|
self.string_container_stack.append(tag) |
|
|
|
def endData(self, containerClass=None): |
|
"""Method called by the TreeBuilder when the end of a data segment |
|
occurs. |
|
""" |
|
if self.current_data: |
|
current_data = ''.join(self.current_data) |
|
|
|
|
|
|
|
if not self.preserve_whitespace_tag_stack: |
|
strippable = True |
|
for i in current_data: |
|
if i not in self.ASCII_SPACES: |
|
strippable = False |
|
break |
|
if strippable: |
|
if '\n' in current_data: |
|
current_data = '\n' |
|
else: |
|
current_data = ' ' |
|
|
|
|
|
self.current_data = [] |
|
|
|
|
|
if self.parse_only and len(self.tagStack) <= 1 and \ |
|
(not self.parse_only.text or \ |
|
not self.parse_only.search(current_data)): |
|
return |
|
|
|
containerClass = self.string_container(containerClass) |
|
o = containerClass(current_data) |
|
self.object_was_parsed(o) |
|
|
|
def object_was_parsed(self, o, parent=None, most_recent_element=None):
    """Method called by the TreeBuilder to integrate an object into the
    parse tree.

    :param o: The object to add.
    :param parent: The tag to append `o` to; defaults to the current
        tag.
    :param most_recent_element: The element to record as coming just
        before `o`; defaults to the most recently parsed element.
    """
    if parent is None:
        parent = self.currentTag

    previous_element = (
        most_recent_element if most_recent_element is not None
        else self._most_recent_element
    )

    if isinstance(o, Tag):
        # A Tag keeps whatever links it already has.
        next_element = o.next_element
        next_sibling = o.next_sibling
        previous_sibling = o.previous_sibling
        if previous_element is None:
            previous_element = o.previous_element
    else:
        next_element = next_sibling = previous_sibling = None

    # This must be read before `o` is linked in: a parent that already
    # has a next element gets its linkage repaired afterwards.
    needs_relinking = parent.next_element is not None

    o.setup(parent, previous_element, next_element,
            previous_sibling, next_sibling)

    self._most_recent_element = o
    parent.contents.append(o)

    if needs_relinking:
        self._linkage_fixer(parent)
|
|
|
def _linkage_fixer(self, el):
    """Make sure linkage of this fragment is sound.

    :param el: A tag whose last child has just been appended.
    """
    first_child = el.contents[0]
    last_child = el.contents[-1]

    if last_child is first_child and el.parent is not None:
        # An only child comes directly after its parent.
        el.next_element = last_child

        # Whatever used to precede the child no longer leads to it.
        former_previous = last_child.previous_element
        if former_previous is not None and former_previous is not el:
            former_previous.next_element = None

        # The child follows its parent and has no previous sibling.
        last_child.previous_element = el
        last_child.previous_sibling = None

    # The child was appended last, so nothing comes after it here.
    last_child.next_sibling = None

    # The end of the fragment is the last child itself or, if it is a
    # Tag with contents, that tag's last descendant.
    tail = last_child
    if isinstance(last_child, Tag) and last_child.contents:
        tail = last_child._last_descendant(False)

    # Link the tail to the next sibling of `el` or, failing that, of
    # the nearest ancestor that has one.
    tail.next_element = None
    tail.next_sibling = None
    ancestor = el
    while ancestor is not None:
        if ancestor.next_sibling is not None:
            tail.next_element = ancestor.next_sibling
            ancestor.next_sibling.previous_element = last_child
            break
        ancestor = ancestor.parent
|
|
|
def _popToTag(self, name, nsprefix=None, inclusivePop=True): |
|
"""Pops the tag stack up to and including the most recent |
|
instance of the given tag. |
|
|
|
If there are no open tags with the given name, nothing will be |
|
popped. |
|
|
|
:param name: Pop up to the most recent tag with this name. |
|
:param nsprefix: The namespace prefix that goes with `name`. |
|
:param inclusivePop: It this is false, pops the tag stack up |
|
to but *not* including the most recent instqance of the |
|
given tag. |
|
|
|
""" |
|
|
|
if name == self.ROOT_TAG_NAME: |
|
|
|
return |
|
|
|
most_recently_popped = None |
|
|
|
stack_size = len(self.tagStack) |
|
for i in range(stack_size - 1, 0, -1): |
|
if not self.open_tag_counter.get(name): |
|
break |
|
t = self.tagStack[i] |
|
if (name == t.name and nsprefix == t.prefix): |
|
if inclusivePop: |
|
most_recently_popped = self.popTag() |
|
break |
|
most_recently_popped = self.popTag() |
|
|
|
return most_recently_popped |
|
|
|
def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None, |
|
sourcepos=None, namespaces=None): |
|
"""Called by the tree builder when a new tag is encountered. |
|
|
|
:param name: Name of the tag. |
|
:param nsprefix: Namespace prefix for the tag. |
|
:param attrs: A dictionary of attribute values. |
|
:param sourceline: The line number where this tag was found in its |
|
source document. |
|
:param sourcepos: The character position within `sourceline` where this |
|
tag was found. |
|
:param namespaces: A dictionary of all namespace prefix mappings |
|
currently in scope in the document. |
|
|
|
If this method returns None, the tag was rejected by an active |
|
SoupStrainer. You should proceed as if the tag had not occurred |
|
in the document. For instance, if this was a self-closing tag, |
|
don't call handle_endtag. |
|
""" |
|
|
|
self.endData() |
|
|
|
if (self.parse_only and len(self.tagStack) <= 1 |
|
and (self.parse_only.text |
|
or not self.parse_only.search_tag(name, attrs))): |
|
return None |
|
|
|
tag = self.element_classes.get(Tag, Tag)( |
|
self, self.builder, name, namespace, nsprefix, attrs, |
|
self.currentTag, self._most_recent_element, |
|
sourceline=sourceline, sourcepos=sourcepos, |
|
namespaces=namespaces |
|
) |
|
if tag is None: |
|
return tag |
|
if self._most_recent_element is not None: |
|
self._most_recent_element.next_element = tag |
|
self._most_recent_element = tag |
|
self.pushTag(tag) |
|
return tag |
|
|
|
def handle_endtag(self, name, nsprefix=None): |
|
"""Called by the tree builder when an ending tag is encountered. |
|
|
|
:param name: Name of the tag. |
|
:param nsprefix: Namespace prefix for the tag. |
|
""" |
|
|
|
self.endData() |
|
self._popToTag(name, nsprefix) |
|
|
|
def handle_data(self, data): |
|
"""Called by the tree builder when a chunk of textual data is encountered.""" |
|
self.current_data.append(data) |
|
|
|
def decode(self, pretty_print=False,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal", iterator=None):
    """Returns a string or Unicode representation of the parse tree
    as an HTML or XML document.

    :param pretty_print: If this is True, indentation will be used to
        make the document more readable.
    :param eventual_encoding: The encoding of the final document.
        If this is None, the document will be a Unicode string. For an
        XML document it is named in the XML declaration, unless it is
        one of PYTHON_SPECIFIC_ENCODINGS, in which case it is treated
        as None.
    :param formatter: Passed through unchanged to the superclass
        decode().
    :param iterator: Passed through unchanged to the superclass
        decode().
    :return: The document, preceded by an XML declaration when this
        object is an XML document.
    """
    if self.is_xml:
        # An XML document starts with an XML declaration.
        if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
            # Such an encoding is not named in the declaration.
            eventual_encoding = None
        if eventual_encoding is not None:
            encoding_part = ' encoding="%s"' % eventual_encoding
        else:
            encoding_part = ''
        prefix = '<?xml version="1.0"%s?>\n' % encoding_part
    else:
        prefix = ''

    # The superclass takes an indent level: 0 to pretty-print, None not to.
    indent_level = 0 if pretty_print else None
    return prefix + super(BeautifulSoup, self).decode(
        indent_level, eventual_encoding, formatter, iterator)
|
|
|
|
|
# Short aliases for the BeautifulSoup class.
_s = _soup = BeautifulSoup
|
|
|
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Force features="xml", warn about the deprecation, and hand
        everything else to the BeautifulSoup constructor.
        """
        kwargs.update(features='xml')
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.',
            DeprecationWarning, stacklevel=2
        )
        super().__init__(*args, **kwargs)
|
|
|
|
|
class StopParsing(Exception):
    """Exception a TreeBuilder raises when it cannot continue parsing."""
|
|
|
class FeatureNotFound(ValueError):
    """Exception the BeautifulSoup constructor raises when it finds no
    parser with the requested features.
    """
|
|
|
|
|
|
|
# Run as a script: parse standard input and print it prettified.
if __name__ == '__main__':
    import sys
    document = BeautifulSoup(sys.stdin)
    print(document.prettify())
|
|