|
|
|
__license__ = "MIT" |
|
|
|
try: |
|
from collections.abc import Callable |
|
except ImportError as e: |
|
from collections import Callable |
|
import re |
|
import sys |
|
import warnings |
|
|
|
from bs4.css import CSS |
|
from bs4.formatter import ( |
|
Formatter, |
|
HTMLFormatter, |
|
XMLFormatter, |
|
) |
|
|
|
# Default character encoding. Within this file it is the encoding
# NavigableString.__new__ passes to str() when it is given a non-str value.
DEFAULT_OUTPUT_ENCODING = "utf-8"

# Matches a run of one or more non-whitespace characters.
nonwhitespace_re = re.compile(r"\S+")

# Matches a run of one or more whitespace characters.
# NOTE(review): neither pattern is used in the part of the file visible
# here; presumably other code refers to them -- confirm before removing.
whitespace_re = re.compile(r"\s+")
|
|
|
def _alias(attr):
    """Alias one attribute name to another for backward compatibility.

    :param attr: Name of the attribute that the alias forwards to.
    :return: A property whose getter reads ``attr`` from the instance and
        whose setter writes ``attr`` on the instance.
    """
    @property
    def alias(self):
        return getattr(self, attr)

    @alias.setter
    def alias(self, value):
        # A property setter is always invoked with the assigned value. The
        # earlier one-argument setter (and two-argument setattr call) made
        # every assignment through an alias raise TypeError.
        setattr(self, attr, value)
    return alias
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Encoding names for which the meta-tag stand-in values defined in this
# module (CharsetMetaAttributeValue, ContentMetaAttributeValue) emit an
# empty string from encode() instead of the encoding's name.
PYTHON_SPECIFIC_ENCODINGS = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    "raw_unicode_escape",
    "undefined",
    "unicode_escape",
    "raw-unicode-escape",
    "unicode-escape",
    "string-escape",
    "string_escape",
}
|
|
|
|
|
class NamespacedAttribute(str):
    """A namespaced string (e.g. 'xml:lang') that remembers the namespace
    ('xml') and the name ('lang') that were used to create it.
    """

    def __new__(cls, prefix, name=None, namespace=None):
        """Build the string and record its parts.

        :param prefix: The part before the colon. Used alone as the string
            value when there is no name.
        :param name: The part after the colon. Any falsy value (such as
            the empty string) is stored as None.
        :param namespace: Stored unchanged on the ``namespace`` attribute.
        :return: A NamespacedAttribute with ``prefix``, ``name`` and
            ``namespace`` attributes set.
        """
        # Collapse '' and other falsy names to None so that .name is
        # either a real name or None.
        name = name or None

        if name is None:
            text = prefix
        elif prefix:
            text = prefix + ":" + name
        else:
            # No prefix: the bare name is the whole string.
            text = name

        attribute = str.__new__(cls, text)
        attribute.prefix = prefix
        attribute.name = name
        attribute.namespace = namespace
        return attribute
|
|
|
class AttributeValueWithCharsetSubstitution(str):
    """A stand-in object for a character encoding specified in HTML.

    This class adds nothing to str by itself. It is the common base of
    CharsetMetaAttributeValue and ContentMetaAttributeValue in this
    module, each of which overrides encode() to substitute the name of
    the encoding being written.
    """
|
|
|
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'charset' attribute.

    When Beautiful Soup parses the markup '<meta charset="utf8">', the
    value of the 'charset' attribute will be one of these objects.
    """

    def __new__(cls, original_value):
        """Create the value and remember the text it was built from.

        :param original_value: The attribute value found in the markup.
        """
        value = str.__new__(cls, original_value)
        value.original_value = original_value
        return value

    def encode(self, encoding):
        """When an HTML document is being encoded to a given encoding, the
        value of a meta tag's 'charset' is the name of the encoding.

        :param encoding: Name of the encoding the document is written in.
        :return: ``encoding`` itself, or the empty string when it is one
            of PYTHON_SPECIFIC_ENCODINGS.
        """
        return '' if encoding in PYTHON_SPECIFIC_ENCODINGS else encoding
|
|
|
|
|
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'content' attribute.

    When Beautiful Soup parses the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    The value of the 'content' attribute will be one of these objects.
    """

    # Group 1 is everything up to and including 'charset=' (kept when
    # rewriting); group 3 is the charset name (replaced when rewriting).
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        """Create the value, or a plain str when no charset is declared.

        :param original_value: The attribute value found in the markup.
        :return: An instance of this class when ``original_value``
            contains a ``charset=`` declaration, otherwise an ordinary str.
        """
        if cls.CHARSET_RE.search(original_value) is None:
            # Nothing to substitute later, so no special object is needed.
            return str.__new__(str, original_value)

        value = str.__new__(cls, original_value)
        value.original_value = original_value
        return value

    def encode(self, encoding):
        """Return the original value with its charset name replaced.

        :param encoding: Name of the encoding the document is written in.
        :return: The original text with ``encoding`` substituted for the
            declared charset, or the empty string when ``encoding`` is
            one of PYTHON_SPECIFIC_ENCODINGS.
        """
        if encoding in PYTHON_SPECIFIC_ENCODINGS:
            return ''
        return self.CHARSET_RE.sub(
            lambda found: found.group(1) + encoding, self.original_value)
|
|
|
|
|
class PageElement(object):
    """Contains the navigational information for some part of the page:
    that is, its current location in the parse tree.

    NavigableString, Tag, etc. are all subclasses of PageElement.
    """

    # Read by the _is_xml property: None means "not known for this
    # element", in which case _is_xml consults the parent (or, at the top
    # of the tree, the element's own `is_xml` attribute). Any other value
    # is returned by _is_xml as-is.
    known_xml = None
|
|
|
def setup(self, parent=None, previous_element=None, next_element=None, |
|
previous_sibling=None, next_sibling=None): |
|
"""Sets up the initial relations between this element and |
|
other elements. |
|
|
|
:param parent: The parent of this element. |
|
|
|
:param previous_element: The element parsed immediately before |
|
this one. |
|
|
|
:param next_element: The element parsed immediately before |
|
this one. |
|
|
|
:param previous_sibling: The most recently encountered element |
|
on the same level of the parse tree as this one. |
|
|
|
:param previous_sibling: The next element to be encountered |
|
on the same level of the parse tree as this one. |
|
""" |
|
self.parent = parent |
|
|
|
self.previous_element = previous_element |
|
if previous_element is not None: |
|
self.previous_element.next_element = self |
|
|
|
self.next_element = next_element |
|
if self.next_element is not None: |
|
self.next_element.previous_element = self |
|
|
|
self.next_sibling = next_sibling |
|
if self.next_sibling is not None: |
|
self.next_sibling.previous_sibling = self |
|
|
|
if (previous_sibling is None |
|
and self.parent is not None and self.parent.contents): |
|
previous_sibling = self.parent.contents[-1] |
|
|
|
self.previous_sibling = previous_sibling |
|
if previous_sibling is not None: |
|
self.previous_sibling.next_sibling = self |
|
|
|
def format_string(self, s, formatter):
    """Format the given string using the given formatter.

    :param s: A string.
    :param formatter: A Formatter object, or a string naming one of the
        standard formatters. None means "no formatting".
    :return: ``s`` unchanged when ``formatter`` is None, otherwise the
        result of the formatter's ``substitute`` method.
    """
    if formatter is None:
        return s
    if isinstance(formatter, Formatter):
        chosen = formatter
    else:
        # Anything that is not already a Formatter is resolved first.
        chosen = self.formatter_for_name(formatter)
    return chosen.substitute(s)
|
|
|
def formatter_for_name(self, formatter):
    """Look up or create a Formatter for the given identifier,
    if necessary.

    :param formatter: Can be a Formatter object (used as-is), a
        function (used as the entity substitution hook for an
        XMLFormatter or HTMLFormatter), or a string (used to look
        up an XMLFormatter or HTMLFormatter in the appropriate
        registry).
    :return: A Formatter.
    :raises KeyError: Presumably, when ``formatter`` is not a key of the
        chosen class's REGISTRY (the lookup is a plain subscript).
    """
    if isinstance(formatter, Formatter):
        return formatter

    # The tree this element belongs to decides which family is used.
    formatter_class = XMLFormatter if self._is_xml else HTMLFormatter

    if isinstance(formatter, Callable):
        return formatter_class(entity_substitution=formatter)
    return formatter_class.REGISTRY[formatter]
|
|
|
@property
def _is_xml(self):
    """Is this element part of an XML tree or an HTML tree?

    This is used in formatter_for_name, when deciding whether an
    XMLFormatter or HTMLFormatter is more appropriate. It can be
    inefficient, but it should be called very rarely.

    :return: ``known_xml`` when that is not None; otherwise the parent's
        answer; otherwise, for an element with no parent, its own
        ``is_xml`` attribute (False when it has none).
    """
    known = self.known_xml
    if known is not None:
        # This element already carries a definite answer.
        return known

    parent = self.parent
    if parent is not None:
        # Defer upwards; this recurses until an answer is found.
        return parent._is_xml

    # Top of the tree: use the element's own flag, defaulting to HTML.
    return getattr(self, 'is_xml', False)
|
|
|
# camelCase spellings kept for backward compatibility. Each is a property
# built by _alias() that forwards to the snake_case attribute named in
# the call.
nextSibling = _alias("next_sibling")
previousSibling = _alias("previous_sibling")
|
|
|
# Unique sentinel used as the default for `types` arguments, so that an
# explicit None can be told apart from "argument not given".
default = object()

def _all_strings(self, strip=False, types=default):
    """Yield all strings of certain classes, possibly stripping them.

    This base version always raises; it is implemented differently in
    Tag and NavigableString.

    :param strip: Whether strings should be stripped before being yielded.
    :param types: Which string classes to consider.
    :raises NotImplementedError: Always.
    """
    raise NotImplementedError()
|
|
|
@property
def stripped_strings(self):
    """Yield all strings in this PageElement, stripping them first.

    Delegates to ``_all_strings`` with stripping switched on.

    :yield: A sequence of stripped strings.
    """
    yield from self._all_strings(True)
|
|
|
def get_text(self, separator="", strip=False,
             types=default):
    """Get all child strings of this PageElement, concatenated using the
    given separator.

    :param separator: Strings will be concatenated using this separator.

    :param strip: If True, strings will be stripped before being
        concatenated.

    :param types: A tuple of NavigableString subclasses. Any
        strings of a subclass not found in this list will be
        ignored. Although there are exceptions, the default
        behavior in most cases is to consider only NavigableString
        and CData objects. That means no comments, processing
        instructions, etc.

    :return: A string.
    """
    pieces = self._all_strings(strip, types=types)
    return separator.join(pieces)

# Alternate spellings of get_text; `text` exposes it as a read-only
# property using the default arguments.
getText = get_text
text = property(get_text)
|
|
|
def replace_with(self, *args):
    """Replace this PageElement with one or more PageElements, keeping the
    rest of the tree the same.

    :param args: One or more PageElements.
    :return: `self`, no longer part of the tree. When the only
        replacement is `self`, nothing changes and `self` is returned
        while still in the tree.
    :raises ValueError: If this element has no parent, or if one of the
        replacements is this element's parent.
    """
    if self.parent is None:
        raise ValueError(
            "Cannot replace one element with another when the "
            "element to be replaced is not part of a tree.")
    if len(args) == 1 and args[0] is self:
        # Replacing an element with itself is a no-op. Return self (not
        # None) so that every successful call yields the same type.
        return self
    if any(x is self.parent for x in args):
        raise ValueError("Cannot replace a Tag with its parent.")

    old_parent = self.parent
    my_index = old_parent.index(self)
    self.extract(_self_index=my_index)
    # Put the replacements, in order, into the slot this element vacated.
    for idx, replacement in enumerate(args, start=my_index):
        old_parent.insert(idx, replacement)
    return self

replaceWith = replace_with
|
|
|
def unwrap(self):
    """Replace this PageElement with its contents.

    :return: `self`, no longer part of the tree.
    :raises ValueError: If this element is not part of a tree.
    """
    my_parent = self.parent
    if my_parent is None:
        # The two literals are concatenated; the trailing space after
        # "that" keeps the words from running together in the message.
        raise ValueError(
            "Cannot replace an element with its contents when that "
            "element is not part of a tree.")
    my_index = my_parent.index(self)
    self.extract(_self_index=my_index)
    # Insert from last to first at the same index so that the children
    # end up in their original order. Iterate over a copy because
    # inserting a child elsewhere may remove it from self.contents.
    for child in reversed(self.contents[:]):
        my_parent.insert(my_index, child)
    return self

replace_with_children = unwrap
replaceWithChildren = unwrap
|
|
|
def wrap(self, wrap_inside):
    """Wrap this PageElement inside another one.

    :param wrap_inside: A PageElement.
    :return: `wrap_inside`, occupying the position in the tree that used
        to be occupied by `self`, and with `self` inside it.
    """
    # First swap the wrapper into this element's place, then re-attach
    # whatever replace_with() hands back as the wrapper's last child.
    wrap_inside.append(self.replace_with(wrap_inside))
    return wrap_inside
|
|
|
def extract(self, _self_index=None):
    """Destructively rips this element out of the tree.

    The element keeps its own descendants; only its links to the
    surrounding tree (parent, siblings, and the document-order chain)
    are cut, and the neighbours on either side are joined to each other.

    :param _self_index: The location of this element in its parent's
       .contents, if known. Passing this in allows for a performance
       optimization.

    :return: `self`, no longer part of the tree.
    """
    if self.parent is not None:
        if _self_index is None:
            _self_index = self.parent.index(self)
        del self.parent.contents[_self_index]

    # Find the two elements that would be next to each other in document
    # order if this element (and everything under it) were not there,
    # and connect them. _last_descendant() reads self.next_sibling, so it
    # must run before the sibling links are cleared further down.
    last_child = self._last_descendant()
    next_element = last_child.next_element

    # The identity checks skip the re-linking when the element before
    # and the element after are the same object.
    if (self.previous_element is not None and
        self.previous_element is not next_element):
        self.previous_element.next_element = next_element
    if next_element is not None and next_element is not self.previous_element:
        next_element.previous_element = self.previous_element
    self.previous_element = None
    last_child.next_element = None

    self.parent = None
    # Join the former siblings to each other, then drop our own links.
    if (self.previous_sibling is not None
        and self.previous_sibling is not self.next_sibling):
        self.previous_sibling.next_sibling = self.next_sibling
    if (self.next_sibling is not None
        and self.next_sibling is not self.previous_sibling):
        self.next_sibling.previous_sibling = self.previous_sibling
    self.previous_sibling = self.next_sibling = None
    return self
|
|
|
def _last_descendant(self, is_initialized=True, accept_self=True):
    """Finds the last element beneath this object to be parsed.

    :param is_initialized: Has `setup` been called on this PageElement
        yet? When False, the sibling links are not consulted.
    :param accept_self: Is `self` an acceptable answer to the question?
    :return: The last descendant, or None when the answer would be
        `self` and `accept_self` is False.
    """
    following = self.next_sibling if is_initialized else None
    if following is not None:
        # Shortcut: whatever comes right before the next sibling in
        # document order is this element's last descendant.
        candidate = following.previous_element
    else:
        # Walk down the last child of each Tag until reaching a leaf.
        candidate = self
        while isinstance(candidate, Tag) and candidate.contents:
            candidate = candidate.contents[-1]

    if candidate is self and not accept_self:
        return None
    return candidate

# Older camelCase-style name for the same method.
_lastRecursiveChild = _last_descendant
|
|
|
def insert(self, position, new_child):
    """Insert a new PageElement in the list of this PageElement's children.

    This works the same way as `list.insert`.

    :param position: The numeric position that should be occupied
       in `self.children` by the new PageElement.
    :param new_child: A PageElement.
    :raises ValueError: If `new_child` is None or is this element itself.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    # A plain str is promoted to a NavigableString before being linked in.
    if (isinstance(new_child, str)
        and not isinstance(new_child, NavigableString)):
        new_child = NavigableString(new_child)

    # Imported here rather than at module level.
    # NOTE(review): presumably to avoid a circular import with the bs4
    # package -- confirm.
    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # A BeautifulSoup object is never inserted as such; its children
        # are inserted one at a time instead. Iterate over a copy of its
        # contents, since each insertion moves a child out of it.
        for subchild in list(new_child.contents):
            self.insert(position, subchild)
            position += 1
        return
    position = min(position, len(self.contents))
    if hasattr(new_child, 'parent') and new_child.parent is not None:
        # The element is already somewhere in a tree; it is detached
        # before being linked in here.
        if new_child.parent is self:
            current_index = self.index(new_child)
            if current_index < position:
                # The element is moving further down this element's own
                # child list. Extracting it shifts everything after it
                # up by one, so the target index drops by one as well.
                position -= 1
        new_child.extract()

    new_child.parent = self
    previous_child = None
    if position == 0:
        # First child: in document order it directly follows this element.
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        new_child.previous_sibling.next_sibling = new_child
        # In document order the new child follows the deepest last
        # descendant of the child before it.
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    new_childs_last_element = new_child._last_descendant(False)

    if position >= len(self.contents):
        # Appending at the end: there is no next sibling, so the element
        # that follows in document order is the next sibling of the
        # nearest ancestor (starting with this element) that has one.
        new_child.next_sibling = None

        parent = self
        parents_next_sibling = None
        while parents_next_sibling is None and parent is not None:
            parents_next_sibling = parent.next_sibling
            parent = parent.parent
            if parents_next_sibling is not None:
                # Found the element that comes next in the document.
                break
        if parents_next_sibling is not None:
            new_childs_last_element.next_element = parents_next_sibling
        else:
            # No ancestor has a next sibling: the new child's last
            # element is the last element in the document.
            new_childs_last_element.next_element = None
    else:
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if new_child.next_sibling is not None:
            new_child.next_sibling.previous_sibling = new_child
        new_childs_last_element.next_element = next_child

    # Complete the document-order back link, then record the child.
    if new_childs_last_element.next_element is not None:
        new_childs_last_element.next_element.previous_element = new_childs_last_element
    self.contents.insert(position, new_child)
|
|
|
def append(self, tag):
    """Appends the given PageElement to the contents of this one.

    Implemented as an insert at the position just past the current last
    child.

    :param tag: A PageElement.
    """
    end = len(self.contents)
    self.insert(end, tag)
|
|
|
def extend(self, tags):
    """Appends the given PageElements to this one's contents.

    :param tags: A list of PageElements. If a single Tag is
        provided instead, this PageElement's contents will be extended
        with that Tag's contents.
    """
    source = tags.contents if isinstance(tags, Tag) else tags
    if isinstance(source, list):
        # Appending may move an element out of the list being walked
        # (for example another Tag's .contents), so iterate over a
        # snapshot that will not change underneath the loop.
        source = list(source)
    for element in source:
        self.append(element)
|
|
|
def insert_before(self, *args):
    """Makes the given element(s) the immediate predecessor of this one.

    All the elements will have the same parent, and the given elements
    will be immediately before this one.

    :param args: One or more PageElements.
    :raises ValueError: If this element has no parent, or if it is itself
        among `args`.
    """
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Element has no parent, so 'before' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element before itself.")

    for predecessor in args:
        # Detach first, so that the index computed below is not thrown
        # off when the predecessor is currently a sibling of this element.
        if isinstance(predecessor, PageElement):
            predecessor.extract()
        parent.insert(parent.index(self), predecessor)
|
|
|
def insert_after(self, *args):
    """Makes the given element(s) the immediate successor of this one.

    The elements will have the same parent, and the given elements
    will be immediately after this one.

    :param args: One or more PageElements.
    :raises ValueError: If this element has no parent, or if it is itself
        among `args`.
    """
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Element has no parent, so 'after' has no meaning.")
    for candidate in args:
        if candidate is self:
            raise ValueError("Can't insert an element after itself.")

    # `offset` counts the successors already placed, so that each new one
    # lands after the previous one and the original order is kept.
    for offset, successor in enumerate(args):
        # Detach first, so that the index computed below is not thrown
        # off when the successor is currently a sibling of this element.
        if isinstance(successor, PageElement):
            successor.extract()
        anchor = parent.index(self)
        parent.insert(anchor + 1 + offset, successor)
|
|
|
def find_next(self, name=None, attrs={}, string=None, **kwargs):
    """Find the first PageElement that matches the given criteria and
    appears later in the document than this PageElement.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None when nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # Single-result form of find_all_next().
    search = self.find_all_next
    return self._find_one(search, name, attrs, string, **kwargs)

findNext = find_next
|
|
|
def find_all_next(self, name=None, attrs={}, string=None, limit=None,
                  **kwargs):
    """Find all PageElements that match the given criteria and appear
    later in the document than this PageElement.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet containing PageElements.
    """
    # One more stack frame sits between the caller and _find_all, so the
    # level it uses for warnings is bumped by one.
    depth = kwargs.pop('_stacklevel', 2) + 1
    return self._find_all(
        name, attrs, string, limit, self.next_elements,
        _stacklevel=depth, **kwargs)

findAllNext = find_all_next
|
|
|
def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
    """Find the closest sibling to this PageElement that matches the
    given criteria and appears later in the document.

    All find_* methods take a common set of arguments. See the
    online documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None when nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # Single-result form of find_next_siblings().
    search = self.find_next_siblings
    return self._find_one(search, name, attrs, string, **kwargs)

findNextSibling = find_next_sibling
|
|
|
def find_next_siblings(self, name=None, attrs={}, string=None, limit=None,
                       **kwargs):
    """Find all siblings of this PageElement that match the given criteria
    and appear later in the document.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    # Account for this extra frame when _find_all issues warnings.
    depth = kwargs.pop('_stacklevel', 2) + 1
    return self._find_all(
        name, attrs, string, limit, self.next_siblings,
        _stacklevel=depth, **kwargs)

findNextSiblings = find_next_siblings
fetchNextSiblings = find_next_siblings
|
|
|
def find_previous(self, name=None, attrs={}, string=None, **kwargs):
    """Look backwards in the document from this PageElement and find the
    first PageElement that matches the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None when nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # Single-result form of find_all_previous().
    search = self.find_all_previous
    return self._find_one(search, name, attrs, string, **kwargs)

findPrevious = find_previous
|
|
|
def find_all_previous(self, name=None, attrs={}, string=None, limit=None,
                      **kwargs):
    """Look backwards in the document from this PageElement and find all
    PageElements that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    # Account for this extra frame when _find_all issues warnings.
    depth = kwargs.pop('_stacklevel', 2) + 1
    return self._find_all(
        name, attrs, string, limit, self.previous_elements,
        _stacklevel=depth, **kwargs)

findAllPrevious = find_all_previous
fetchPrevious = find_all_previous
|
|
|
def find_previous_sibling(self, name=None, attrs={}, string=None, **kwargs):
    """Returns the closest sibling to this PageElement that matches the
    given criteria and appears earlier in the document.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: A PageElement, or None when nothing matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # Single-result form of find_previous_siblings().
    search = self.find_previous_siblings
    return self._find_one(search, name, attrs, string, **kwargs)

findPreviousSibling = find_previous_sibling
|
|
|
def find_previous_siblings(self, name=None, attrs={}, string=None,
                           limit=None, **kwargs):
    """Returns all siblings to this PageElement that match the
    given criteria and appear earlier in the document.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet
    """
    # Account for this extra frame when _find_all issues warnings.
    depth = kwargs.pop('_stacklevel', 2) + 1
    return self._find_all(
        name, attrs, string, limit, self.previous_siblings,
        _stacklevel=depth, **kwargs)

findPreviousSiblings = find_previous_siblings
fetchPreviousSiblings = find_previous_siblings
|
|
|
def find_parent(self, name=None, attrs={}, **kwargs):
    """Find the closest parent of this PageElement that matches the given
    criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :kwargs: A dictionary of filters on attribute values.

    :return: A PageElement, or None when no parent matches.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    # Ask find_parents() for at most one result. This does not go through
    # _find_one() because find_parents() takes no `string` argument.
    matches = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
    return matches[0] if matches else None

findParent = find_parent
|
|
|
def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
    """Find all parents of this PageElement that match the given criteria.

    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.

    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.

    :return: Whatever ``_find_all`` returns for the ``parents`` sequence
        (a result set of matching parents, not a single element).
    """
    # Account for this extra frame when _find_all issues warnings. There
    # is no string filter for parents, hence the explicit None.
    depth = kwargs.pop('_stacklevel', 2) + 1
    return self._find_all(
        name, attrs, None, limit, self.parents,
        _stacklevel=depth, **kwargs)

findParents = find_parents
fetchParents = find_parents
|
|
|
@property
def next(self):
    """The PageElement, if any, that was parsed just after this one.

    Read-only shorthand for the ``next_element`` attribute.

    :return: A PageElement, or None.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    return self.next_element
|
|
|
@property
def previous(self):
    """The PageElement, if any, that was parsed just before this one.

    Read-only shorthand for the ``previous_element`` attribute.

    :return: A PageElement, or None.
    :rtype: bs4.element.Tag | bs4.element.NavigableString
    """
    return self.previous_element
|
|
|
|
|
|
|
def _find_one(self, method, name, attrs, string, **kwargs):
    """Run a find-all style method with a limit of one.

    :param method: A callable taking (name, attrs, string, limit) plus
        keyword arguments and returning a sequence of matches.
    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :kwargs: A dictionary of filters on attribute values.
    :return: The first match, or None when the sequence is empty.
    """
    matches = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
    return matches[0] if matches else None
|
|
|
def _find_all(self, name, attrs, string, limit, generator, **kwargs):
    """Iterate over a generator looking for things that match.

    :param name: A filter on tag name, or a ready-made SoupStrainer
        (used as-is, in which case `attrs` and `string` are not used to
        build one).
    :param attrs: A dictionary of filters on attribute values.
    :param string: A filter for a NavigableString with specific text.
    :param limit: Stop looking after finding this many results.
    :param generator: An iterable of the PageElements to examine.
    :kwargs: Further attribute filters. `_stacklevel` (default 3) is
        consumed here and used for the deprecation warning; the
        deprecated `text` is treated as `string` when `string` is None.
    :return: A ResultSet of matches.
    """
    _stacklevel = kwargs.pop('_stacklevel', 3)

    if string is None and 'text' in kwargs:
        string = kwargs.pop('text')
        warnings.warn(
            "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
            DeprecationWarning, stacklevel=_stacklevel
        )

    if isinstance(name, SoupStrainer):
        strainer = name
    else:
        strainer = SoupStrainer(name, attrs, string, **kwargs)

    if string is None and not limit and not attrs and not kwargs:
        if name is True or name is None:
            # Fast path: every Tag matches.
            result = (element for element in generator
                      if isinstance(element, Tag))
            return ResultSet(strainer, result)
        elif isinstance(name, str):
            # Fast path: every Tag with the given name matches.
            if name.count(':') == 1:
                # The name carries a prefix. A tag matches either on the
                # full 'prefix:local' name, or on the local name with a
                # matching prefix attribute.
                prefix, local_name = name.split(':', 1)
            else:
                prefix = None
                local_name = name
            # The Tag check guards both name comparisons. Without the
            # outer parentheses, `A and B or C` let the second comparison
            # run on elements that are not Tags.
            result = (
                element for element in generator
                if isinstance(element, Tag) and (
                    element.name == name
                    or (
                        element.name == local_name
                        and (prefix is None or element.prefix == prefix)
                    )
                )
            )
            return ResultSet(strainer, result)

    # General path: offer each element to the strainer.
    results = ResultSet(strainer)
    for element in generator:
        # Falsy elements are never offered to the strainer.
        if element:
            found = strainer.search(element)
            if found:
                results.append(found)
                if limit and len(results) >= limit:
                    break
    return results
|
|
|
|
|
|
|
@property
def next_elements(self):
    """All PageElements that were parsed after this one.

    :yield: A sequence of PageElements, following the ``next_element``
        links until one of them is None.
    """
    current = self.next_element
    while current is not None:
        yield current
        # The link is read only after the consumer resumes the generator.
        current = current.next_element
|
|
|
@property
def next_siblings(self):
    """All PageElements that are siblings of this one but were parsed
    later.

    :yield: A sequence of PageElements, following the ``next_sibling``
        links until one of them is None.
    """
    current = self.next_sibling
    while current is not None:
        yield current
        # The link is read only after the consumer resumes the generator.
        current = current.next_sibling
|
|
|
@property
def previous_elements(self):
    """All PageElements that were parsed before this one.

    :yield: A sequence of PageElements, nearest first, following the
        ``previous_element`` links until one of them is None.
    """
    current = self.previous_element
    while current is not None:
        yield current
        # The link is read only after the consumer resumes the generator.
        current = current.previous_element
|
|
|
@property
def previous_siblings(self):
    """All PageElements that are siblings of this one but were parsed
    earlier.

    :yield: A sequence of PageElements, nearest first, following the
        ``previous_sibling`` links until one of them is None.
    """
    current = self.previous_sibling
    while current is not None:
        yield current
        # The link is read only after the consumer resumes the generator.
        current = current.previous_sibling
|
|
|
@property
def parents(self):
    """All PageElements that are parents of this PageElement.

    :yield: A sequence of PageElements, innermost first, following the
        ``parent`` links until one of them is None.
    """
    ancestor = self.parent
    while ancestor is not None:
        yield ancestor
        # The link is read only after the consumer resumes the generator.
        ancestor = ancestor.parent
|
|
|
@property
def decomposed(self):
    """Check whether a PageElement has been decomposed.

    Reads the ``_decomposed`` attribute; an element that has no such
    attribute counts as not decomposed.

    :rtype: bool
    """
    flag = getattr(self, '_decomposed', False)
    # `or False` turns a falsy stored value such as None into False.
    return flag or False
|
|
|
|
|
|
|
def nextGenerator(self):
    """Method-call spelling of the ``next_elements`` property.

    NOTE(review): presumably kept for callers written against an older
    API -- confirm before removing.

    :return: The value of ``self.next_elements``.
    """
    return self.next_elements
|
|
|
def nextSiblingGenerator(self):
    """Method-call spelling of the ``next_siblings`` property.

    NOTE(review): presumably kept for callers written against an older
    API -- confirm before removing.

    :return: The value of ``self.next_siblings``.
    """
    return self.next_siblings
|
|
|
def previousGenerator(self):
    """Method-call spelling of the ``previous_elements`` property.

    NOTE(review): presumably kept for callers written against an older
    API -- confirm before removing.

    :return: The value of ``self.previous_elements``.
    """
    return self.previous_elements
|
|
|
def previousSiblingGenerator(self):
    """Method-call spelling of the ``previous_siblings`` property.

    NOTE(review): presumably kept for callers written against an older
    API -- confirm before removing.

    :return: The value of ``self.previous_siblings``.
    """
    return self.previous_siblings
|
|
|
def parentGenerator(self):
    """Method-call spelling of the ``parents`` property.

    NOTE(review): presumably kept for callers written against an older
    API -- confirm before removing.

    :return: The value of ``self.parents``.
    """
    return self.parents
|
|
|
|
|
class NavigableString(str, PageElement): |
|
"""A Python Unicode string that is part of a parse tree. |
|
|
|
When Beautiful Soup parses the markup <b>penguin</b>, it will |
|
create a NavigableString for the string "penguin". |
|
""" |
|
|
|
PREFIX = '' |
|
SUFFIX = '' |
|
|
|
def __new__(cls, value): |
|
"""Create a new NavigableString. |
|
|
|
When unpickling a NavigableString, this method is called with |
|
the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be |
|
passed in to the superclass's __new__ or the superclass won't know |
|
how to handle non-ASCII characters. |
|
""" |
|
if isinstance(value, str): |
|
u = str.__new__(cls, value) |
|
else: |
|
u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) |
|
u.setup() |
|
return u |
|
|
|
def __deepcopy__(self, memo, recursive=False): |
|
"""A copy of a NavigableString has the same contents and class |
|
as the original, but it is not connected to the parse tree. |
|
|
|
:param recursive: This parameter is ignored; it's only defined |
|
so that NavigableString.__deepcopy__ implements the same |
|
signature as Tag.__deepcopy__. |
|
""" |
|
return type(self)(self) |
|
|
|
def __copy__(self): |
|
"""A copy of a NavigableString can only be a deep copy, because |
|
only one PageElement can occupy a given place in a parse tree. |
|
""" |
|
return self.__deepcopy__({}) |
|
|
|
def __getnewargs__(self): |
|
return (str(self),) |
|
|
|
def __getattr__(self, attr): |
|
"""text.string gives you text. This is for backwards |
|
compatibility for Navigable*String, but for CData* it lets you |
|
get the string without the CData wrapper.""" |
|
if attr == 'string': |
|
return self |
|
else: |
|
raise AttributeError( |
|
"'%s' object has no attribute '%s'" % ( |
|
self.__class__.__name__, attr)) |
|
|
|
def output_ready(self, formatter="minimal"): |
|
"""Run the string through the provided formatter. |
|
|
|
:param formatter: A Formatter object, or a string naming one of the standard formatters. |
|
""" |
|
output = self.format_string(self, formatter) |
|
return self.PREFIX + output + self.SUFFIX |
|
|
|
@property |
|
def name(self): |
|
"""Since a NavigableString is not a Tag, it has no .name. |
|
|
|
This property is implemented so that code like this doesn't crash |
|
when run on a mixture of Tag and NavigableString objects: |
|
[x.name for x in tag.children] |
|
""" |
|
return None |
|
|
|
@name.setter |
|
def name(self, name): |
|
"""Prevent NavigableString.name from ever being set.""" |
|
raise AttributeError("A NavigableString cannot be given a name.") |
|
|
|
def _all_strings(self, strip=False, types=PageElement.default):
    """Yield this string, provided it is of an accepted class and is
    not empty.

    Implementing this on NavigableString lets string objects offer the
    same text-extraction helpers (get_text() and friends) as other
    PageElements.

    :param strip: If True, the string is stripped before the emptiness
        check and before being yielded.
    :param types: Either a single class or a collection of classes.
        The string is yielded only when its exact type is that class,
        or is a member of the collection. None disables the type
        check. When left at the default sentinel,
        Tag.DEFAULT_INTERESTING_STRING_TYPES is used.
    :yield: At most one value: this string (possibly stripped).
    """
    if types is self.default:
        types = Tag.DEFAULT_INTERESTING_STRING_TYPES

    if types is not None:
        own_type = type(self)
        # The comparison is on the exact type, not isinstance(), so
        # subclasses of an accepted class do not qualify.
        if isinstance(types, type):
            accepted = own_type is types
        else:
            accepted = own_type in types
        if not accepted:
            return

    text = self.strip() if strip else self
    if len(text) > 0:
        yield text
strings = property(_all_strings)
|
|
|
class PreformattedString(NavigableString):
    """A NavigableString that bypasses the normal formatting rules.

    Abstract base for special kinds of strings such as comments (the
    Comment class) and CDATA blocks (the CData class). Subclasses set
    PREFIX and SUFFIX to the markup that surrounds their text.
    """

    # Text emitted immediately before / after the string on output.
    PREFIX = ''
    SUFFIX = ''

    def output_ready(self, formatter=None):
        """Return the raw string wrapped in this class's PREFIX and SUFFIX.

        :param formatter: A Formatter object, or a string naming one
            of the standard formatters. When given, the string is
            still passed through `format_string`, but only so that any
            side effects happen; the formatted result is discarded.
        :return: PREFIX + the unmodified string + SUFFIX.
        """
        if formatter is not None:
            # Called for side effects only; the return value is unused.
            self.format_string(self, formatter)
        opening = self.PREFIX
        closing = self.SUFFIX
        return opening + self + closing
|
|
|
class CData(PreformattedString):
    """A CDATA section; output is wrapped in '<![CDATA[' and ']]>'."""

    PREFIX = "<![CDATA["
    SUFFIX = "]]>"
|
|
|
class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction; output is wrapped in '<?' and '>'."""

    PREFIX = "<?"
    SUFFIX = ">"
|
|
|
class XMLProcessingInstruction(ProcessingInstruction):
    """An XML processing instruction; output is wrapped in '<?' and '?>'."""

    PREFIX = "<?"
    SUFFIX = "?>"
|
|
|
class Comment(PreformattedString):
    """An HTML or XML comment; output is wrapped in '<!--' and '-->'."""

    PREFIX = "<!--"
    SUFFIX = "-->"
|
|
|
|
|
class Declaration(PreformattedString):
    """An XML declaration; output is wrapped in '<?' and '?>'."""

    PREFIX = "<?"
    SUFFIX = "?>"
|
|
|
|
|
class Doctype(PreformattedString):
    """A document type declaration, rendered as '<!DOCTYPE ...>' followed
    by a newline.
    """

    PREFIX = "<!DOCTYPE "
    SUFFIX = ">\n"

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Build a Doctype from a root element name and optional
        public and system identifiers.

        :param name: The name of the document's root element, e.g.
            'html'. A false value is treated as the empty string.
        :param pub_id: The Formal Public Identifier for this document
            type, e.g. '-//W3C//DTD XHTML 1.1//EN', or None.
        :param system_id: The system identifier for this document
            type, e.g.
            'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd',
            or None.

        :return: A Doctype.
        """
        declaration = name if name else ''
        has_public = pub_id is not None
        has_system = system_id is not None

        if has_public:
            declaration += ' PUBLIC "%s"' % pub_id

        if has_system:
            if has_public:
                # With a public ID, the system ID follows it with no keyword.
                declaration += ' "%s"' % system_id
            else:
                declaration += ' SYSTEM "%s"' % system_id

        return Doctype(declaration)
|
|
|
|
|
class Stylesheet(NavigableString):
    """A NavigableString holding a stylesheet (probably CSS).

    The separate class lets embedded stylesheets be told apart from
    textual content.
    """
|
|
|
|
|
class Script(NavigableString):
    """A NavigableString holding an executable script (probably
    Javascript).

    The separate class lets executable code be told apart from
    textual content.
    """
|
|
|
|
|
class TemplateString(NavigableString):
    """A NavigableString for text found inside an HTML template that
    is embedded in a larger document.

    The separate class lets such strings be told apart from the main
    body of the document.
    """
|
|
|
|
|
class RubyTextString(NavigableString):
    """A NavigableString for the contents of the <rt> HTML element.

    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element

    The separate class lets such strings be told apart from the
    strings they annotate.
    """
|
|
|
|
|
class RubyParenthesisString(NavigableString):
    """A NavigableString for the contents of the <rp> HTML element.

    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element
    """
|
|
|
|
|
class Tag(PageElement): |
|
"""Represents an HTML or XML tag that is part of a parse tree, along |
|
with its attributes and contents. |
|
|
|
When Beautiful Soup parses the markup <b>penguin</b>, it will |
|
create a Tag object representing the <b> tag. |
|
""" |
|
|
|
def __init__(self, parser=None, builder=None, name=None, namespace=None, |
|
prefix=None, attrs=None, parent=None, previous=None, |
|
is_xml=None, sourceline=None, sourcepos=None, |
|
can_be_empty_element=None, cdata_list_attributes=None, |
|
preserve_whitespace_tags=None, |
|
interesting_string_types=None, |
|
namespaces=None |
|
): |
|
"""Basic constructor. |
|
|
|
:param parser: A BeautifulSoup object. |
|
:param builder: A TreeBuilder. |
|
:param name: The name of the tag. |
|
:param namespace: The URI of this Tag's XML namespace, if any. |
|
:param prefix: The prefix for this Tag's XML namespace, if any. |
|
:param attrs: A dictionary of this Tag's attribute values. |
|
:param parent: The PageElement to use as this Tag's parent. |
|
:param previous: The PageElement that was parsed immediately before |
|
this tag. |
|
:param is_xml: If True, this is an XML tag. Otherwise, this is an |
|
HTML tag. |
|
:param sourceline: The line number where this tag was found in its |
|
source document. |
|
:param sourcepos: The character position within `sourceline` where this |
|
tag was found. |
|
:param can_be_empty_element: If True, this tag should be |
|
represented as <tag/>. If False, this tag should be represented |
|
as <tag></tag>. |
|
:param cdata_list_attributes: A list of attributes whose values should |
|
be treated as CDATA if they ever show up on this tag. |
|
:param preserve_whitespace_tags: A list of tag names whose contents |
|
should have their whitespace preserved. |
|
:param interesting_string_types: This is a NavigableString |
|
subclass or a tuple of them. When iterating over this |
|
Tag's strings in methods like Tag.strings or Tag.get_text, |
|
these are the types of strings that are interesting enough |
|
to be considered. The default is to consider |
|
NavigableString and CData the only interesting string |
|
subtypes. |
|
:param namespaces: A dictionary mapping currently active |
|
namespace prefixes to URIs. This can be used later to |
|
construct CSS selectors. |
|
""" |
|
if parser is None: |
|
self.parser_class = None |
|
else: |
|
|
|
|
|
self.parser_class = parser.__class__ |
|
if name is None: |
|
raise ValueError("No value provided for new tag's name.") |
|
self.name = name |
|
self.namespace = namespace |
|
self._namespaces = namespaces or {} |
|
self.prefix = prefix |
|
if ((not builder or builder.store_line_numbers) |
|
and (sourceline is not None or sourcepos is not None)): |
|
self.sourceline = sourceline |
|
self.sourcepos = sourcepos |
|
if attrs is None: |
|
attrs = {} |
|
elif attrs: |
|
if builder is not None and builder.cdata_list_attributes: |
|
attrs = builder._replace_cdata_list_attribute_values( |
|
self.name, attrs) |
|
else: |
|
attrs = dict(attrs) |
|
else: |
|
attrs = dict(attrs) |
|
|
|
|
|
|
|
if builder: |
|
self.known_xml = builder.is_xml |
|
else: |
|
self.known_xml = is_xml |
|
self.attrs = attrs |
|
self.contents = [] |
|
self.setup(parent, previous) |
|
self.hidden = False |
|
|
|
if builder is None: |
|
|
|
|
|
|
|
self.can_be_empty_element = can_be_empty_element |
|
self.cdata_list_attributes = cdata_list_attributes |
|
self.preserve_whitespace_tags = preserve_whitespace_tags |
|
self.interesting_string_types = interesting_string_types |
|
else: |
|
|
|
builder.set_up_substitutions(self) |
|
|
|
|
|
self.can_be_empty_element = builder.can_be_empty_element(name) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
self.cdata_list_attributes = builder.cdata_list_attributes |
|
|
|
|
|
|
|
self.preserve_whitespace_tags = builder.preserve_whitespace_tags |
|
|
|
if self.name in builder.string_containers: |
|
|
|
|
|
self.interesting_string_types = builder.string_containers[self.name] |
|
else: |
|
self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES |
|
|
|
parserClass = _alias("parser_class") |
|
|
|
def __deepcopy__(self, memo, recursive=True): |
|
"""A deepcopy of a Tag is a new Tag, unconnected to the parse tree. |
|
Its contents are a copy of the old Tag's contents. |
|
""" |
|
clone = self._clone() |
|
|
|
if recursive: |
|
|
|
|
|
tag_stack = [clone] |
|
for event, element in self._event_stream(self.descendants): |
|
if event is Tag.END_ELEMENT_EVENT: |
|
|
|
|
|
tag_stack.pop() |
|
else: |
|
descendant_clone = element.__deepcopy__( |
|
memo, recursive=False |
|
) |
|
|
|
tag_stack[-1].append(descendant_clone) |
|
|
|
if event is Tag.START_ELEMENT_EVENT: |
|
|
|
|
|
tag_stack.append(descendant_clone) |
|
return clone |
|
|
|
def __copy__(self): |
|
"""A copy of a Tag must always be a deep copy, because a Tag's |
|
children can only have one parent at a time. |
|
""" |
|
return self.__deepcopy__({}) |
|
|
|
def _clone(self): |
|
"""Create a new Tag just like this one, but with no |
|
contents and unattached to any parse tree. |
|
|
|
This is the first step in the deepcopy process. |
|
""" |
|
clone = type(self)( |
|
None, None, self.name, self.namespace, |
|
self.prefix, self.attrs, is_xml=self._is_xml, |
|
sourceline=self.sourceline, sourcepos=self.sourcepos, |
|
can_be_empty_element=self.can_be_empty_element, |
|
cdata_list_attributes=self.cdata_list_attributes, |
|
preserve_whitespace_tags=self.preserve_whitespace_tags, |
|
interesting_string_types=self.interesting_string_types |
|
) |
|
for attr in ('can_be_empty_element', 'hidden'): |
|
setattr(clone, attr, getattr(self, attr)) |
|
return clone |
|
|
|
@property |
|
def is_empty_element(self): |
|
"""Is this tag an empty-element tag? (aka a self-closing tag) |
|
|
|
A tag that has contents is never an empty-element tag. |
|
|
|
A tag that has no contents may or may not be an empty-element |
|
tag. It depends on the builder used to create the tag. If the |
|
builder has a designated list of empty-element tags, then only |
|
a tag whose name shows up in that list is considered an |
|
empty-element tag. |
|
|
|
If the builder has no designated list of empty-element tags, |
|
then any tag with no contents is an empty-element tag. |
|
""" |
|
return len(self.contents) == 0 and self.can_be_empty_element |
|
isSelfClosing = is_empty_element |
|
|
|
@property |
|
def string(self): |
|
"""Convenience property to get the single string within this |
|
PageElement. |
|
|
|
TODO It might make sense to have NavigableString.string return |
|
itself. |
|
|
|
:return: If this element has a single string child, return |
|
value is that string. If this element has one child tag, |
|
return value is the 'string' attribute of the child tag, |
|
recursively. If this element is itself a string, has no |
|
children, or has more than one child, return value is None. |
|
""" |
|
if len(self.contents) != 1: |
|
return None |
|
child = self.contents[0] |
|
if isinstance(child, NavigableString): |
|
return child |
|
return child.string |
|
|
|
@string.setter |
|
def string(self, string): |
|
"""Replace this PageElement's contents with `string`.""" |
|
self.clear() |
|
self.append(string.__class__(string)) |
|
|
|
DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData) |
|
def _all_strings(self, strip=False, types=PageElement.default): |
|
"""Yield all strings of certain classes, possibly stripping them. |
|
|
|
:param strip: If True, all strings will be stripped before being |
|
yielded. |
|
|
|
:param types: A tuple of NavigableString subclasses. Any strings of |
|
a subclass not found in this list will be ignored. By |
|
default, the subclasses considered are the ones found in |
|
self.interesting_string_types. If that's not specified, |
|
only NavigableString and CData objects will be |
|
considered. That means no comments, processing |
|
instructions, etc. |
|
|
|
:yield: A sequence of strings. |
|
|
|
""" |
|
if types is self.default: |
|
types = self.interesting_string_types |
|
|
|
for descendant in self.descendants: |
|
if (types is None and not isinstance(descendant, NavigableString)): |
|
continue |
|
descendant_type = type(descendant) |
|
if isinstance(types, type): |
|
if descendant_type is not types: |
|
|
|
continue |
|
elif types is not None and descendant_type not in types: |
|
|
|
continue |
|
if strip: |
|
descendant = descendant.strip() |
|
if len(descendant) == 0: |
|
continue |
|
yield descendant |
|
strings = property(_all_strings) |
|
|
|
def decompose(self): |
|
"""Recursively destroys this PageElement and its children. |
|
|
|
This element will be removed from the tree and wiped out; so |
|
will everything beneath it. |
|
|
|
The behavior of a decomposed PageElement is undefined and you |
|
should never use one for anything, but if you need to _check_ |
|
whether an element has been decomposed, you can use the |
|
`decomposed` property. |
|
""" |
|
self.extract() |
|
i = self |
|
while i is not None: |
|
n = i.next_element |
|
i.__dict__.clear() |
|
i.contents = [] |
|
i._decomposed = True |
|
i = n |
|
|
|
def clear(self, decompose=False): |
|
"""Wipe out all children of this PageElement by calling extract() |
|
on them. |
|
|
|
:param decompose: If this is True, decompose() (a more |
|
destructive method) will be called instead of extract(). |
|
""" |
|
if decompose: |
|
for element in self.contents[:]: |
|
if isinstance(element, Tag): |
|
element.decompose() |
|
else: |
|
element.extract() |
|
else: |
|
for element in self.contents[:]: |
|
element.extract() |
|
|
|
def smooth(self): |
|
"""Smooth out this element's children by consolidating consecutive |
|
strings. |
|
|
|
This makes pretty-printed output look more natural following a |
|
lot of operations that modified the tree. |
|
""" |
|
|
|
|
|
|
|
|
|
marked = [] |
|
for i, a in enumerate(self.contents): |
|
if isinstance(a, Tag): |
|
|
|
a.smooth() |
|
if i == len(self.contents)-1: |
|
|
|
|
|
continue |
|
b = self.contents[i+1] |
|
if (isinstance(a, NavigableString) |
|
and isinstance(b, NavigableString) |
|
and not isinstance(a, PreformattedString) |
|
and not isinstance(b, PreformattedString) |
|
): |
|
marked.append(i) |
|
|
|
|
|
|
|
|
|
for i in reversed(marked): |
|
a = self.contents[i] |
|
b = self.contents[i+1] |
|
b.extract() |
|
n = NavigableString(a+b) |
|
a.replace_with(n) |
|
|
|
def index(self, element): |
|
"""Find the index of a child by identity, not value. |
|
|
|
Avoids issues with tag.contents.index(element) getting the |
|
index of equal elements. |
|
|
|
:param element: Look for this PageElement in `self.contents`. |
|
""" |
|
for i, child in enumerate(self.contents): |
|
if child is element: |
|
return i |
|
raise ValueError("Tag.index: element not in tag") |
|
|
|
def get(self, key, default=None): |
|
"""Returns the value of the 'key' attribute for the tag, or |
|
the value given for 'default' if it doesn't have that |
|
attribute.""" |
|
return self.attrs.get(key, default) |
|
|
|
def get_attribute_list(self, key, default=None): |
|
"""The same as get(), but always returns a list. |
|
|
|
:param key: The attribute to look for. |
|
:param default: Use this value if the attribute is not present |
|
on this PageElement. |
|
:return: A list of values, probably containing only a single |
|
value. |
|
""" |
|
value = self.get(key, default) |
|
if not isinstance(value, list): |
|
value = [value] |
|
return value |
|
|
|
def has_attr(self, key): |
|
"""Does this PageElement have an attribute with the given name?""" |
|
return key in self.attrs |
|
|
|
def __hash__(self): |
|
return str(self).__hash__() |
|
|
|
def __getitem__(self, key): |
|
"""tag[key] returns the value of the 'key' attribute for the Tag, |
|
and throws an exception if it's not there.""" |
|
return self.attrs[key] |
|
|
|
def __iter__(self): |
|
"Iterating over a Tag iterates over its contents." |
|
return iter(self.contents) |
|
|
|
def __len__(self): |
|
"The length of a Tag is the length of its list of contents." |
|
return len(self.contents) |
|
|
|
def __contains__(self, x): |
|
return x in self.contents |
|
|
|
def __bool__(self): |
|
"A tag is non-None even if it has no contents." |
|
return True |
|
|
|
def __setitem__(self, key, value): |
|
"""Setting tag[key] sets the value of the 'key' attribute for the |
|
tag.""" |
|
self.attrs[key] = value |
|
|
|
def __delitem__(self, key): |
|
"Deleting tag[key] deletes all 'key' attributes for the tag." |
|
self.attrs.pop(key, None) |
|
|
|
def __call__(self, *args, **kwargs): |
|
"""Calling a Tag like a function is the same as calling its |
|
find_all() method. Eg. tag('a') returns a list of all the A tags |
|
found within this tag.""" |
|
return self.find_all(*args, **kwargs) |
|
|
|
def __getattr__(self, tag): |
|
"""Calling tag.subtag is the same as calling tag.find(name="subtag")""" |
|
|
|
if len(tag) > 3 and tag.endswith('Tag'): |
|
|
|
tag_name = tag[:-3] |
|
warnings.warn( |
|
'.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( |
|
name=tag_name |
|
), |
|
DeprecationWarning, stacklevel=2 |
|
) |
|
return self.find(tag_name) |
|
|
|
elif not tag.startswith("__") and not tag == "contents": |
|
return self.find(tag) |
|
raise AttributeError( |
|
"'%s' object has no attribute '%s'" % (self.__class__, tag)) |
|
|
|
def __eq__(self, other): |
|
"""Returns true iff this Tag has the same name, the same attributes, |
|
and the same contents (recursively) as `other`.""" |
|
if self is other: |
|
return True |
|
if (not hasattr(other, 'name') or |
|
not hasattr(other, 'attrs') or |
|
not hasattr(other, 'contents') or |
|
self.name != other.name or |
|
self.attrs != other.attrs or |
|
len(self) != len(other)): |
|
return False |
|
for i, my_child in enumerate(self.contents): |
|
if my_child != other.contents[i]: |
|
return False |
|
return True |
|
|
|
def __ne__(self, other): |
|
"""Returns true iff this Tag is not identical to `other`, |
|
as defined in __eq__.""" |
|
return not self == other |
|
|
|
def __repr__(self, encoding="unicode-escape"): |
|
"""Renders this PageElement as a string. |
|
|
|
:param encoding: The encoding to use (Python 2 only). |
|
TODO: This is now ignored and a warning should be issued |
|
if a value is provided. |
|
:return: A (Unicode) string. |
|
""" |
|
|
|
return self.decode() |
|
|
|
def __unicode__(self): |
|
"""Renders this PageElement as a Unicode string.""" |
|
return self.decode() |
|
|
|
__str__ = __repr__ = __unicode__ |
|
|
|
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, |
|
indent_level=None, formatter="minimal", |
|
errors="xmlcharrefreplace"): |
|
"""Render a bytestring representation of this PageElement and its |
|
contents. |
|
|
|
:param encoding: The destination encoding. |
|
:param indent_level: Each line of the rendering will be |
|
indented this many levels. (The formatter decides what a |
|
'level' means in terms of spaces or other characters |
|
output.) Used internally in recursive calls while |
|
pretty-printing. |
|
:param formatter: A Formatter object, or a string naming one of |
|
the standard formatters. |
|
:param errors: An error handling strategy such as |
|
'xmlcharrefreplace'. This value is passed along into |
|
encode() and its value should be one of the constants |
|
defined by Python. |
|
:return: A bytestring. |
|
|
|
""" |
|
|
|
|
|
u = self.decode(indent_level, encoding, formatter) |
|
return u.encode(encoding, errors) |
|
|
|
def decode(self, indent_level=None, |
|
eventual_encoding=DEFAULT_OUTPUT_ENCODING, |
|
formatter="minimal", |
|
iterator=None): |
|
pieces = [] |
|
|
|
|
|
|
|
if not isinstance(formatter, Formatter): |
|
formatter = self.formatter_for_name(formatter) |
|
|
|
if indent_level is True: |
|
indent_level = 0 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
string_literal_tag = None |
|
|
|
for event, element in self._event_stream(iterator): |
|
if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT): |
|
piece = element._format_tag( |
|
eventual_encoding, formatter, opening=True |
|
) |
|
elif event is Tag.END_ELEMENT_EVENT: |
|
piece = element._format_tag( |
|
eventual_encoding, formatter, opening=False |
|
) |
|
if indent_level is not None: |
|
indent_level -= 1 |
|
else: |
|
piece = element.output_ready(formatter) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if string_literal_tag: |
|
indent_before = indent_after = False |
|
else: |
|
indent_before = indent_after = True |
|
|
|
|
|
|
|
|
|
if (event is Tag.START_ELEMENT_EVENT |
|
and not string_literal_tag |
|
and not element._should_pretty_print()): |
|
|
|
|
|
|
|
|
|
indent_before = True |
|
indent_after = False |
|
string_literal_tag = element |
|
elif (event is Tag.END_ELEMENT_EVENT |
|
and element is string_literal_tag): |
|
|
|
|
|
|
|
indent_before = False |
|
indent_after = True |
|
string_literal_tag = None |
|
|
|
|
|
|
|
if indent_level is not None: |
|
if (indent_before or indent_after): |
|
if isinstance(element, NavigableString): |
|
piece = piece.strip() |
|
if piece: |
|
piece = self._indent_string( |
|
piece, indent_level, formatter, |
|
indent_before, indent_after |
|
) |
|
if event == Tag.START_ELEMENT_EVENT: |
|
indent_level += 1 |
|
pieces.append(piece) |
|
return "".join(pieces) |
|
|
|
|
|
START_ELEMENT_EVENT = object() |
|
END_ELEMENT_EVENT = object() |
|
EMPTY_ELEMENT_EVENT = object() |
|
STRING_ELEMENT_EVENT = object() |
|
|
|
def _event_stream(self, iterator=None): |
|
"""Yield a sequence of events that can be used to reconstruct the DOM |
|
for this element. |
|
|
|
This lets us recreate the nested structure of this element |
|
(e.g. when formatting it as a string) without using recursive |
|
method calls. |
|
|
|
This is similar in concept to the SAX API, but it's a simpler |
|
interface designed for internal use. The events are different |
|
from SAX and the arguments associated with the events are Tags |
|
and other Beautiful Soup objects. |
|
|
|
:param iterator: An alternate iterator to use when traversing |
|
the tree. |
|
""" |
|
tag_stack = [] |
|
|
|
iterator = iterator or self.self_and_descendants |
|
|
|
for c in iterator: |
|
|
|
|
|
|
|
while tag_stack and c.parent != tag_stack[-1]: |
|
now_closed_tag = tag_stack.pop() |
|
yield Tag.END_ELEMENT_EVENT, now_closed_tag |
|
|
|
if isinstance(c, Tag): |
|
if c.is_empty_element: |
|
yield Tag.EMPTY_ELEMENT_EVENT, c |
|
else: |
|
yield Tag.START_ELEMENT_EVENT, c |
|
tag_stack.append(c) |
|
continue |
|
else: |
|
yield Tag.STRING_ELEMENT_EVENT, c |
|
|
|
while tag_stack: |
|
now_closed_tag = tag_stack.pop() |
|
yield Tag.END_ELEMENT_EVENT, now_closed_tag |
|
|
|
def _indent_string(self, s, indent_level, formatter, |
|
indent_before, indent_after): |
|
"""Add indentation whitespace before and/or after a string. |
|
|
|
:param s: The string to amend with whitespace. |
|
:param indent_level: The indentation level; affects how much |
|
whitespace goes before the string. |
|
:param indent_before: Whether or not to add whitespace |
|
before the string. |
|
:param indent_after: Whether or not to add whitespace |
|
(a newline) after the string. |
|
""" |
|
space_before = '' |
|
if indent_before and indent_level: |
|
space_before = (formatter.indent * indent_level) |
|
|
|
space_after = '' |
|
if indent_after: |
|
space_after = "\n" |
|
|
|
return space_before + s + space_after |
|
|
|
def _format_tag(self, eventual_encoding, formatter, opening): |
|
if self.hidden: |
|
|
|
|
|
return '' |
|
|
|
|
|
|
|
|
|
closing_slash = '' |
|
if not opening: |
|
closing_slash = '/' |
|
|
|
|
|
prefix = '' |
|
if self.prefix: |
|
prefix = self.prefix + ":" |
|
|
|
|
|
attribute_string = '' |
|
if opening: |
|
attributes = formatter.attributes(self) |
|
attrs = [] |
|
for key, val in attributes: |
|
if val is None: |
|
decoded = key |
|
else: |
|
if isinstance(val, list) or isinstance(val, tuple): |
|
val = ' '.join(val) |
|
elif not isinstance(val, str): |
|
val = str(val) |
|
elif ( |
|
isinstance(val, AttributeValueWithCharsetSubstitution) |
|
and eventual_encoding is not None |
|
): |
|
val = val.encode(eventual_encoding) |
|
|
|
text = formatter.attribute_value(val) |
|
decoded = ( |
|
str(key) + '=' |
|
+ formatter.quoted_attribute_value(text)) |
|
attrs.append(decoded) |
|
if attrs: |
|
attribute_string = ' ' + ' '.join(attrs) |
|
|
|
|
|
|
|
void_element_closing_slash = '' |
|
if self.is_empty_element: |
|
void_element_closing_slash = formatter.void_element_close_prefix or '' |
|
|
|
|
|
return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>' |
|
|
|
def _should_pretty_print(self, indent_level=1): |
|
"""Should this tag be pretty-printed? |
|
|
|
Most of them should, but some (such as <pre> in HTML |
|
documents) should not. |
|
""" |
|
return ( |
|
indent_level is not None |
|
and ( |
|
not self.preserve_whitespace_tags |
|
or self.name not in self.preserve_whitespace_tags |
|
) |
|
) |
|
|
|
def prettify(self, encoding=None, formatter="minimal"): |
|
"""Pretty-print this PageElement as a string. |
|
|
|
:param encoding: The eventual encoding of the string. If this is None, |
|
a Unicode string will be returned. |
|
:param formatter: A Formatter object, or a string naming one of |
|
the standard formatters. |
|
:return: A Unicode string (if encoding==None) or a bytestring |
|
(otherwise). |
|
""" |
|
if encoding is None: |
|
return self.decode(True, formatter=formatter) |
|
else: |
|
return self.encode(encoding, True, formatter=formatter) |
|
|
|
def decode_contents(self, indent_level=None, |
|
eventual_encoding=DEFAULT_OUTPUT_ENCODING, |
|
formatter="minimal"): |
|
"""Renders the contents of this tag as a Unicode string. |
|
|
|
:param indent_level: Each line of the rendering will be |
|
indented this many levels. (The formatter decides what a |
|
'level' means in terms of spaces or other characters |
|
output.) Used internally in recursive calls while |
|
pretty-printing. |
|
|
|
:param eventual_encoding: The tag is destined to be |
|
encoded into this encoding. decode_contents() is _not_ |
|
responsible for performing that encoding. This information |
|
is passed in so that it can be substituted in if the |
|
document contains a <META> tag that mentions the document's |
|
encoding. |
|
|
|
:param formatter: A Formatter object, or a string naming one of |
|
the standard Formatters. |
|
|
|
""" |
|
return self.decode(indent_level, eventual_encoding, formatter, |
|
iterator=self.descendants) |
|
|
|
def encode_contents( |
|
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, |
|
formatter="minimal"): |
|
"""Renders the contents of this PageElement as a bytestring. |
|
|
|
:param indent_level: Each line of the rendering will be |
|
indented this many levels. (The formatter decides what a |
|
'level' means in terms of spaces or other characters |
|
output.) Used internally in recursive calls while |
|
pretty-printing. |
|
|
|
:param eventual_encoding: The bytestring will be in this encoding. |
|
|
|
:param formatter: A Formatter object, or a string naming one of |
|
the standard Formatters. |
|
|
|
:return: A bytestring. |
|
""" |
|
contents = self.decode_contents(indent_level, encoding, formatter) |
|
return contents.encode(encoding) |
|
|
|
|
|
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, |
|
prettyPrint=False, indentLevel=0): |
|
"""Deprecated method for BS3 compatibility.""" |
|
if not prettyPrint: |
|
indentLevel = None |
|
return self.encode_contents( |
|
indent_level=indentLevel, encoding=encoding) |
|
|
|
|
|
|
|
def find(self, name=None, attrs={}, recursive=True, string=None, |
|
**kwargs): |
|
"""Look in the children of this PageElement and find the first |
|
PageElement that matches the given criteria. |
|
|
|
All find_* methods take a common set of arguments. See the online |
|
documentation for detailed explanations. |
|
|
|
:param name: A filter on tag name. |
|
:param attrs: A dictionary of filters on attribute values. |
|
:param recursive: If this is True, find() will perform a |
|
recursive search of this PageElement's children. Otherwise, |
|
only the direct children will be considered. |
|
:param limit: Stop looking after finding this many results. |
|
:kwargs: A dictionary of filters on attribute values. |
|
:return: A PageElement. |
|
:rtype: bs4.element.Tag | bs4.element.NavigableString |
|
""" |
|
r = None |
|
l = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3, |
|
**kwargs) |
|
if l: |
|
r = l[0] |
|
return r |
|
findChild = find |
|
|
|
def find_all(self, name=None, attrs={}, recursive=True, string=None, |
|
limit=None, **kwargs): |
|
"""Look in the children of this PageElement and find all |
|
PageElements that match the given criteria. |
|
|
|
All find_* methods take a common set of arguments. See the online |
|
documentation for detailed explanations. |
|
|
|
:param name: A filter on tag name. |
|
:param attrs: A dictionary of filters on attribute values. |
|
:param recursive: If this is True, find_all() will perform a |
|
recursive search of this PageElement's children. Otherwise, |
|
only the direct children will be considered. |
|
:param limit: Stop looking after finding this many results. |
|
:kwargs: A dictionary of filters on attribute values. |
|
:return: A ResultSet of PageElements. |
|
:rtype: bs4.element.ResultSet |
|
""" |
|
generator = self.descendants |
|
if not recursive: |
|
generator = self.children |
|
_stacklevel = kwargs.pop('_stacklevel', 2) |
|
return self._find_all(name, attrs, string, limit, generator, |
|
_stacklevel=_stacklevel+1, **kwargs) |
|
findAll = find_all |
|
findChildren = find_all |
|
|
|
|
|
@property |
|
def children(self): |
|
"""Iterate over all direct children of this PageElement. |
|
|
|
:yield: A sequence of PageElements. |
|
""" |
|
|
|
return iter(self.contents) |
|
|
|
@property |
|
def self_and_descendants(self): |
|
"""Iterate over this PageElement and its children in a |
|
breadth-first sequence. |
|
|
|
:yield: A sequence of PageElements. |
|
""" |
|
if not self.hidden: |
|
yield self |
|
for i in self.descendants: |
|
yield i |
|
|
|
@property |
|
def descendants(self): |
|
"""Iterate over all children of this PageElement in a |
|
breadth-first sequence. |
|
|
|
:yield: A sequence of PageElements. |
|
""" |
|
if not len(self.contents): |
|
return |
|
stopNode = self._last_descendant().next_element |
|
current = self.contents[0] |
|
while current is not stopNode: |
|
yield current |
|
current = current.next_element |
|
|
|
|
|
def select_one(self, selector, namespaces=None, **kwargs): |
|
"""Perform a CSS selection operation on the current element. |
|
|
|
:param selector: A CSS selector. |
|
|
|
:param namespaces: A dictionary mapping namespace prefixes |
|
used in the CSS selector to namespace URIs. By default, |
|
Beautiful Soup will use the prefixes it encountered while |
|
parsing the document. |
|
|
|
:param kwargs: Keyword arguments to be passed into Soup Sieve's |
|
soupsieve.select() method. |
|
|
|
:return: A Tag. |
|
:rtype: bs4.element.Tag |
|
""" |
|
return self.css.select_one(selector, namespaces, **kwargs) |
|
|
|
def select(self, selector, namespaces=None, limit=None, **kwargs):
    """Run a CSS selector against this element and return all results.

    This simply forwards to `self.css.select`, which uses the
    SoupSieve library.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
       used in the CSS selector to namespace URIs. By default,
       Beautiful Soup will use the prefixes it encountered while
       parsing the document.

    :param limit: After finding this number of results, stop looking.

    :param kwargs: Extra keyword arguments, passed through untouched
       to `self.css.select` (and from there to SoupSieve).

    :return: A ResultSet of Tags.
    :rtype: bs4.element.ResultSet
    """
    css_api = self.css
    return css_api.select(selector, namespaces, limit, **kwargs)
|
|
|
@property
def css(self):
    """Build the CSS selector interface for this element.

    A new `CSS` wrapper around this element is created on every access.
    """
    selector_api = CSS(self)
    return selector_api
|
|
|
|
|
def childGenerator(self):
    """Deprecated spelling; returns exactly what `self.children` returns."""
    child_iterator = self.children
    return child_iterator
|
|
|
def recursiveChildGenerator(self):
    """Deprecated spelling; returns exactly what `self.descendants` returns."""
    descendant_iterator = self.descendants
    return descendant_iterator
|
|
|
def has_key(self, key):
    """Deprecated way of asking whether this element has an attribute.

    Emits a DeprecationWarning (attributed to the caller through
    stacklevel=2) and then answers with `self.has_attr(key)`.

    :param key: The attribute name to look for.
    :return: Whatever `self.has_attr(key)` returns.
    """
    notice = 'has_key is deprecated. Use has_attr(key) instead.'
    warnings.warn(notice, DeprecationWarning, stacklevel=2)
    return self.has_attr(key)
|
|
|
|
|
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    string).

    This is primarily used to underpin the find_* methods, but you can
    create one yourself and pass it in as `parse_only` to the
    `BeautifulSoup` constructor, to parse a subset of a large
    document.
    """

    # NOTE: the `attrs={}` default is kept for interface compatibility. It is
    # never mutated here (it is copied before being updated), and `None`
    # cannot replace it because any non-dict value, including None, is
    # interpreted as a filter on the 'class' attribute.
    def __init__(self, name=None, attrs={}, string=None, **kwargs):
        """Constructor.

        The SoupStrainer constructor takes the same arguments passed
        into the find_* methods. See the online documentation for
        detailed explanations.

        :param name: A filter on tag name.
        :param attrs: A dictionary of filters on attribute values. A
            non-dict value is treated as a filter on the 'class' attribute.
        :param string: A filter for a NavigableString with specific text.
        :kwargs: A dictionary of filters on attribute values. The
            deprecated keyword 'text' is accepted as an alias for
            `string`, and 'class_' is renamed to 'class'.
        """
        if string is None and 'text' in kwargs:
            string = kwargs.pop('text')
            warnings.warn(
                "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
                DeprecationWarning, stacklevel=2
            )

        self.name = self._normalize_search_value(name)

        if not isinstance(attrs, dict):
            # A non-dict `attrs` is shorthand for a 'class' attribute filter.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # `class_` is the keyword-safe spelling of 'class'; it takes
            # precedence over a 'class' filter derived from `attrs` above.
            kwargs['class'] = kwargs.pop('class_')

        if kwargs:
            if attrs:
                # Merge into a copy so the caller's dict is left untouched.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs

        self.attrs = {
            key: self._normalize_search_value(value)
            for key, value in attrs.items()
        }
        self.string = self._normalize_search_value(string)

        # Second name for the same filter, matching the deprecated 'text'
        # constructor argument.
        self.text = self.string

    def _normalize_search_value(self, value):
        """Convert a search value into a canonical form.

        :param value: A filter value supplied by the caller.
        :return: `value` unchanged if it is a str, a callable, an object
            with a `match` attribute (e.g. a compiled regex), a bool or
            None; a decoded str for bytes; a list for other iterables;
            otherwise `str(value)`.
        """
        if (isinstance(value, (str, Callable, bool))
                or hasattr(value, 'match')
                or value is None):
            return value

        if isinstance(value, bytes):
            # Byte strings are interpreted as UTF-8.
            return value.decode("utf8")

        if hasattr(value, '__iter__'):
            normalized = []
            for item in value:
                if (hasattr(item, '__iter__')
                        and not isinstance(item, (bytes, str))):
                    # A nested iterable is passed through as-is instead of
                    # being normalized recursively.
                    normalized.append(item)
                else:
                    normalized.append(self._normalize_search_value(item))
            return normalized

        # Anything else (numbers and so on) is compared as its string form.
        return str(value)

    def __str__(self):
        """A human-readable representation of this SoupStrainer.

        :return: The string filter rendered as text when one is set,
            otherwise "<name>|<attrs>".
        """
        if self.string:
            # self.string may be a compiled regex, a list, a callable or
            # True. __str__ must return a real str, so coerce it; returning
            # the raw object would make str(strainer) raise TypeError.
            return str(self.string)
        return "%s|%s" % (self.name, self.attrs)

    # NOTE: `markup_attrs={}` is kept for interface compatibility; the
    # default dict is only read, never mutated.
    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check whether a Tag with the given name and attributes would
        match this SoupStrainer.

        Used prospectively to decide whether to even bother creating a Tag
        object.

        :param markup_name: A tag name as found in some markup, or a Tag
            object (in which case the Tag also serves as the attribute
            source).
        :param markup_attrs: A dictionary of attributes as found in some
            markup, or an iterable of (key, value) pairs.

        :return: The matching Tag (or `markup_name` when no Tag was
            given) if the prospective tag matches this SoupStrainer;
            None or False otherwise.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup

        if isinstance(self.name, str):
            # Fast rejection: a plain-string name filter and an unprefixed
            # tag whose name differs can never match.
            if markup and not markup.prefix and self.name != markup.name:
                return False

        # A callable name filter is handed the raw name and attributes
        # only when we were not given an actual Tag object.
        call_function_with_tag_data = (
            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
                or call_function_with_tag_data
                or (markup and self._matches(markup, self.name))
                or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                markup_attr_map = None
                for attr, match_against in self.attrs.items():
                    if not markup_attr_map:
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            # Build a lookup table from (key, value) pairs.
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name

        # NOTE(review): when `found` is a bare tag name rather than a Tag,
        # `found.string` is not available; callers appear expected not to
        # combine a string filter with a prospective (name-only) search.
        if found and self.string and not self._matches(found.string, self.string):
            found = None
        return found

    # camelCase name bound to the same method.
    searchTag = search_tag

    def search(self, markup):
        """Find all items in `markup` that match this SoupStrainer.

        Used by the core _find_all() method, which is ultimately
        called by all find_* methods.

        :param markup: A PageElement or a list of them.
        :return: The matching element, or None (possibly False, when
            search_tag() rejects a Tag early) if nothing matched.
        :raise Exception: If `markup` is of a kind this method cannot
            match against.
        """
        found = None

        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            # A collection: return the first text element in it that matches.
            for element in markup:
                if isinstance(element, NavigableString) and self.search(element):
                    found = element
                    break
        elif isinstance(markup, Tag):
            # Tags are skipped when this strainer filters on text only.
            if not self.string or self.name or self.attrs:
                found = self.search_tag(markup)
        elif isinstance(markup, (NavigableString, str)):
            # Text can only match a strainer with no name/attribute filters.
            if not self.name and not self.attrs and self._matches(markup, self.string):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against, already_tried=None):
        """Decide whether `markup` satisfies the filter `match_against`.

        :param markup: A Tag, a string, None, or a list/tuple of values
            (each value, and the space-joined whole, is tried).
        :param match_against: True, a callable, a string, an object with
            a `search` method (e.g. a compiled regex), an iterable of
            such filters, or a falsy value.
        :param already_tried: Set of filter items already attempted; used
            to skip repeats when `match_against` is an iterable.
        :return: A truthy value on a match, a falsy value otherwise.
        """
        if isinstance(markup, (list, tuple)):
            # Try every individual value first...
            for item in markup:
                if self._matches(item, match_against):
                    return True
            # ...then the values joined into one space-separated string.
            if self._matches(' '.join(markup), match_against):
                return True
            return False

        if match_against is True:
            # True accepts any value that is present at all.
            return markup is not None

        if isinstance(match_against, Callable):
            return match_against(markup)

        # Callables (handled above) receive the object itself; every other
        # kind of filter is compared against the tag's name as a string.
        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name

        markup = self._normalize_search_value(markup)

        if markup is None:
            # A missing value matches only a falsy filter.
            return not match_against

        if (hasattr(match_against, '__iter__')
                and not isinstance(match_against, str)):
            # An iterable of filters: a match against any one item suffices.
            # Items already tried are skipped.
            if not already_tried:
                already_tried = set()
            for item in match_against:
                # Unhashable items are tracked by identity instead.
                key = item if item.__hash__ else id(item)
                if key in already_tried:
                    continue
                already_tried.add(key)
                if self._matches(original_markup, item, already_tried):
                    return True
            return False

        match = False

        if isinstance(match_against, str):
            # Exact string comparison.
            match = markup == match_against

        if not match and hasattr(match_against, 'search'):
            # Regex-like filter: the result of .search() is the answer.
            return match_against.search(markup)

        if (not match
                and isinstance(original_markup, Tag)
                and original_markup.prefix):
            # Retry with the tag's full prefixed name ("prefix:name").
            return self._matches(
                original_markup.prefix + ':' + original_markup.name, match_against
            )

        return match
|
|
|
|
|
class ResultSet(list):
    """A plain list of search results that also remembers the
    SoupStrainer which produced it."""

    def __init__(self, source, result=()):
        """Constructor.

        :param source: A SoupStrainer.
        :param result: A list of PageElements.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key):
        """Raise a helpful exception to explain a common code fix.

        Only reached for attributes that normal lookup could not find.
        """
        message = (
            "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?"
            % key
        )
        raise AttributeError(message)
|
|