Spaces:
Sleeping
Sleeping
# Python Markdown

# A Python implementation of John Gruber's Markdown.

# Documentation: https://python-markdown.github.io/
# GitHub: https://github.com/Python-Markdown/markdown/
# PyPI: https://pypi.org/project/Markdown/

# Started by Manfred Stienstra (http://www.dwerg.net/).
# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
# Currently maintained by Waylan Limberg (https://github.com/waylan),
# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later)
# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
# Copyright 2004 Manfred Stienstra (the original version)

# License: BSD (see LICENSE.md for details).
"""
Tree processors manipulate the tree created by block processors. They can even create an entirely
new `ElementTree` object. This is an excellent place for creating summaries, adding collected
references, or making last-minute adjustments.
"""
from __future__ import annotations | |
import re | |
import xml.etree.ElementTree as etree | |
from typing import TYPE_CHECKING, Any | |
from . import util | |
from . import inlinepatterns | |
if TYPE_CHECKING: # pragma: no cover | |
from markdown import Markdown | |
def build_treeprocessors(md: Markdown, **kwargs: Any) -> util.Registry[Treeprocessor]:
    """
    Build the default `treeprocessors` for Markdown.

    Arguments:
        md: The `Markdown` instance handed to each processor's constructor.
        **kwargs: Accepted for signature compatibility; not used here.

    Returns:
        A `util.Registry` holding the `inline`, `prettify` and `unescape`
        tree processors, registered in that order.
    """
    # (processor class, registry name, priority) -- kept in registration order.
    defaults = (
        (InlineProcessor, 'inline', 20),
        (PrettifyTreeprocessor, 'prettify', 10),
        (UnescapeTreeprocessor, 'unescape', 0),
    )
    registry = util.Registry()
    for processor_cls, name, priority in defaults:
        registry.register(processor_cls(md), name, priority)
    return registry
def isString(s: object) -> bool:
    """
    Return `True` if object is a string but not an [`AtomicString`][markdown.util.AtomicString].

    Any other object, including an `AtomicString`, yields `False`.
    """
    return not isinstance(s, util.AtomicString) and isinstance(s, str)
class Treeprocessor(util.Processor):
    """
    Base class for processors that operate on the parsed `ElementTree`.

    `Treeprocessor`s are run on the `ElementTree` object before serialization.
    Each one provides a `run` method which receives an `Element` and modifies
    it as necessary.

    `Treeprocessors` must extend `markdown.Treeprocessor`.
    """

    def run(self, root: etree.Element) -> etree.Element | None:
        """
        Process the tree rooted at `root`.

        Subclasses should override this method. It may either return a new
        `Element`, in which case the existing root `Element` will be replaced,
        or modify the current tree in place and return `None`.

        Arguments:
            root: The root `Element` of the tree to process.
        """
        pass  # pragma: no cover
class InlineProcessor(Treeprocessor):
    """
    A `Treeprocessor` that traverses a tree, applying inline patterns.

    Text is first run through the inline patterns, which replace each match
    with a placeholder string and stash the produced node in
    `self.stashed_nodes`. The placeholder-laden string is then expanded back
    into `Element`s that are inserted into the tree.
    """

    def __init__(self, md: Markdown):
        self.__placeholder_prefix = util.INLINE_PLACEHOLDER_PREFIX
        self.__placeholder_suffix = util.ETX
        # Not read anywhere in this class; retained so the instance keeps the
        # same set of attributes as before.
        self.__placeholder_length = 4 + len(self.__placeholder_prefix) \
            + len(self.__placeholder_suffix)
        self.__placeholder_re = util.INLINE_PLACEHOLDER_RE
        self.md = md
        self.inlinePatterns = md.inlinePatterns
        self.ancestors: list[str] = []

    def __makePlaceholder(self, type: str) -> tuple[str, str]:
        """
        Generate a placeholder.

        Arguments:
            type: Pattern type; accepted but not used when building the placeholder.

        Returns:
            A tuple of the placeholder string and the zero-padded id it embeds.
            The id is the current number of stashed nodes.
        """
        node_id = "%04d" % len(self.stashed_nodes)
        placeholder = util.INLINE_PLACEHOLDER % node_id
        return placeholder, node_id

    def __findPlaceholder(self, data: str, index: int) -> tuple[str | None, int]:
        """
        Extract id from data string, start from index.

        Arguments:
            data: String.
            index: Index at which a placeholder is expected to begin.

        Returns:
            Placeholder id and string index, after the found placeholder.
            When no placeholder begins at `index`, returns `(None, index + 1)`.
        """
        # Anchor the match at `index`. An unanchored search could pair a
        # malformed prefix at `index` with a later, valid placeholder, and the
        # caller would then drop the text lying between the two.
        m = self.__placeholder_re.match(data, index)
        if m is None:
            return None, index + 1
        return m.group(1), m.end()

    def __stashNode(self, node: etree.Element | str, type: str) -> str:
        """
        Add node to stash.

        Arguments:
            node: The element or string to store.
            type: Pattern type, forwarded to the placeholder generator.

        Returns:
            The placeholder string which now stands for `node`.
        """
        placeholder, node_id = self.__makePlaceholder(type)
        self.stashed_nodes[node_id] = node
        return placeholder

    def __handleInline(self, data: str, patternIndex: int = 0) -> str:
        """
        Process string with inline patterns and replace it with placeholders.

        Arguments:
            data: A line of Markdown text.
            patternIndex: The index of the `inlinePattern` to start with.

        Returns:
            String with placeholders. An `AtomicString` is returned untouched.
        """
        if not isinstance(data, util.AtomicString):
            startIndex = 0
            count = len(self.inlinePatterns)
            while patternIndex < count:
                data, matched, startIndex = self.__applyPattern(
                    self.inlinePatterns[patternIndex], data, patternIndex, startIndex
                )
                # Re-apply the same pattern until it stops matching.
                if not matched:
                    patternIndex += 1
        return data

    def __processElementText(self, node: etree.Element, subnode: etree.Element, isText: bool = True) -> None:
        """
        Process placeholders in `Element.text` or `Element.tail`
        of Elements popped from `self.stashed_nodes`.

        Arguments:
            node: Parent node.
            subnode: Processing node.
            isText: Boolean variable, True - it's text, False - it's a tail.
        """
        if isText:
            text = subnode.text
            subnode.text = None
        else:
            text = subnode.tail
            subnode.tail = None

        childResult = self.__processPlaceholders(text, subnode, isText)

        if not isText and node is not subnode:
            # Elements recovered from a tail go right after `subnode`.
            pos = list(node).index(subnode) + 1
        else:
            pos = 0

        # Inserting at a fixed position in reverse keeps the original order.
        for newChild, _ in reversed(childResult):
            node.insert(pos, newChild)

    def __processPlaceholders(
        self,
        data: str | None,
        parent: etree.Element,
        isText: bool = True
    ) -> list[tuple[etree.Element, list[str]]]:
        """
        Process string with placeholders and generate `ElementTree` tree.

        Arguments:
            data: String with placeholders instead of `ElementTree` elements.
            parent: Element, which contains processing inline data.
            isText: Boolean variable, True - it's text, False - it's a tail.

        Returns:
            List with `ElementTree` elements with applied inline patterns,
            each paired with a copy of the ancestor list at that point.
        """
        result: list[tuple[etree.Element, list[str]]] = []

        def linkText(text: str | None) -> None:
            """ Attach `text` to the last result's tail, else to `parent`'s tail or text. """
            if not text:
                return
            if result:
                last = result[-1][0]
                last.tail = last.tail + text if last.tail else text
            elif not isText:
                parent.tail = parent.tail + text if parent.tail else text
            else:
                parent.text = parent.text + text if parent.text else text

        prefix = self.__placeholder_prefix
        startIndex = 0
        while data:
            index = data.find(prefix, startIndex)
            if index == -1:
                # No more placeholders: the remainder is plain text.
                text = data[startIndex:]
                if isinstance(data, util.AtomicString):
                    # We don't want to lose the `AtomicString`
                    text = util.AtomicString(text)
                linkText(text)
                break

            node_id, phEndIndex = self.__findPlaceholder(data, index)

            if node_id not in self.stashed_nodes:
                # Wrong placeholder: emit the prefix as text and keep scanning.
                end = index + len(prefix)
                linkText(data[startIndex:end])
                startIndex = end
                continue

            node = self.stashed_nodes[node_id]

            if index > 0:
                linkText(data[startIndex:index])

            startIndex = phEndIndex

            if isinstance(node, str):
                # it's just a string
                linkText(node)
                continue

            # it's Element: expand placeholders nested in it and its children.
            # The list is a snapshot, so insertions below do not affect it.
            for child in [node] + list(node):
                if child.tail:
                    if child.tail.strip():
                        self.__processElementText(node, child, False)
                if child.text:
                    if child.text.strip():
                        self.__processElementText(child, child)

            result.append((node, self.ancestors[:]))

        return result

    def __applyPattern(
        self,
        pattern: inlinepatterns.Pattern,
        data: str,
        patternIndex: int,
        startIndex: int = 0
    ) -> tuple[str, bool, int]:
        """
        Check if the line fits the pattern, create the necessary
        elements, add it to `stashed_nodes`.

        Arguments:
            data: The text to be processed.
            pattern: The pattern to be checked.
            patternIndex: Index of current pattern.
            startIndex: String index, from which we start searching.

        Returns:
            A tuple of the string with placeholders instead of `ElementTree`
            elements, a flag telling whether the pattern matched, and the
            index at which the next search with this pattern should start.
        """
        new_style = isinstance(pattern, inlinepatterns.InlineProcessor)

        if any(exclude.lower() in self.ancestors for exclude in pattern.ANCESTOR_EXCLUDES):
            return data, False, 0

        if new_style:
            match = None
            # Since `handleMatch` may reject our first match,
            # we iterate over the buffer looking for matches
            # until we can't find any more.
            for match in pattern.getCompiledRegExp().finditer(data, startIndex):
                node, start, end = pattern.handleMatch(match, data)
                if start is None or end is None:
                    # Rejected: forget it and try the next candidate.
                    match = None
                    continue
                break
        else:  # pragma: no cover
            match = pattern.getCompiledRegExp().match(data[startIndex:])
            leftData = data[:startIndex]

        if match is None:
            return data, False, 0

        if not new_style:  # pragma: no cover
            node = pattern.handleMatch(match)
            start = match.start(0)
            end = match.end(0)

        if node is None:
            return data, True, end

        if not isinstance(node, str):
            if not isinstance(node.text, util.AtomicString):
                # We need to process current node too
                for child in [node] + list(node):
                    if child.text:
                        self.ancestors.append(child.tag.lower())
                        child.text = self.__handleInline(
                            child.text, patternIndex + 1
                        )
                        self.ancestors.pop()
                    if child.tail:
                        child.tail = self.__handleInline(
                            child.tail, patternIndex
                        )

        placeholder = self.__stashNode(node, pattern.type())

        if new_style:
            return "{}{}{}".format(data[:start],
                                   placeholder, data[end:]), True, 0
        else:  # pragma: no cover
            return "{}{}{}{}".format(leftData,
                                     match.group(1),
                                     placeholder, match.groups()[-1]), True, 0

    def __build_ancestors(self, parent: etree.Element | None, parents: list[str]) -> None:
        """
        Build the ancestor list.

        Walks from `parent` up through `self.parent_map` and appends the
        lower-cased tag names to `parents`, outermost first.

        Arguments:
            parent: Element to start from; `None` adds nothing.
            parents: List that is extended in place.
        """
        ancestors = []
        while parent is not None:
            ancestors.append(parent.tag.lower())
            parent = self.parent_map.get(parent)
        parents.extend(reversed(ancestors))

    def run(self, tree: etree.Element, ancestors: list[str] | None = None) -> etree.Element:
        """Apply inline patterns to a parsed Markdown tree.

        Iterate over `Element`, find elements with inline tag, apply inline
        patterns and append newly created Elements to tree. To avoid further
        processing of string with inline patterns, instead of normal string,
        use subclass [`AtomicString`][markdown.util.AtomicString]:

            node.text = markdown.util.AtomicString("This will not be processed.")

        Arguments:
            tree: `Element` object, representing Markdown tree.
            ancestors: List of parent tag names that precede the tree node (if needed).

        Returns:
            An element tree object with applied inline patterns.

        """
        self.stashed_nodes: dict[str, etree.Element | str] = {}

        # Ensure a valid parent list, but copy passed in lists
        # to ensure we don't have the user accidentally change it on us.
        tree_parents = [] if ancestors is None else ancestors[:]

        self.parent_map = {c: p for p in tree.iter() for c in p}
        stack = [(tree, tree_parents)]

        while stack:
            currElement, parents = stack.pop()

            self.ancestors = parents
            self.__build_ancestors(currElement, self.ancestors)

            insertQueue = []
            # NOTE: `currElement` is modified (tail results are inserted)
            # while it is being iterated; keep the statement order as is.
            for child in currElement:
                if child.text and not isinstance(child.text, util.AtomicString):
                    self.ancestors.append(child.tag.lower())
                    text = child.text
                    child.text = None
                    lst = self.__processPlaceholders(
                        self.__handleInline(text), child
                    )
                    for item in lst:
                        self.parent_map[item[0]] = child
                    stack.extend(lst)
                    # Insertion into `child` is deferred until the loop is done.
                    insertQueue.append((child, lst))
                    self.ancestors.pop()
                if child.tail:
                    tail = self.__handleInline(child.tail)
                    # Throwaway element which only collects the leading tail text.
                    dummy = etree.Element('d')
                    child.tail = None
                    tailResult = self.__processPlaceholders(tail, dummy, False)
                    if dummy.tail:
                        child.tail = dummy.tail
                    pos = list(currElement).index(child) + 1
                    tailResult.reverse()
                    for newChild in tailResult:
                        self.parent_map[newChild[0]] = currElement
                        currElement.insert(pos, newChild[0])
                if len(child):
                    self.parent_map[child] = currElement
                    stack.append((child, self.ancestors[:]))

            for element, lst in insertQueue:
                for i, obj in enumerate(lst):
                    element.insert(i, obj[0])
        return tree
class PrettifyTreeprocessor(Treeprocessor):
    """ Add line breaks to the html document. """

    def _prettifyETree(self, elem: etree.Element) -> None:
        """
        Recursively add line breaks to `ElementTree` children.

        Arguments:
            elem: The element whose `text`/`tail` whitespace is adjusted in place.
        """
        newline = "\n"
        is_block = self.md.is_block_level

        if is_block(elem.tag) and elem.tag not in ['code', 'pre']:
            text_is_blank = not elem.text or not elem.text.strip()
            # Only break after the opening tag when the first child is itself
            # block level and there is no meaningful leading text.
            if text_is_blank and len(elem) and is_block(elem[0].tag):
                elem.text = newline
            for sub in elem:
                if is_block(sub.tag):
                    self._prettifyETree(sub)

        tail_is_blank = not elem.tail or not elem.tail.strip()
        if tail_is_blank:
            elem.tail = newline

    def run(self, root: etree.Element) -> None:
        """
        Add line breaks to `Element` object and its children.

        Arguments:
            root: The root `Element`; it is modified in place.
        """
        self._prettifyETree(root)

        # Do `<br />`'s separately as they are often in the middle of
        # inline content and missed by `_prettifyETree`.
        for br in root.iter('br'):
            has_tail = br.tail and br.tail.strip()
            br.tail = '\n%s' % br.tail if has_tail else '\n'

        # Clean up extra empty lines at end of code blocks.
        for pre in root.iter('pre'):
            if not (len(pre) and pre[0].tag == 'code'):
                continue
            code = pre[0]
            # Only prettify code containing text only
            if len(code) or code.text is None:
                continue
            code.text = util.AtomicString(code.text.rstrip() + '\n')
class UnescapeTreeprocessor(Treeprocessor):
    """ Restore escaped chars """

    # A decimal code point wrapped between `util.STX` and `util.ETX`.
    RE = re.compile(r'{}(\d+){}'.format(util.STX, util.ETX))

    def _unescape(self, m: re.Match[str]) -> str:
        """ Return the character whose code point is captured in group 1 of `m`. """
        code_point = int(m.group(1))
        return chr(code_point)

    def unescape(self, text: str) -> str:
        """ Return `text` with every match of `RE` replaced by its character. """
        return self.RE.sub(self._unescape, text)

    def run(self, root: etree.Element) -> None:
        """
        Loop over all elements and unescape all text.

        Arguments:
            root: The root `Element`; it and its descendants are modified in place.
        """
        for node in root.iter():
            # Text content (the text of `code` elements is left alone)
            if node.text and node.tag != 'code':
                node.text = self.unescape(node.text)

            # Tail content
            if node.tail:
                node.tail = self.unescape(node.tail)

            # Attribute values
            for attr_name, attr_value in node.items():
                node.set(attr_name, self.unescape(attr_value))