Spaces:
Running
Running
# Python-Markdown Markdown in HTML Extension
# ===============================
# An implementation of [PHP Markdown Extra](http://michelf.com/projects/php-markdown/extra/)'s
# parsing of Markdown syntax in raw HTML.
# See https://Python-Markdown.github.io/extensions/raw_html
# for documentation.
# Copyright The Python Markdown Project
# License: [BSD](https://opensource.org/licenses/bsd-license.php)
""" | |
An implementation of [PHP Markdown Extra](http://michelf.com/projects/php-markdown/extra/)'s | |
parsing of Markdown syntax in raw HTML. | |
See the [documentation](https://Python-Markdown.github.io/extensions/raw_html) | |
for details. | |
""" | |
from __future__ import annotations | |
from . import Extension | |
from ..blockprocessors import BlockProcessor | |
from ..preprocessors import Preprocessor | |
from ..postprocessors import RawHtmlPostprocessor | |
from .. import util | |
from ..htmlparser import HTMLExtractor, blank_line_re | |
import xml.etree.ElementTree as etree | |
from typing import TYPE_CHECKING, Literal, Mapping | |
if TYPE_CHECKING: # pragma: no cover | |
from markdown import Markdown | |
class HTMLExtractorExtra(HTMLExtractor):
    """
    Extend `HTMLExtractor` so that block-level raw HTML whose content should be parsed as Markdown is collected
    into `etree` `Elements` (via a `TreeBuilder`) and stashed as elements rather than as strings.
    """

    def __init__(self, md: Markdown, *args, **kwargs):
        # The three sets below are assigned before the parent initializer is called; `block_tags` is derived
        # afterwards because it reads `self.empty_tags`, which this class never assigns (presumably the
        # parent initializer provides it).

        # All block-level tags.
        self.block_level_tags = set(md.block_level_elements)
        # Block-level tags in which the content only gets span level parsing.
        self.span_tags = {
            'address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'summary', 'td', 'th'
        }
        # Block-level tags which never get their content parsed.
        self.raw_tags = {'canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea'}

        super().__init__(md, *args, **kwargs)

        # Block-level tags in which the content gets parsed as blocks: whatever is left once the span-only,
        # raw and empty tags are removed.
        not_block_parsed = self.span_tags | self.raw_tags | self.empty_tags
        self.block_tags = self.block_level_tags - not_block_parsed
        self.span_and_blocks_tags = self.block_tags | self.span_tags

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        # `mdstack` holds the tags of the elements currently open in `treebuilder`; `mdstate` holds the
        # matching parse state of each. The two lists are always pushed and popped together.
        self.mdstack: list[str] = []
        self.mdstate: list[Literal['block', 'span', 'off', None]] = []
        self.treebuilder = etree.TreeBuilder()
        super().reset()

    def close(self):
        """Handle any buffered data."""
        super().close()
        if not self.mdstack:
            return
        # Tags are still open. Closing the outermost one makes `handle_endtag` close every unclosed child.
        self.handle_endtag(self.mdstack[0])

    def get_element(self) -> etree.Element:
        """ Return element from `treebuilder` and reset `treebuilder` for later use. """
        finished = self.treebuilder.close()
        self.treebuilder = etree.TreeBuilder()
        return finished

    def get_state(self, tag, attrs: Mapping[str, str]) -> Literal['block', 'span', 'off', None]:
        """ Return state from tag and `markdown` attribute. One of 'block', 'span', or 'off'. """
        requested = attrs.get('markdown', '0')
        if requested == 'markdown':
            # `<tag markdown>` is the same as `<tag markdown='1'>`.
            requested = '1'

        inherited = self.mdstate[-1] if self.mdstate else None
        if inherited == 'off' or (inherited == 'span' and requested != '0'):
            # Only use the parent state if it is more restrictive than the markdown attribute.
            requested = inherited

        # `'1'` means "pick the parse mode that is natural for this tag".
        use_tag_default = requested == '1'
        if (use_tag_default and tag in self.block_tags) or (
                requested == 'block' and tag in self.span_and_blocks_tags):
            return 'block'
        if (use_tag_default and tag in self.span_tags) or (
                requested == 'span' and tag in self.span_and_blocks_tags):
            return 'span'
        if tag in self.block_level_tags:
            return 'off'
        return None  # pragma: no cover

    @staticmethod
    def _attrs_as_dict(attrs):
        """Convert a sequence of `(name, value)` pairs into a dict.

        A valueless attribute (ex: `<tag checked>`) arrives as `('checked', None)` and becomes
        `{'checked': 'checked'}`.
        """
        return {name: name if value is None else value for name, value in attrs}

    def _empty_tag_markup(self, tag, attrs):
        """Return the markup to emit for an empty tag.

        If the tag carries a `markdown` attribute it is re-serialized as HTML without that attribute;
        otherwise the text is taken as-is from `get_starttag_text()`.
        """
        attr_map = self._attrs_as_dict(attrs)
        if "markdown" not in attr_map:
            return self.get_starttag_text()
        attr_map.pop('markdown')
        return etree.tostring(etree.Element(tag, attr_map), encoding='unicode', method='html')

    def _pass_through_markup(self, text):
        """Hand literal tag text to `handle_data`, stashing it first if the innermost state is "off"."""
        if self.mdstate and self.mdstate[-1] == "off":
            text = self.md.htmlStash.store(text)
        self.handle_data(text)

    def handle_starttag(self, tag, attrs):
        # Handle tags that should always be empty and do not specify a closing tag
        if tag in self.empty_tags and (self.at_line_start() or self.intail):
            self.handle_empty_tag(self._empty_tag_markup(tag, attrs), True)
            return

        opens_block = tag in self.block_level_tags and (self.at_line_start() or self.intail)
        if not opens_block:
            # Span level tag
            if self.inraw:
                super().handle_starttag(tag, attrs)
                return
            self._pass_through_markup(self.get_starttag_text())
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # This is presumably a standalone tag in a code span (see #1036).
                self.clear_cdata_mode()
            return

        attr_map = self._attrs_as_dict(attrs)
        state = self.get_state(tag, attr_map)
        if self.inraw or (state in [None, 'off'] and not self.mdstack):
            # fall back to default behavior
            attr_map.pop('markdown', None)
            super().handle_starttag(tag, attr_map)
            return

        if 'p' in self.mdstack and tag in self.block_level_tags:
            # Close unclosed 'p' tag
            self.handle_endtag('p')
        self.mdstate.append(state)
        self.mdstack.append(tag)
        # Record the resolved state on the element itself, replacing whatever the source specified.
        attr_map['markdown'] = state
        self.treebuilder.start(tag, attr_map)

    def _close_markdown_element(self, tag):
        """Close `tag` and any unclosed children; stash the finished tree once the stack is empty."""
        while self.mdstack:
            open_tag = self.mdstack.pop()
            self.mdstate.pop()
            self.treebuilder.end(open_tag)
            if open_tag == tag:
                break
        if self.mdstack:
            # An ancestor is still open, so the tree is not complete yet.
            return

        # Last item in stack is closed. Stash it
        element = self.get_element()
        # Get last entry to see if it ends in newlines
        # If it is an element, assume there is no newlines
        previous = self.cleandoc[-1] if self.cleandoc else ''
        # If we only have one newline before block element, add another
        if previous.endswith('\n') and not previous.endswith('\n\n'):
            self.cleandoc.append('\n')
        self.cleandoc.append(self.md.htmlStash.store(element))
        self.cleandoc.append('\n\n')
        self.state = []
        # Check if element has a tail
        after_endtag = self.line_offset + self.offset + len(self.get_endtag_text(tag))
        if not blank_line_re.match(self.rawdata[after_endtag:]):
            # More content exists after `endtag`.
            self.intail = True

    def handle_endtag(self, tag):
        if self.inraw:
            super().handle_endtag(tag)
            return
        if tag in self.block_level_tags and tag in self.mdstack:
            self._close_markdown_element(tag)
            return
        # Either a span level tag or an orphan block-level closing tag, which is treated the same way.
        self._pass_through_markup(self.get_endtag_text(tag))

    def handle_startendtag(self, tag, attrs):
        if tag in self.empty_tags:
            data = self._empty_tag_markup(tag, attrs)
        else:
            data = self.get_starttag_text()
        self.handle_empty_tag(data, is_block=self.md.is_block_level(tag))

    def handle_data(self, data):
        # Data containing a newline ends the "tail" that follows a stashed element.
        if self.intail and '\n' in data:
            self.intail = False
        if self.inraw or not self.mdstack:
            super().handle_data(data)
            return
        # An element is being built, so the text belongs to it.
        self.treebuilder.data(data)

    def handle_empty_tag(self, data, is_block):
        if self.inraw or not self.mdstack:
            super().handle_empty_tag(data, is_block)
            return
        # Inside an element being built: replace the tag with a stash placeholder, set off by newlines
        # when it is a block-level tag at the start of a line.
        standalone = self.at_line_start() and is_block
        placeholder = self.md.htmlStash.store(data)
        if standalone:
            placeholder = '\n' + placeholder + '\n\n'
        self.handle_data(placeholder)

    def parse_pi(self, i: int) -> int:
        if not (self.at_line_start() or self.intail or self.mdstack):
            # This is not the beginning of a raw block so treat as plain data
            # and avoid consuming any tags which may follow (see #1066).
            self.handle_data('<?')
            return i + 2
        # The same override exists in `HTMLExtractor` without the check
        # for `mdstack`. Therefore, use parent of `HTMLExtractor` instead.
        return super(HTMLExtractor, self).parse_pi(i)

    def parse_html_declaration(self, i: int) -> int:
        if not (self.at_line_start() or self.intail or self.mdstack):
            # This is not the beginning of a raw block so treat as plain data
            # and avoid consuming any tags which may follow (see #1066).
            self.handle_data('<!')
            return i + 2
        # The same override exists in `HTMLExtractor` without the check
        # for `mdstack`. Therefore, use parent of `HTMLExtractor` instead.
        return super(HTMLExtractor, self).parse_html_declaration(i)
class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval."""

    def run(self, lines: list[str]) -> list[str]:
        """Feed the joined `lines` through `HTMLExtractorExtra` and return its `cleandoc` as a list of lines."""
        extractor = HTMLExtractorExtra(self.md)
        extractor.feed('\n'.join(lines))
        extractor.close()
        cleaned = ''.join(extractor.cleandoc)
        return cleaned.split('\n')
class MarkdownInHtmlProcessor(BlockProcessor):
    """Process Markdown Inside HTML Blocks which have been stored in the `HtmlStash`."""

    def test(self, parent: etree.Element, block: str) -> bool:
        # Always return True. `run` will return `False` if it is not a valid match.
        return True

    def _parse_detached(self, text: str) -> list[etree.Element]:
        """Parse `text` as block-level Markdown and return the resulting top-level elements.

        The blocks are parsed into a throwaway `div` so the caller can decide where the new elements
        are inserted. Trailing newlines are stripped and the text is split on blank lines first.
        """
        holder = etree.Element('div')
        self.parser.parseBlocks(holder, text.rstrip('\n').split('\n\n'))
        return list(holder)

    def parse_element_content(self, element: etree.Element) -> None:
        """
        Recursively parse the text content of an `etree` Element as Markdown.

        Any block level elements generated from the Markdown will be inserted as children of the element in place
        of the text content. All `markdown` attributes are removed. For any elements in which Markdown parsing has
        been disabled, the text content of it and its children are wrapped in an `AtomicString`.
        """
        md_attr = element.attrib.pop('markdown', 'off')

        if md_attr == 'block':
            # Parse content as block level
            # The order in which the different parts are parsed (text, children, tails) is important here as the
            # order of elements needs to be preserved. We can't be inserting items at a later point in the current
            # iteration as we don't want to do raw processing on elements created from parsing Markdown text (for
            # example). Therefore, the order of operations is children, tails, text.

            # Recursively parse existing children from raw HTML
            for child in list(element):
                self.parse_element_content(child)

            # Parse Markdown text in tail of children. Do this separate to avoid raw HTML parsing.
            # Only record where each group of new elements belongs; inserting now would shift the
            # positions of the children still to be visited.
            tails = []
            for pos, child in enumerate(element):
                if child.tail:
                    text = child.tail
                    child.tail = ''
                    tails.append((pos + 1, self._parse_detached(text)))

            # Insert from the last position backwards so earlier positions stay valid.
            for pos, new_children in reversed(tails):
                for offset, item in enumerate(new_children):
                    element.insert(pos + offset, item)

            # Parse Markdown text content. Do this last to avoid raw HTML parsing.
            if element.text:
                text = element.text
                element.text = ''
                # The generated elements go before all existing children.
                for offset, item in enumerate(self._parse_detached(text)):
                    element.insert(offset, item)

        elif md_attr == 'span':
            # Span level parsing will be handled by inline processors.
            # Walk children here to remove any `markdown` attributes.
            for child in list(element):
                self.parse_element_content(child)

        else:
            # Disable inline parsing for everything else
            element.text = util.AtomicString('' if element.text is None else element.text)
            for child in list(element):
                self.parse_element_content(child)
                if child.tail:
                    child.tail = util.AtomicString(child.tail)

    def run(self, parent: etree.Element, blocks: list[str]) -> bool:
        """Replace a leading stash placeholder for an `etree` element with the processed element.

        Returns `True` when the first block starts with a placeholder that refers to an `etree.Element`
        in the stash, `False` otherwise (in which case `blocks` is left untouched).
        """
        m = util.HTML_PLACEHOLDER_RE.match(blocks[0])
        if not m:
            # No match found.
            return False

        index = int(m.group(1))
        element = self.parser.md.htmlStash.rawHtmlBlocks[index]
        if not isinstance(element, etree.Element):
            # The placeholder refers to something other than an element; not ours to handle.
            return False

        # We have a matched element. Process it.
        block = blocks.pop(0)
        self.parse_element_content(element)
        parent.append(element)
        # Cleanup stash. Replace element with empty string to avoid confusing postprocessor.
        self.parser.md.htmlStash.rawHtmlBlocks[index] = ''
        # `match` only anchors at the start of the block, so the block may hold more than the
        # placeholder. Put anything that follows back on the queue instead of discarding it.
        remainder = block[m.end(0):]
        if remainder:
            blocks.insert(0, remainder)
        # Confirm the match to the `blockparser`.
        return True
class MarkdownInHTMLPostprocessor(RawHtmlPostprocessor):
    """Raw HTML postprocessor whose stash entries may be `etree` elements as well as strings."""

    def stash_to_string(self, text: str | etree.Element) -> str:
        """ Override default to handle any `etree` elements still in the stash. """
        if not isinstance(text, etree.Element):
            return str(text)
        # An element that is still in the stash is run through the serializer of the `Markdown` instance.
        return self.md.serializer(text)
class MarkdownInHtmlExtension(Extension):
    """Add Markdown parsing in HTML to Markdown class."""

    def extendMarkdown(self, md):
        """ Register extension instances. """
        # Replace raw HTML preprocessor
        html_preprocessor = HtmlBlockPreprocessor(md)
        md.preprocessors.register(html_preprocessor, 'html_block', 20)

        # Add `blockprocessor` which handles the placeholders for `etree` elements
        placeholder_processor = MarkdownInHtmlProcessor(md.parser)
        md.parser.blockprocessors.register(placeholder_processor, 'markdown_block', 105)

        # Replace raw HTML postprocessor
        html_postprocessor = MarkdownInHTMLPostprocessor(md)
        md.postprocessors.register(html_postprocessor, 'raw_html', 30)
def makeExtension(**kwargs):  # pragma: no cover
    """Return a `MarkdownInHtmlExtension` built from the given keyword arguments."""
    extension = MarkdownInHtmlExtension(**kwargs)
    return extension