Spaces:
Running
Running
File size: 14,332 Bytes
122d3ff |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 |
# Python Markdown
# A Python implementation of John Gruber's Markdown.
# Documentation: https://python-markdown.github.io/
# GitHub: https://github.com/Python-Markdown/markdown/
# PyPI: https://pypi.org/project/Markdown/
# Started by Manfred Stienstra (http://www.dwerg.net/).
# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
# Currently maintained by Waylan Limberg (https://github.com/waylan),
# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later)
# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
# Copyright 2004 Manfred Stienstra (the original version)
# License: BSD (see LICENSE.md for details).
"""
This module imports a copy of [`html.parser.HTMLParser`][] and modifies it heavily through monkey-patches.
A copy is imported rather than the module being directly imported as this ensures that the user can import
and use the unmodified library for their own needs.
"""
from __future__ import annotations
import re
import importlib.util
import sys
from typing import TYPE_CHECKING, Sequence
if TYPE_CHECKING: # pragma: no cover
from markdown import Markdown
# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
# Users can still do `from html import parser` and get the default behavior.
# `find_spec` locates the standard module, `module_from_spec` builds a new module
# object from that spec, and `exec_module` runs the module's code inside it.
spec = importlib.util.find_spec('html.parser')
htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
# Register the copy in `sys.modules` under the name `htmlparser`.
sys.modules['htmlparser'] = htmlparser
# Monkeypatch `HTMLParser` to only accept `?>` to close Processing Instructions.
htmlparser.piclose = re.compile(r'\?>')
# Monkeypatch `HTMLParser` to only recognize entity references with a closing semicolon.
htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
# Monkeypatch `HTMLParser` to no longer support partial entities. We are always feeding a complete block,
# so the 'incomplete' functionality is unnecessary. As the `entityref` regex is run right before incomplete,
# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
htmlparser.incomplete = htmlparser.entityref
# Monkeypatch `HTMLParser` to not accept a backtick in a tag name, attribute name, or bare value.
# The pattern is compiled with `re.VERBOSE`, so the whitespace and the `#` notes inside the
# string are ignored by the regex engine; the notes mark the three places a backtick was added.
htmlparser.locatestarttagend_tolerant = re.compile(r"""
<[a-zA-Z][^`\t\n\r\f />\x00]* # tag name <= added backtick here
(?:[\s/]* # optional whitespace before attribute name
(?:(?<=['"\s/])[^`\s/>][^\s/=>]* # attribute name <= added backtick here
(?:\s*=+\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|"[^"]*" # LIT-enclosed value
|(?!['"])[^`>\s]* # bare value <= added backtick here
)
(?:\s*,)* # possibly followed by a comma
)?(?:\s|/(?!>))*
)*
)?
\s* # trailing whitespace
""", re.VERBOSE)
# Match a blank line at the start of a block of text (two newlines).
# Each of the two newlines may be preceded by any number of space characters.
# Used by `HTMLExtractor` to test the text immediately following a tag.
blank_line_re = re.compile(r'^([ ]*\n){2}')
class HTMLExtractor(htmlparser.HTMLParser):
    """
    Extract raw HTML from text.

    The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the
    [`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text
    is stored in `cleandoc` as a list of strings.
    """

    def __init__(self, md: Markdown, *args, **kwargs):
        """
        Create the extractor.

        Arguments:
            md: The `Markdown` instance; its `is_block_level` method and `htmlStash`
                are used while parsing.

        Any other arguments are passed on to the parent parser. `convert_charrefs`
        defaults to `False` unless the caller supplies it.
        """
        kwargs.setdefault('convert_charrefs', False)

        # Block tags that should contain no content (self closing)
        self.empty_tags = {'hr'}

        self.lineno_start_cache = [0]

        # This calls self.reset
        super().__init__(*args, **kwargs)
        self.md = md

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.inraw = False
        self.intail = False
        self.stack: list[str] = []  # When `inraw==True`, stack contains a list of tags
        self._cache: list[str] = []
        self.cleandoc: list[str] = []
        self.lineno_start_cache = [0]

        super().reset()

    def close(self):
        """Handle any buffered data."""
        super().close()
        if self.rawdata:
            # Temp fix for https://bugs.python.org/issue41989
            # TODO: remove this when the bug is fixed in all supported Python versions.
            if self.convert_charrefs and not self.cdata_elem:  # pragma: no cover
                self.handle_data(htmlparser.unescape(self.rawdata))
            else:
                self.handle_data(self.rawdata)
        # Handle any unclosed tags.
        if self._cache:
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            self._cache = []

    @property
    def line_offset(self) -> int:
        """Returns char index in `self.rawdata` for the start of the current line. """
        # Extend the cache of line start positions up to the current line number.
        for ii in range(len(self.lineno_start_cache) - 1, self.lineno - 1):
            last_line_start_pos = self.lineno_start_cache[ii]
            lf_pos = self.rawdata.find('\n', last_line_start_pos)
            if lf_pos == -1:
                # No more newlines found. Use end of raw data as start of line beyond end.
                lf_pos = len(self.rawdata)
            self.lineno_start_cache.append(lf_pos + 1)

        return self.lineno_start_cache[self.lineno - 1]

    def at_line_start(self) -> bool:
        """
        Returns True if current position is at start of line.

        Allows for up to three blank spaces at start of line.
        """
        if self.offset == 0:
            return True
        if self.offset > 3:
            return False
        # Confirm up to first 3 chars are whitespace
        line_start = self.line_offset
        return self.rawdata[line_start:line_start + self.offset].strip() == ''

    def get_endtag_text(self, tag: str) -> str:
        """
        Returns the text of the end tag.

        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
        """
        # Attempt to extract actual tag from raw source text
        start = self.line_offset + self.offset
        m = htmlparser.endendtag.search(self.rawdata, start)
        if m:
            return self.rawdata[start:m.end()]
        else:  # pragma: no cover
            # Failed to extract from raw data. Assume well formed and lowercase.
            return f'</{tag}>'

    def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str]]):
        """
        Handle a start tag.

        Tags listed in `empty_tags` are passed to `handle_startendtag`. A block-level tag
        found in the tail of a previous raw block, or at the start of a line while not
        already inside a raw block, begins a new raw block. While in a raw block the tag
        text is cached; otherwise it is appended to `cleandoc` unchanged.
        """
        # Handle tags that should always be empty and do not specify a closing tag
        if tag in self.empty_tags:
            self.handle_startendtag(tag, attrs)
            return

        if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
            # Started a new raw block. Prepare stack.
            self.inraw = True
            self.cleandoc.append('\n')

        text = self.get_starttag_text()
        if self.inraw:
            self.stack.append(tag)
            self._cache.append(text)
        else:
            self.cleandoc.append(text)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # This is presumably a standalone tag in a code span (see #1036).
                self.clear_cdata_mode()

    def handle_endtag(self, tag: str):
        """
        Handle an end tag.

        Inside a raw block the tag text is cached and the tag stack is unwound down to
        the matching tag. When the stack becomes empty the cached block is stored in the
        `htmlStash` and its placeholder is appended to `cleandoc`. Outside a raw block the
        tag text is appended to `cleandoc` unchanged.
        """
        text = self.get_endtag_text(tag)

        if not self.inraw:
            self.cleandoc.append(text)
            return

        self._cache.append(text)
        if tag in self.stack:
            # Remove tag from stack
            while self.stack:
                if self.stack.pop() == tag:
                    break
        if not self.stack:
            # End of raw block.
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
                # Preserve blank line and end of raw block.
                self._cache.append('\n')
            else:
                # More content exists after `endtag`.
                self.intail = True
            # Reset stack.
            self.inraw = False
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
            self._cache = []

    def handle_data(self, data: str):
        """Cache `data` when inside a raw block; otherwise append it to `cleandoc`."""
        if self.intail and '\n' in data:
            # A newline in the data ends the tail of the previous raw block.
            self.intail = False
        if self.inraw:
            self._cache.append(data)
        else:
            self.cleandoc.append(data)

    def handle_empty_tag(self, data: str, is_block: bool):
        """ Handle empty tags (`<data>`). """
        if self.inraw or self.intail:
            # Append this to the existing raw block
            self._cache.append(data)
        elif self.at_line_start() and is_block:
            # Handle this as a standalone raw block
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
                # Preserve blank line after tag in raw block.
                data += '\n'
            else:
                # More content exists after tag.
                self.intail = True
            item = self.cleandoc[-1] if self.cleandoc else ''
            # If we only have one newline before block element, add another
            if not item.endswith('\n\n') and item.endswith('\n'):
                self.cleandoc.append('\n')
            self.cleandoc.append(self.md.htmlStash.store(data))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
        else:
            self.cleandoc.append(data)

    def handle_startendtag(self, tag: str, attrs: Sequence[tuple[str, str]]):
        """Handle a self-closing tag by passing its source text to `handle_empty_tag`."""
        self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

    def handle_charref(self, name: str):
        """Pass a character reference (`&#name;`) through as inline text."""
        self.handle_empty_tag(f'&#{name};', is_block=False)

    def handle_entityref(self, name: str):
        """Pass an entity reference (`&name;`) through as inline text."""
        self.handle_empty_tag(f'&{name};', is_block=False)

    def handle_comment(self, data: str):
        """Rebuild the comment (`<!--data-->`) and handle it as a block-level empty tag."""
        self.handle_empty_tag(f'<!--{data}-->', is_block=True)

    def handle_decl(self, data: str):
        """Rebuild the declaration (`<!data>`) and handle it as a block-level empty tag."""
        self.handle_empty_tag(f'<!{data}>', is_block=True)

    def handle_pi(self, data: str):
        """Rebuild the processing instruction (`<?data?>`) and handle it as a block-level empty tag."""
        self.handle_empty_tag(f'<?{data}?>', is_block=True)

    def unknown_decl(self, data: str):
        """Rebuild a `<![...` declaration, closing it with `]]>` for `CDATA[` and `]>` otherwise."""
        end = ']]>' if data.startswith('CDATA[') else ']>'
        self.handle_empty_tag(f'<![{data}{end}', is_block=True)

    def parse_pi(self, i: int) -> int:
        """Parse a processing instruction only at the start of a line or in the tail of a raw block."""
        if self.at_line_start() or self.intail:
            return super().parse_pi(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<?')
        return i + 2

    def parse_html_declaration(self, i: int) -> int:
        """Parse a declaration only at the start of a line or in the tail of a raw block."""
        if self.at_line_start() or self.intail:
            return super().parse_html_declaration(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<!')
        return i + 2

    def parse_bogus_comment(self, i: int, report: int = 0) -> int:
        """Parse a bogus comment and pass its raw source text through as inline text."""
        # Override the default behavior so that bogus comments get passed
        # through unaltered by setting `report` to `0` (see #1425).
        pos = super().parse_bogus_comment(i, report)
        if pos == -1:  # pragma: no cover
            return -1
        self.handle_empty_tag(self.rawdata[i:pos], is_block=False)
        return pos

    # The rest has been copied from base class in standard lib to address #1036.
    # As `__starttag_text` is private, all references to it must be in this subclass.
    # The last few lines of `parse_starttag` are reversed so that `handle_starttag`
    # can override `cdata_mode` in certain situations (in a code span).
    __starttag_text: str | None = None

    def get_starttag_text(self) -> str:
        """Return full source of start tag: `<...>`."""
        return self.__starttag_text

    def parse_starttag(self, i: int) -> int:  # pragma: no cover
        """
        Parse the start tag beginning at index `i` of `self.rawdata`.

        Returns the index just past the tag, or the negative value returned by
        `check_for_whole_start_tag` when it does not find a whole tag.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]
        # Now parse the data between `i+1` and `endpos` into a tag and `attrs`
        attrs = []
        match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = htmlparser.attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without a value.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:  # noqa: E127
                # Strip the matching pair of surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = htmlparser.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Something other than `>` or `/>` closes the tag: pass the text through as plain data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: `<span attr="value" />`
            self.handle_startendtag(tag, attrs)
        else:
            # *** set `cdata_mode` first so we can override it in `handle_starttag` (see #1036) ***
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
            self.handle_starttag(tag, attrs)
        return endpos
|