import re
from typing import Optional, List, Tuple, Match
from .util import (
    unikey,
    escape_url,
    expand_tab,
    expand_leading_tab,
)
from .core import Parser, BlockState
from .helpers import (
    LINK_LABEL,
    HTML_TAGNAME,
    HTML_ATTRIBUTES,
    BLOCK_TAGS,
    PRE_TAGS,
    unescape_char,
    parse_link_href,
    parse_link_title,
)
from .list_parser import parse_list, LIST_PATTERN

_INDENT_CODE_TRIM = re.compile(r'^ {1,4}', flags=re.M)
_AXT_HEADING_TRIM = re.compile(r'(\s+|^)#+\s*$')
_BLOCK_QUOTE_TRIM = re.compile(r'^ ?', flags=re.M)
_BLOCK_QUOTE_LEADING = re.compile(r'^ *>', flags=re.M)

_LINE_BLANK_END = re.compile(r'\n[ \t]*\n$')
_BLANK_TO_LINE = re.compile(r'[ \t]*\n')

_BLOCK_TAGS_PATTERN = '|'.join(BLOCK_TAGS) + '|' + '|'.join(PRE_TAGS)
_OPEN_TAG_END = re.compile(HTML_ATTRIBUTES + r'[ \t]*>[ \t]*(?:\n|$)')
_CLOSE_TAG_END = re.compile(r'[ \t]*>[ \t]*(?:\n|$)')
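# one or more consecutive lines, each starting with a ">" marker
# indented by at most three spaces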
_STRICT_BLOCK_QUOTE = re.compile(r'( {0,3}>[^\n]*(?:\n|$))+')


class BlockParser(Parser):
    BLANK_LINE = re.compile(r'(^[ \t\v\f]*\n)+', re.M)

    RAW_HTML = (
        r'^ {0,3}('
        r'</?' + HTML_TAGNAME + r'|'
        r'<!--|'
        r'<\?|'
        r'<![A-Z]|'
        r'<!\[CDATA\[)'
    )

    BLOCK_HTML = (
        r'^ {0,3}(?:'
        r'(?:</?' + _BLOCK_TAGS_PATTERN + r'(?:[ \t]+|\n|$))'
        r'|<!--'
        r'|<\?'
        r'|<![A-Z]'
        r'|<!\[CDATA\[)'
    )

    SPECIFICATION = {
        'blank_line': r'(^[ \t\v\f]*\n)+',
        'axt_heading': r'^ {0,3}(?P<axt_1>#{1,6})(?!#+)(?P<axt_2>[ \t]*|[ \t]+.*?)$',
        'setex_heading': r'^ {0,3}(?P<setext_1>=|-){1,}[ \t]*$',
        'fenced_code': (
            r'^(?P<fenced_1> {0,3})(?P<fenced_2>`{3,}|~{3,})'
            r'[ \t]*(?P<fenced_3>.*?)$'
        ),
        'indent_code': (
            r'^(?: {4}| *\t)[^\n]+(?:\n+|$)'
            r'((?:(?: {4}| *\t)[^\n]+(?:\n+|$))|\s)*'
        ),
        'thematic_break': r'^ {0,3}((?:-[ \t]*){3,}|(?:_[ \t]*){3,}|(?:\*[ \t]*){3,})$',
        'ref_link': r'^ {0,3}\[(?P<reflink_1>' + LINK_LABEL + r')\]:',
        'block_quote': r'^ {0,3}>(?P<quote_1>.*?)$',
        'list': LIST_PATTERN,
        'block_html': BLOCK_HTML,
        'raw_html': RAW_HTML,
    }

    DEFAULT_RULES = (
        'fenced_code',
        'indent_code',
        'axt_heading',
        'setex_heading',
        'thematic_break',
        'block_quote',
        'list',
        'ref_link',
        'raw_html',
        'blank_line',
    )

    def __init__(
        self,
        block_quote_rules: Optional[List[str]]=None,
        list_rules: Optional[List[str]]=None,
        max_nested_level: int=6
    ):
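        """Initialize the block-level parser.

        :param block_quote_rules: block rules used inside a block quote;
            defaults to ``DEFAULT_RULES``.
        :param list_rules: block rules used inside a list item;
            defaults to ``DEFAULT_RULES``.
        :param max_nested_level: maximum nesting depth for container
            blocks such as block quotes.
        """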
        super(BlockParser, self).__init__()

        if block_quote_rules is None:
            block_quote_rules = list(self.DEFAULT_RULES)

        if list_rules is None:
            list_rules = list(self.DEFAULT_RULES)

        self.block_quote_rules = block_quote_rules
        self.list_rules = list_rules
        self.max_nested_level = max_nested_level
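        # resolve each rule name to its ``parse_<name>`` method once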
        self._methods = {
            name: getattr(self, 'parse_' + name) for name in self.SPECIFICATION
        }

    def parse_blank_line(self, m: Match, state: BlockState) -> int:
        """Parse token for blank lines."""
        state.append_token({'type': 'blank_line'})
        return m.end()

    def parse_thematic_break(self, m: Match, state: BlockState) -> int:
        """Parse token for thematic break, e.g. ``<hr>`` tag in HTML."""
        state.append_token({'type': 'thematic_break'})
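        # the trailing "$" in the pattern does not consume the newline,
        # so step one character past the end of the match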
        return m.end() + 1

    def parse_indent_code(self, m: Match, state: BlockState) -> int:
        """Parse token for code block which is indented by 4 spaces."""
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        code = m.group(0)
        code = expand_leading_tab(code)
        code = _INDENT_CODE_TRIM.sub('', code)
        code = code.strip('\n')
        state.append_token({'type': 'block_code', 'raw': code, 'style': 'indent'})
        return m.end()

    def parse_fenced_code(self, m: Match, state: BlockState) -> Optional[int]:
        """Parse token for fenced code block. A fenced code block starts
        with three or more backticks (`) or tildes (~).

        An example of a fenced code block:

        .. code-block:: markdown

            ```python
            def markdown(text):
                return mistune.html(text)
            ```
        """
        spaces = m.group('fenced_1')
        marker = m.group('fenced_2')
        info = m.group('fenced_3')

        c = marker[0]
        if info and c == '`':
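            # the info string of a backtick fence must not contain a
            # backtick (CommonMark); otherwise this is not a code fence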
            if info.find(c) != -1:
                return
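        # the closing fence must use the same character and be at least as
        # long as the opening fence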
        _end = re.compile(
            r'^ {0,3}' + c + '{' + str(len(marker)) + r',}[ \t]*(?:\n|$)', re.M)
        cursor_start = m.end() + 1

        m2 = _end.search(state.src, cursor_start)
        if m2:
            code = state.src[cursor_start:m2.start()]
            end_pos = m2.end()
        else:
            code = state.src[cursor_start:]
            end_pos = state.cursor_max

        if spaces and code:
            _trim_pattern = re.compile('^ {0,' + str(len(spaces)) + '}', re.M)
            code = _trim_pattern.sub('', code)

        token = {'type': 'block_code', 'raw': code, 'style': 'fenced', 'marker': marker}
        if info:
            info = unescape_char(info)
            token['attrs'] = {'info': info.strip()}

        state.append_token(token)
        return end_pos

    def parse_axt_heading(self, m: Match, state: BlockState) -> int:
        """Parse token for ATX heading (spelled ``axt`` in the rule names).
        An ATX heading starts with 1 to 6 ``#`` characters."""
        level = len(m.group('axt_1'))
        text = m.group('axt_2').strip()

        if text:
            text = _AXT_HEADING_TRIM.sub('', text)

        token = {'type': 'heading', 'text': text, 'attrs': {'level': level}, 'style': 'axt'}
        state.append_token(token)
        return m.end() + 1

    def parse_setex_heading(self, m: Match, state: BlockState) -> Optional[int]:
        """Parse token for setext heading (spelled ``setex`` in the rule
        names). A setext heading looks like:

        .. code-block:: markdown

            H1 title
            ========
        """
        last_token = state.last_token()
        if last_token and last_token['type'] == 'paragraph':
            level = 1 if m.group('setext_1') == '=' else 2
            last_token['type'] = 'heading'
            last_token['style'] = 'setext'
            last_token['attrs'] = {'level': level}
            return m.end() + 1
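        # a setext underline is only valid right after a paragraph; otherwise
        # re-match the line as a thematic break or a list marker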
        sc = self.compile_sc(['thematic_break', 'list'])
        m = sc.match(state.src, state.cursor)
        if m:
            return self.parse_method(m, state)

    def parse_ref_link(self, m: Match, state: BlockState) -> Optional[int]:
        """Parse link references and save the link information into ``state.env``.

        Here is an example of a link reference:

        .. code-block:: markdown

            a [link][example]

            [example]: https://example.com "Optional title"

        This method will save the link reference into ``state.env`` as::

            state.env['ref_links']['example'] = {
                'url': 'https://example.com',
                'title': "Optional title",
            }
        """
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        label = m.group('reflink_1')
        key = unikey(label)
        if not key:
            return

        href, href_pos = parse_link_href(state.src, m.end(), block=True)
        if href is None:
            return
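        # the optional link title may not contain a blank line, so bound
        # the title search at the next blank line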
        _blank = self.BLANK_LINE.search(state.src, href_pos)
        if _blank:
            max_pos = _blank.start()
        else:
            max_pos = state.cursor_max

        title, title_pos = parse_link_title(state.src, href_pos, max_pos)
        if title_pos:
            m = _BLANK_TO_LINE.match(state.src, title_pos)
            if m:
                title_pos = m.end()
            else:
                title_pos = None
                title = None

        if title_pos is None:
            m = _BLANK_TO_LINE.match(state.src, href_pos)
            if m:
                href_pos = m.end()
            else:
                href_pos = None
                href = None

        end_pos = title_pos or href_pos
        if not end_pos:
            return
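        # the first definition of a label wins; duplicates are ignored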
        if key not in state.env['ref_links']:
            href = unescape_char(href)
            data = {'url': escape_url(href), 'label': label}
            if title:
                data['title'] = title
            state.env['ref_links'][key] = data
        return end_pos

    def extract_block_quote(self, m: Match, state: BlockState) -> Tuple[str, Optional[int]]:
        """Extract text and cursor end position of a block quote."""
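        # normalize the first quoted line so we can detect whether it opens
        # a blank line or a code block, which would require every following
        # line to carry an explicit ">" marker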
        text = m.group('quote_1') + '\n'
        text = expand_leading_tab(text, 3)
        text = _BLOCK_QUOTE_TRIM.sub('', text)

        sc = self.compile_sc(['blank_line', 'indent_code', 'fenced_code'])
        require_marker = bool(sc.match(text))

        state.cursor = m.end() + 1

        end_pos = None
        if require_marker:
            m = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
            if m:
                quote = m.group(0)
                quote = _BLOCK_QUOTE_LEADING.sub('', quote)
                quote = expand_leading_tab(quote, 3)
                quote = _BLOCK_QUOTE_TRIM.sub('', quote)
                text += quote
                state.cursor = m.end()
        else:
            prev_blank_line = False
            break_sc = self.compile_sc([
                'blank_line', 'thematic_break', 'fenced_code',
                'list', 'block_html',
            ])
            while state.cursor < state.cursor_max:
                m = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
                if m:
                    quote = m.group(0)
                    quote = _BLOCK_QUOTE_LEADING.sub('', quote)
                    quote = expand_leading_tab(quote, 3)
                    quote = _BLOCK_QUOTE_TRIM.sub('', quote)
                    text += quote
                    state.cursor = m.end()
                    if not quote.strip():
                        prev_blank_line = True
                    else:
                        prev_blank_line = bool(_LINE_BLANK_END.search(quote))
                    continue
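                # a blank line inside the quote stops lazy continuation;
                # the paragraph that follows does not belong to the quote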
                if prev_blank_line:
                    break
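                # a line that opens another block construct (blank line,
                # thematic break, fence, list, HTML block) also ends the quote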
                m = break_sc.match(state.src, state.cursor)
                if m:
                    end_pos = self.parse_method(m, state)
                    if end_pos:
                        break
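                # otherwise treat the line as a lazy continuation line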
                pos = state.find_line_end()
                line = state.get_text(pos)
                line = expand_leading_tab(line, 3)
                text += line
                state.cursor = pos
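        # expand any remaining tabs before the quoted text is re-parsed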
        return expand_tab(text), end_pos

    def parse_block_quote(self, m: Match, state: BlockState) -> int:
        """Parse token for block quote. Here is an example of the syntax:

        .. code-block:: markdown

            > a block quote starts
            > with right arrows
        """
        text, end_pos = self.extract_block_quote(m, state)

        child = state.child_state(text)
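        # once the maximum nesting depth is reached, stop matching nested
        # block quotes inside this one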
        if state.depth() >= self.max_nested_level - 1:
            rules = list(self.block_quote_rules)
            rules.remove('block_quote')
        else:
            rules = self.block_quote_rules

        self.parse(child, rules)
        token = {'type': 'block_quote', 'children': child.tokens}
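        # if extract_block_quote already parsed a following block, its token
        # was appended first, so the quote token is inserted before it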
        if end_pos:
            state.prepend_token(token)
            return end_pos
        state.append_token(token)
        return state.cursor

    def parse_list(self, m: Match, state: BlockState) -> int:
        """Parse tokens for ordered and unordered list."""
        return parse_list(self, m, state)

    def parse_block_html(self, m: Match, state: BlockState) -> Optional[int]:
        return self.parse_raw_html(m, state)

    def parse_raw_html(self, m: Match, state: BlockState) -> Optional[int]:
        marker = m.group(0).strip()
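        # comments, processing instructions, CDATA sections and declarations
        # (CommonMark HTML block rules 2-5) run until their closing markers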
        if marker == '<!--':
            return _parse_html_to_end(state, '-->', m.end())

        if marker == '<?':
            return _parse_html_to_end(state, '?>', m.end())

        if marker == '<![CDATA[':
            return _parse_html_to_end(state, ']]>', m.end())

        if marker.startswith('<!'):
            return _parse_html_to_end(state, '>', m.end())

        close_tag = None
        open_tag = None
        if marker.startswith('</'):
            close_tag = marker[2:].lower()
            if close_tag in BLOCK_TAGS:
                return _parse_html_to_newline(state, self.BLANK_LINE)
        else:
            open_tag = marker[1:].lower()
            if open_tag in PRE_TAGS:
                end_tag = '</' + open_tag + '>'
                return _parse_html_to_end(state, end_tag, m.end())

            if open_tag in BLOCK_TAGS:
                return _parse_html_to_newline(state, self.BLANK_LINE)
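        # a complete open or close tag on its own line (HTML block rule 7)
        # may not interrupt a paragraph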
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        start_pos = m.end()
        end_pos = state.find_line_end()
        if (open_tag and _OPEN_TAG_END.match(state.src, start_pos, end_pos)) or \
           (close_tag and _CLOSE_TAG_END.match(state.src, start_pos, end_pos)):
            return _parse_html_to_newline(state, self.BLANK_LINE)

    def parse(self, state: BlockState, rules: Optional[List[str]]=None) -> None:
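        """Run the block rules over ``state.src`` and append the parsed
        tokens to ``state``."""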
        sc = self.compile_sc(rules)

        while state.cursor < state.cursor_max:
            m = sc.search(state.src, state.cursor)
            if not m:
                break
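            # text between the cursor and the matched rule is a paragraph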
            end_pos = m.start()
            if end_pos > state.cursor:
                text = state.get_text(end_pos)
                state.add_paragraph(text)
                state.cursor = end_pos
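            # if the rule method declines (returns None), fall back to
            # treating the current line as paragraph text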
            end_pos = self.parse_method(m, state)
            if end_pos:
                state.cursor = end_pos
            else:
                end_pos = state.find_line_end()
                text = state.get_text(end_pos)
                state.add_paragraph(text)
                state.cursor = end_pos
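        # whatever remains after the last match is a trailing paragraph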
        if state.cursor < state.cursor_max:
            text = state.src[state.cursor:]
            state.add_paragraph(text)
            state.cursor = state.cursor_max


def _parse_html_to_end(state, end_marker, start_pos):
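    # consume everything up to ``end_marker`` (or to the end of input if the
    # marker never appears), plus the remainder of the marker's line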
    marker_pos = state.src.find(end_marker, start_pos)
    if marker_pos == -1:
        text = state.src[state.cursor:]
        end_pos = state.cursor_max
    else:
        text = state.get_text(marker_pos)
        state.cursor = marker_pos
        end_pos = state.find_line_end()
        text += state.get_text(end_pos)

    state.append_token({'type': 'block_html', 'raw': text})
    return end_pos


def _parse_html_to_newline(state, newline):
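    # consume everything up to (but not including) the next blank line,
    # or to the end of input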
    m = newline.search(state.src, state.cursor)
    if m:
        end_pos = m.start()
        text = state.get_text(end_pos)
    else:
        text = state.src[state.cursor:]
        end_pos = state.cursor_max

    state.append_token({'type': 'block_html', 'raw': text})
    return end_pos