|
import re |
|
from typing import Optional, List, Dict, Any, Match |
|
from .core import Parser, InlineState |
|
from .util import ( |
|
escape, |
|
escape_url, |
|
unikey, |
|
) |
|
from .helpers import ( |
|
PREVENT_BACKSLASH, |
|
PUNCTUATION, |
|
HTML_TAGNAME, |
|
HTML_ATTRIBUTES, |
|
unescape_char, |
|
parse_link, |
|
parse_link_label, |
|
parse_link_text, |
|
) |
|
|
|
# Optional whitespace followed by a closing parenthesis.
PAREN_END_RE = re.compile(r'\s*\)')
|
|
|
# Pattern source for an e-mail autolink such as ``<user@example.com>``.
# The angle brackets are part of the match; the domain is one or more
# dot-separated labels of at most 63 characters that start and end with an
# alphanumeric character.
AUTO_EMAIL = (
    r'''<[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9]'''
    r'(?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?'
    r'(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*>'
)
|
|
|
# Pattern source for raw inline HTML.  Alternatives, in order:
INLINE_HTML = (
    # opening (or self-closing) tag with optional attributes
    r'<' + HTML_TAGNAME + HTML_ATTRIBUTES + r'\s*/?>|'
    # closing tag
    r'</' + HTML_TAGNAME + r'\s*>|'
    # comment: <!-- ... -->
    r'<!--(?!>|->)(?:(?!--)[\s\S])+?(?<!-)-->|'
    # processing instruction: <? ... ?>
    r'<\?[\s\S]+?\?>|'
    # declaration: <!NAME ... >
    r'<![A-Z][\s\S]+?>|'
    # CDATA section: <![CDATA ... ]]>
    r'<!\[CDATA[\s\S]+?\]\]>'
)
|
|
|
# Closing-delimiter patterns for emphasis, keyed by the opening marker.
# Each pattern matches the last character of the emphasised text (either an
# escaped delimiter character or any character that is neither whitespace nor
# the delimiter) followed by the closing run, which must not be followed by
# one more delimiter character.  Underscore variants additionally require a
# word boundary after the closing run.
EMPHASIS_END_RE = {
    '*': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\\*|[^\s*])\*(?!\*)'),
    '_': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\_|[^\s_])_(?!_)\b'),

    '**': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\\*|[^\s*])\*\*(?!\*)'),
    '__': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\_|[^\s_])__(?!_)\b'),

    '***': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\\*|[^\s*])\*\*\*(?!\*)'),
    '___': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\_|[^\s_])___(?!_)\b'),
}
|
|
|
|
|
class InlineParser(Parser):
    """Tokenizer for inline-level Markdown.

    Scans ``state.src`` for escapes, code spans, emphasis, links, autolinks,
    raw inline HTML and line breaks, and appends token dicts to the state.
    Rule registration, scanner compilation (``compile_sc``) and dispatch
    (``parse_method``) come from the ``Parser`` base class, which is defined
    outside this module.
    """

    sc_flag = 0
    state_cls = InlineState

    # Standard line break: a backslash or two-or-more spaces before a newline.
    STD_LINEBREAK = r'(?:\\| {2,})\n\s*'

    # Any newline, with optional spaces before it.  Used as the ``linebreak``
    # pattern when ``hard_wrap`` is on, and as the ``softbreak`` pattern.
    HARD_LINEBREAK = r' *\n\s*'

    # For codespan, emphasis and link the pattern only matches the opening
    # marker; the corresponding ``parse_*`` method locates the end itself.
    SPECIFICATION = {
        # one or more backslash-escaped punctuation characters
        'escape': r'(?:\\' + PUNCTUATION + ')+',

        # opening run of backticks
        'codespan': r'`{1,}',

        # 1-3 ``*`` or ``_`` followed by a non-space, non-delimiter character
        'emphasis': r'\*{1,3}(?=[^\s*])|\b_{1,3}(?=[^\s_])',

        # ``[`` for a link, ``![`` for an image
        'link': r'!?\[',

        # <scheme:...> autolink
        'auto_link': r'<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>',
        'auto_email': AUTO_EMAIL,

        'inline_html': INLINE_HTML,

        'linebreak': STD_LINEBREAK,
        'softbreak': HARD_LINEBREAK,

        # Prefix-only patterns used by ``precedence_scan`` to spot a possible
        # autolink / inline HTML start; the full rule is re-matched afterwards.
        'prec_auto_link': r'<[A-Za-z][A-Za-z\d.+-]{1,31}:',
        'prec_inline_html': r'</?' + HTML_TAGNAME + r'|<!|<\?',
    }
    DEFAULT_RULES = (
        'escape',
        'codespan',
        'emphasis',
        'link',
        'auto_link',
        'auto_email',
        'inline_html',
        'linebreak',
    )

    def __init__(self, hard_wrap: bool = False):
        """Create the parser.

        :param hard_wrap: when true, every newline is matched by the
            ``linebreak`` rule; otherwise a separate ``softbreak`` rule is
            registered for plain newlines.
        """
        super().__init__()

        self.hard_wrap = hard_wrap

        if hard_wrap:
            self.specification['linebreak'] = self.HARD_LINEBREAK
        else:
            self.rules.append('softbreak')

        # rule name -> bound handler, used by ``precedence_scan``
        self._methods = {
            name: getattr(self, 'parse_' + name) for name in self.rules
        }

    def parse_escape(self, m: Match, state: InlineState) -> int:
        """Emit the unescaped characters as a text token."""
        text = unescape_char(m.group(0))
        state.append_token({
            'type': 'text',
            'raw': text,
        })
        return m.end()

    def parse_link(self, m: Match, state: InlineState) -> Optional[int]:
        """Parse an inline link/image or a reference link/image.

        :return: the position after the construct, or ``None`` when the text
            at the marker is not a valid link.
        """
        pos = m.end()

        marker = m.group(0)
        is_image = marker[0] == '!'
        # No images inside images and no links inside links: the marker is
        # emitted as literal text instead.
        if is_image and state.in_image:
            state.append_token({'type': 'text', 'raw': marker})
            return pos
        elif not is_image and state.in_link:
            state.append_token({'type': 'text', 'raw': marker})
            return pos

        text = None
        label, end_pos = parse_link_label(state.src, pos)
        if label is None:
            text, end_pos = parse_link_text(state.src, pos)
            if text is None:
                return None

        if text is None:
            text = label

        if end_pos >= len(state.src) and label is None:
            return None

        # A code span, autolink or inline HTML that starts inside the
        # brackets and runs past them takes precedence over the link.
        rules = ['codespan', 'prec_auto_link', 'prec_inline_html']
        prec_pos = self.precedence_scan(m, state, end_pos, rules)
        if prec_pos:
            return prec_pos

        if end_pos < len(state.src):
            c = state.src[end_pos]
            if c == '(':
                # inline form: [text](destination "title")
                attrs, pos2 = parse_link(state.src, end_pos + 1)
                if pos2:
                    token = self.__parse_link_token(is_image, text, attrs, state)
                    state.append_token(token)
                    return pos2

            elif c == '[':
                # reference form: [text][label]
                label2, pos2 = parse_link_label(state.src, end_pos + 1)
                if pos2:
                    end_pos = pos2
                    if label2:
                        label = label2

        if label is None:
            return None

        ref_links = state.env.get('ref_links')
        if not ref_links:
            return None

        key = unikey(label)
        env = ref_links.get(key)
        if env:
            attrs = {'url': env['url'], 'title': env.get('title')}
            token = self.__parse_link_token(is_image, text, attrs, state)
            token['ref'] = key
            token['label'] = label
            state.append_token(token)
            return end_pos
        return None

    def __parse_link_token(self, is_image, text, attrs, state):
        """Build a ``link`` or ``image`` token whose children are parsed from ``text``."""
        new_state = state.copy()
        new_state.src = text
        if is_image:
            new_state.in_image = True
            token_type = 'image'
        else:
            new_state.in_link = True
            token_type = 'link'
        return {
            'type': token_type,
            'children': self.render(new_state),
            'attrs': attrs,
        }

    def parse_auto_link(self, m: Match, state: InlineState) -> int:
        """Turn ``<scheme:...>`` into a link token (plain text inside a link)."""
        text = m.group(0)
        pos = m.end()
        if state.in_link:
            self.process_text(text, state)
            return pos

        text = text[1:-1]  # strip the angle brackets
        self._add_auto_link(text, text, state)
        return pos

    def parse_auto_email(self, m: Match, state: InlineState) -> int:
        """Turn ``<user@host>`` into a ``mailto:`` link token (plain text inside a link)."""
        text = m.group(0)
        pos = m.end()
        if state.in_link:
            self.process_text(text, state)
            return pos

        text = text[1:-1]  # strip the angle brackets
        url = 'mailto:' + text
        self._add_auto_link(url, text, state)
        return pos

    def _add_auto_link(self, url, text, state):
        """Append a link token with a single text child and an escaped URL."""
        state.append_token({
            'type': 'link',
            'children': [{'type': 'text', 'raw': text}],
            'attrs': {'url': escape_url(url)},
        })

    def parse_emphasis(self, m: Match, state: InlineState) -> int:
        """Parse ``*em*``, ``**strong**`` or ``***both***`` (and ``_`` forms).

        When no closing delimiter is found, or when the same kind of emphasis
        is already open, the marker is emitted as literal text.
        """
        pos = m.end()

        marker = m.group(0)
        mlen = len(marker)
        if mlen == 1 and state.in_emphasis:
            state.append_token({'type': 'text', 'raw': marker})
            return pos
        elif mlen == 2 and state.in_strong:
            state.append_token({'type': 'text', 'raw': marker})
            return pos

        _end_re = EMPHASIS_END_RE[marker]
        m1 = _end_re.search(state.src, pos)
        if not m1:
            state.append_token({'type': 'text', 'raw': marker})
            return pos

        end_pos = m1.end()
        # content between the opening marker and the closing delimiter run
        text = state.src[pos:end_pos - mlen]

        prec_pos = self.precedence_scan(m, state, end_pos)
        if prec_pos:
            return prec_pos

        new_state = state.copy()
        new_state.src = text
        if mlen == 1:
            new_state.in_emphasis = True
            children = self.render(new_state)
            state.append_token({'type': 'emphasis', 'children': children})
        elif mlen == 2:
            new_state.in_strong = True
            children = self.render(new_state)
            state.append_token({'type': 'strong', 'children': children})
        else:
            # triple delimiter: emphasis wrapping strong
            new_state.in_emphasis = True
            new_state.in_strong = True

            children = [{
                'type': 'strong',
                'children': self.render(new_state)
            }]
            state.append_token({
                'type': 'emphasis',
                'children': children,
            })
        return end_pos

    def parse_codespan(self, m: Match, state: InlineState) -> int:
        """Parse a code span delimited by equal-length backtick runs.

        If no closing run of the same length exists, the opening backticks
        are emitted as literal text.
        """
        marker = m.group(0)
        # The closing run must have exactly the same length as the opener:
        # not preceded and not followed by another backtick.
        pattern = re.compile(r'(.*?[^`])' + marker + r'(?!`)', re.S)

        pos = m.end()
        closing = pattern.match(state.src, pos)
        if closing:
            end_pos = closing.end()
            code = closing.group(1)
            # line endings are treated like spaces
            code = code.replace('\n', ' ')
            # strip one space from each side unless the content is all spaces
            if code.strip():
                if code.startswith(' ') and code.endswith(' '):
                    code = code[1:-1]
            state.append_token({'type': 'codespan', 'raw': escape(code)})
            return end_pos
        else:
            state.append_token({'type': 'text', 'raw': marker})
            return pos

    def parse_linebreak(self, m: Match, state: InlineState) -> int:
        """Emit a ``linebreak`` token."""
        state.append_token({'type': 'linebreak'})
        return m.end()

    def parse_softbreak(self, m: Match, state: InlineState) -> int:
        """Emit a ``softbreak`` token."""
        state.append_token({'type': 'softbreak'})
        return m.end()

    def parse_inline_html(self, m: Match, state: InlineState) -> int:
        """Emit raw inline HTML and track whether we are inside a raw ``<a>``."""
        end_pos = m.end()
        html = m.group(0)
        state.append_token({'type': 'inline_html', 'raw': html})
        # ``in_link`` makes parse_link / parse_auto_link emit plain text, so
        # Markdown links are not generated inside a raw <a>...</a>.
        if html.startswith(('<a ', '<a>', '<A ', '<A>')):
            state.in_link = True
        elif html.startswith(('</a ', '</a>', '</A ', '</A>')):
            state.in_link = False
        return end_pos

    def process_text(self, text: str, state: InlineState) -> None:
        """Append ``text`` to the state as a plain text token."""
        state.append_token({'type': 'text', 'raw': text})

    def parse(self, state: InlineState) -> List[Dict[str, Any]]:
        """Tokenize ``state.src`` and return ``state.tokens``.

        Text between rule matches is emitted through ``process_text``.  When
        a handler returns a falsy position, the single character at the match
        start is emitted as text and scanning resumes right after it.
        """
        pos = 0
        sc = self.compile_sc()
        while pos < len(state.src):
            m = sc.search(state.src, pos)
            if not m:
                break

            end_pos = m.start()
            if end_pos > pos:
                hole = state.src[pos:end_pos]
                self.process_text(hole, state)

            new_pos = self.parse_method(m, state)
            if not new_pos:
                # handler declined: move the cursor one character forward
                pos = end_pos + 1
                hole = state.src[end_pos:pos]
                self.process_text(hole, state)
            else:
                pos = new_pos

        if pos == 0:
            # nothing was consumed: the whole source is plain text
            self.process_text(state.src, state)
        elif pos < len(state.src):
            self.process_text(state.src[pos:], state)
        return state.tokens

    def precedence_scan(self, m: Match, state: InlineState, end_pos: int,
                        rules=None) -> Optional[int]:
        """Look for a higher-precedence construct overlapping ``end_pos``.

        Searches between the end of the opening marker ``m`` and ``end_pos``
        for the start of one of ``rules``.  If that construct parses and ends
        at or beyond ``end_pos``, the source from the start of ``m`` up to the
        construct is emitted as text, followed by the construct's tokens.

        :return: the position after the overlapping construct, or ``None``
            when the caller should continue with its own parsing.
        """
        if rules is None:
            rules = ['codespan', 'link', 'prec_auto_link', 'prec_inline_html']

        mark_pos = m.end()
        sc = self.compile_sc(rules)
        m1 = sc.search(state.src, mark_pos, end_pos)
        if not m1:
            return None

        lastgroup = m1.lastgroup
        if not lastgroup:
            # no named group matched, so the rule cannot be identified
            return None

        # ``prec_*`` patterns only detect a prefix; re-match with the full rule
        rule_name = lastgroup.replace('prec_', '')
        sc = self.compile_sc([rule_name])
        m2 = sc.match(state.src, m1.start())
        if not m2:
            return None

        func = self._methods[rule_name]
        # parse into a scratch state so nothing is emitted on failure
        new_state = state.copy()
        new_state.src = state.src
        m2_pos = func(m2, new_state)
        if not m2_pos or m2_pos < end_pos:
            return None

        raw_text = state.src[m.start():m2.start()]
        state.append_token({'type': 'text', 'raw': raw_text})
        for token in new_state.tokens:
            state.append_token(token)
        return m2_pos

    def render(self, state: InlineState) -> List[Dict[str, Any]]:
        """Parse ``state`` and return its tokens."""
        self.parse(state)
        return state.tokens

    def __call__(self, s: str, env: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Tokenize the string ``s`` using a fresh state built from ``env``."""
        state = self.state_cls(env)
        state.src = s
        return self.render(state)
|
|