File size: 3,751 Bytes
d1ceb73 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import re
import string
from .util import escape_url
# Prefix for patterns that must match an *unescaped* character: the position
# must not directly follow a backslash, and any run of backslash pairs found
# there is consumed as part of the match.
PREVENT_BACKSLASH = r'(?<!\\)(?:\\\\)*'
# Character class matching one character of ``string.punctuation``.
PUNCTUATION = r'[' + re.escape(string.punctuation) + r']'
# Body of a link label: at most 500 items, each being either a backslash
# followed by any character, or a character that is not a backslash or a
# square bracket.
LINK_LABEL = r'(?:[^\\\[\]]|\\.){0,500}'
# Optional spaces/tabs (with at most one newline) followed by "<", i.e. the
# start of an angle-bracketed link destination.
LINK_BRACKET_START = re.compile(r'[ \t]*\n?[ \t]*<')
# "<...>" destination; group 1 is the content, which may not contain "<", ">",
# a newline, a backslash or NUL.
LINK_BRACKET_RE = re.compile(r'<([^<>\n\\\x00]*)>')
# Bare destination used by parse_link_href when ``block`` is true: a run of
# non-whitespace (group 1) ended by one whitespace character or end of string.
LINK_HREF_BLOCK_RE = re.compile(r'[ \t]*\n?[ \t]*([^\s]+)(?:\s|$)')
# Bare destination used by parse_link_href when ``block`` is false: the
# shortest run of non-whitespace (group 1, possibly empty) that is followed by
# a space, tab, newline or an unescaped ")".
LINK_HREF_INLINE_RE = re.compile(
    r'[ \t]*\n?[ \t]*([^ \t\n]*?)(?:[ \t\n]|'
    r'(?:' + PREVENT_BACKSLASH + r'\)))'
)
# Link title: mandatory leading whitespace, then a double- or single-quoted
# string (group 1 includes the quotes). Inside the quotes a backslash followed
# by punctuation is accepted as an escape; NUL is not allowed.
LINK_TITLE_RE = re.compile(
    r'[ \t\n]+('
    r'"(?:\\' + PUNCTUATION + r'|[^"\x00])*"|'  # "title"
    r"'(?:\\" + PUNCTUATION + r"|[^'\x00])*'"  # 'title'
    r')'
)
# Optional whitespace followed by the ")" that closes an inline link.
PAREN_END_RE = re.compile(r'\s*\)')
# HTML tag name: an ASCII letter followed by ASCII letters, digits or hyphens.
HTML_TAGNAME = r'[A-Za-z][A-Za-z0-9-]*'
# Zero or more HTML attributes. Each one is preceded by whitespace and consists
# of a name, optionally followed by "=" and a value that is either unquoted,
# single-quoted or double-quoted.
HTML_ATTRIBUTES = (
    r'(?:\s+[A-Za-z_:][A-Za-z0-9_.:-]*'
    r'(?:\s*=\s*(?:[^ !"\'=<>`]+|\'[^\']*?\'|"[^\"]*?"))?)*'
)
# Names of HTML tags treated as block-level, kept in alphabetical order and
# grouped here by initial letter.
BLOCK_TAGS = (
    'address', 'article', 'aside',
    'base', 'basefont', 'blockquote', 'body',
    'caption', 'center', 'col', 'colgroup',
    'dd', 'details', 'dialog', 'dir', 'div', 'dl', 'dt',
    'fieldset', 'figcaption', 'figure', 'footer', 'form', 'frame',
    'frameset',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hr', 'html',
    'iframe',
    'legend', 'li', 'link',
    'main', 'menu', 'menuitem', 'meta',
    'nav', 'noframes',
    'ol', 'optgroup', 'option',
    'p', 'param',
    'section', 'source', 'summary',
    'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'title', 'tr', 'track',
    'ul',
)

# Names of HTML tags listed separately from BLOCK_TAGS as "pre" tags.
PRE_TAGS = (
    'pre',
    'script',
    'style',
    'textarea',
)
# A link label body immediately followed by its closing "]".
_INLINE_LINK_LABEL_RE = re.compile(LINK_LABEL + r'\]')
# A "[" or "]" that is not backslash-escaped. Because of PREVENT_BACKSLASH the
# matched text may also contain the backslash pairs that directly precede the
# bracket; the bracket itself is always the last matched character.
_INLINE_SQUARE_BRACKET_RE = re.compile(PREVENT_BACKSLASH + r'[\[\]]')
# A backslash followed by one punctuation character (captured as group 1).
_ESCAPE_CHAR_RE = re.compile(r'\\(' + PUNCTUATION + r')')
def unescape_char(text):
    """Return ``text`` with backslash escapes of punctuation resolved.

    Every occurrence matched by ``_ESCAPE_CHAR_RE`` (a backslash followed by
    a punctuation character) is replaced by the punctuation character alone.
    """
    def _escaped_char(match):
        # Group 1 is the punctuation character that followed the backslash.
        return match.group(1)

    return _ESCAPE_CHAR_RE.sub(_escaped_char, text)
def parse_link_text(src, pos):
    """Find the ``]`` that closes a link text, honouring nested brackets.

    Scanning starts at ``pos`` with a nesting depth of 1, i.e. the opening
    ``[`` is expected to have been consumed by the caller already. Each
    unescaped ``[`` increases the depth and each unescaped ``]`` decreases
    it; the scan stops when the depth reaches zero.

    :param src: the text being parsed.
    :param pos: index at which the link text starts.
    :return: ``(text, end)`` where ``text`` is ``src`` from ``pos`` up to
        (not including) the closing ``]`` and ``end`` is the index just
        after that bracket, or ``(None, None)`` when no balanced closing
        bracket is found.
    """
    level = 1
    start_pos = pos

    while pos < len(src):
        m = _INLINE_SQUARE_BRACKET_RE.search(src, pos)
        if not m:
            break

        pos = m.end()
        # The match may start with backslash pairs (escaped backslashes)
        # that precede the bracket, so look only at the final character
        # instead of comparing the whole matched text with ']'.
        marker = src[pos - 1]
        if marker == ']':
            level -= 1
            if level == 0:
                # ``pos`` is just past the closing bracket; exclude it.
                return src[start_pos:pos - 1], pos
        else:
            level += 1

    return None, None
def parse_link_label(src, start_pos):
    """Match a link label that begins at ``start_pos`` and ends with ``]``.

    :param src: the text being parsed.
    :param start_pos: index at which the label body starts.
    :return: ``(label, end)`` where ``label`` is the matched text without
        the closing ``]`` and ``end`` is the index just after that bracket,
        or ``(None, None)`` when ``_INLINE_LINK_LABEL_RE`` does not match.
    """
    matched = _INLINE_LINK_LABEL_RE.match(src, start_pos)
    if matched is None:
        return None, None

    end_pos = matched.end()
    # The pattern consumes the closing bracket too; leave it out of the label.
    return src[matched.start():end_pos - 1], end_pos
def parse_link_href(src, start_pos, block=False):
    """Parse a link destination starting at ``start_pos``.

    Two forms are recognised:

    * an angle-bracketed destination ``<...>`` (optionally preceded by
      whitespace), matched with ``LINK_BRACKET_RE``;
    * a bare destination, matched with ``LINK_HREF_BLOCK_RE`` when ``block``
      is true and with ``LINK_HREF_INLINE_RE`` otherwise.

    :param src: the text being parsed.
    :param start_pos: index at which to start matching.
    :param block: select the block-level pattern for bare destinations.
    :return: ``(href, pos)`` or ``(None, None)`` when nothing matches. For
        the angle-bracketed form ``pos`` is the index after ``>``. For a
        bare destination ``pos`` is the index of the terminating character
        (whitespace or ``)``), or the end of the match when the block
        pattern stopped at the end of the string.
    """
    m = LINK_BRACKET_START.match(src, start_pos)
    if m:
        # Step back onto the "<" so LINK_BRACKET_RE can match from it.
        start_pos = m.end() - 1
        m = LINK_BRACKET_RE.match(src, start_pos)
        if m:
            return m.group(1), m.end()
        return None, None

    if block:
        m = LINK_HREF_BLOCK_RE.match(src, start_pos)
    else:
        m = LINK_HREF_INLINE_RE.match(src, start_pos)

    if not m:
        return None, None

    end_pos = m.end()
    if block:
        href = m.group(1)
        if src[end_pos - 1] == href[-1]:
            # The pattern ended at end-of-string: no terminator was consumed.
            return href, end_pos
        return href, end_pos - 1

    # The inline pattern always consumes exactly one terminating character
    # (whitespace or an unescaped ")"). Backslash pairs sitting directly
    # before ")" are matched by the terminator alternative rather than by
    # group 1, so take the href from ``src`` to keep those escaped
    # backslashes as part of the destination.
    return src[m.start(1):end_pos - 1], end_pos - 1
def parse_link_title(src, start_pos, max_pos):
    """Parse a quoted link title located between ``start_pos`` and ``max_pos``.

    :param src: the text being parsed.
    :param start_pos: index at which to start matching.
    :param max_pos: index at which matching must stop (passed to the regex
        as its end position).
    :return: ``(title, end)`` where ``title`` has its surrounding quotes
        removed and backslash escapes resolved by ``unescape_char``, and
        ``end`` is the index after the closing quote; ``(None, None)`` when
        ``LINK_TITLE_RE`` does not match.
    """
    found = LINK_TITLE_RE.match(src, start_pos, max_pos)
    if found is None:
        return None, None

    quoted = found.group(1)
    # Strip the opening and closing quote characters before unescaping.
    return unescape_char(quoted[1:-1]), found.end()
def parse_link(src, pos):
    """Parse the destination, optional title and closing ``)`` of a link.

    :param src: the text being parsed.
    :param pos: index at which the link destination starts.
    :return: ``(attrs, end)`` where ``attrs`` is a dict with a ``'url'`` key
        (the unescaped href passed through ``escape_url``) and, when the
        title is non-empty, a ``'title'`` key, and ``end`` is the index
        after the closing ``)``; ``(None, None)`` when the destination or
        the closing parenthesis cannot be matched.
    """
    href, href_pos = parse_link_href(src, pos)
    if href is None:
        return None, None

    title, title_pos = parse_link_title(src, href_pos, len(src))
    # Continue after the title when there is one, else after the href.
    closing = PAREN_END_RE.match(src, title_pos or href_pos)
    if closing is None:
        return None, None

    attrs = {'url': escape_url(unescape_char(href))}
    if title:
        attrs['title'] = title
    return attrs, closing.end()
|