Upload folder using huggingface_hub

d1ceb73 verified 11 months ago

16 kB

	import re
	from typing import Optional, List, Tuple, Match
	from .util import (
	unikey,
	escape_url,
	expand_tab,
	expand_leading_tab,
	)
	from .core import Parser, BlockState
	from .helpers import (
	LINK_LABEL,
	HTML_TAGNAME,
	HTML_ATTRIBUTES,
	BLOCK_TAGS,
	PRE_TAGS,
	unescape_char,
	parse_link_href,
	parse_link_title,
	)
	from .list_parser import parse_list, LIST_PATTERN

	_INDENT_CODE_TRIM = re.compile(r'^ {1,4}', flags=re.M)
	_AXT_HEADING_TRIM = re.compile(r'(\s+\|^)#+\s*$')
	_BLOCK_QUOTE_TRIM = re.compile(r'^ ?', flags=re.M)
	_BLOCK_QUOTE_LEADING = re.compile(r'^ *>', flags=re.M)

	_LINE_BLANK_END = re.compile(r'\n[ \t]*\n$')
	_BLANK_TO_LINE = re.compile(r'[ \t]*\n')

	_BLOCK_TAGS_PATTERN = '\|'.join(BLOCK_TAGS) + '\|' + '\|'.join(PRE_TAGS)
	_OPEN_TAG_END = re.compile(HTML_ATTRIBUTES + r'[ \t]>[ \t](?:\n\|$)')
	_CLOSE_TAG_END = re.compile(r'[ \t]>[ \t](?:\n\|$)')
	_STRICT_BLOCK_QUOTE = re.compile(r'( {0,3}>[^\n]*(?:\n\|$))+')


	class BlockParser(Parser):
	BLANK_LINE = re.compile(r'(^[ \t\v\f]*\n)+', re.M)

	RAW_HTML = (
	r'^ {0,3}('
	r'</?' + HTML_TAGNAME + r'\|'
	r'<!--\|' # comment
	r'<\?\|' # script
	r'<![A-Z]\|'
	r'<!\[CDATA\[)'
	)

	BLOCK_HTML = (
	r'^ {0,3}(?:'
	r'(?:</?' + _BLOCK_TAGS_PATTERN + r'(?:[ \t]+\|\n\|$))'
	r'\|<!--' # comment
	r'\|<\?' # script
	r'\|<![A-Z]'
	r'\|<!\[CDATA\[)'
	)

	SPECIFICATION = {
	'blank_line': r'(^[ \t\v\f]*\n)+',
	'axt_heading': r'^ {0,3}(?P<axt_1>#{1,6})(?!#+)(?P<axt_2>[ \t]\|[ \t]+.?)$',
	'setex_heading': r'^ {0,3}(?P<setext_1>=\|-){1,}[ \t]*$',
	'fenced_code': (
	r'^(?P<fenced_1> {0,3})(?P<fenced_2>`{3,}\|~{3,})'
	r'[ \t](?P<fenced_3>.?)$'
	),
	'indent_code': (
	r'^(?: {4}\| *\t)[^\n]+(?:\n+\|$)'
	r'((?:(?: {4}\| \t)[^\n]+(?:\n+\|$))\|\s)'
	),
	'thematic_break': r'^ {0,3}((?:-[ \t]){3,}\|(?:_[ \t]){3,}\|(?:\[ \t]){3,})$',
	'ref_link': r'^ {0,3}\[(?P<reflink_1>' + LINK_LABEL + r')\]:',
	'block_quote': r'^ {0,3}>(?P<quote_1>.*?)$',
	'list': LIST_PATTERN,
	'block_html': BLOCK_HTML,
	'raw_html': RAW_HTML,
	}

	DEFAULT_RULES = (
	'fenced_code',
	'indent_code',
	'axt_heading',
	'setex_heading',
	'thematic_break',
	'block_quote',
	'list',
	'ref_link',
	'raw_html',
	'blank_line',
	)

	def __init__(
	self,
	block_quote_rules: Optional[List[str]]=None,
	list_rules: Optional[List[str]]=None,
	max_nested_level: int=6
	):
	super(BlockParser, self).__init__()

	if block_quote_rules is None:
	block_quote_rules = list(self.DEFAULT_RULES)

	if list_rules is None:
	list_rules = list(self.DEFAULT_RULES)

	self.block_quote_rules = block_quote_rules
	self.list_rules = list_rules
	self.max_nested_level = max_nested_level
	# register default parse methods
	self._methods = {
	name: getattr(self, 'parse_' + name) for name in self.SPECIFICATION
	}

	def parse_blank_line(self, m: Match, state: BlockState) -> int:
	"""Parse token for blank lines."""
	state.append_token({'type': 'blank_line'})
	return m.end()

	def parse_thematic_break(self, m: Match, state: BlockState) -> int:
	"""Parse token for thematic break, e.g. ``<hr>`` tag in HTML."""
	state.append_token({'type': 'thematic_break'})
	# $ does not count '\n'
	return m.end() + 1

	def parse_indent_code(self, m: Match, state: BlockState) -> int:
	"""Parse token for code block which is indented by 4 spaces."""
	# it is a part of the paragraph
	end_pos = state.append_paragraph()
	if end_pos:
	return end_pos

	code = m.group(0)
	code = expand_leading_tab(code)
	code = _INDENT_CODE_TRIM.sub('', code)
	code = code.strip('\n')
	state.append_token({'type': 'block_code', 'raw': code, 'style': 'indent'})
	return m.end()

	def parse_fenced_code(self, m: Match, state: BlockState) -> Optional[int]:
	"""Parse token for fenced code block. A fenced code block is started with
	3 or more backtick(`) or tilde(~).

	An example of a fenced code block:

	.. code-block:: markdown

	```python
	def markdown(text):
	return mistune.html(text)
	```
	"""
	spaces = m.group('fenced_1')
	marker = m.group('fenced_2')
	info = m.group('fenced_3')

	c = marker[0]
	if info and c == '`':
	# CommonMark Example 145
	# Info strings for backtick code blocks cannot contain backticks
	if info.find(c) != -1:
	return

	_end = re.compile(
	r'^ {0,3}' + c + '{' + str(len(marker)) + r',}[ \t]*(?:\n\|$)', re.M)
	cursor_start = m.end() + 1

	m2 = _end.search(state.src, cursor_start)
	if m2:
	code = state.src[cursor_start:m2.start()]
	end_pos = m2.end()
	else:
	code = state.src[cursor_start:]
	end_pos = state.cursor_max

	if spaces and code:
	_trim_pattern = re.compile('^ {0,' + str(len(spaces)) + '}', re.M)
	code = _trim_pattern.sub('', code)

	token = {'type': 'block_code', 'raw': code, 'style': 'fenced', 'marker': marker}
	if info:
	info = unescape_char(info)
	token['attrs'] = {'info': info.strip()}

	state.append_token(token)
	return end_pos

	def parse_axt_heading(self, m: Match, state: BlockState) -> int:
	"""Parse token for AXT heading. An AXT heading is started with 1 to 6
	symbol of ``#``."""
	level = len(m.group('axt_1'))
	text = m.group('axt_2').strip()
	# remove last #
	if text:
	text = _AXT_HEADING_TRIM.sub('', text)

	token = {'type': 'heading', 'text': text, 'attrs': {'level': level}, 'style': 'axt'}
	state.append_token(token)
	return m.end() + 1

	def parse_setex_heading(self, m: Match, state: BlockState) -> Optional[int]:
	"""Parse token for setex style heading. A setex heading syntax looks like:

	.. code-block:: markdown

	H1 title
	========
	"""
	last_token = state.last_token()
	if last_token and last_token['type'] == 'paragraph':
	level = 1 if m.group('setext_1') == '=' else 2
	last_token['type'] = 'heading'
	last_token['style'] = 'setext'
	last_token['attrs'] = {'level': level}
	return m.end() + 1

	sc = self.compile_sc(['thematic_break', 'list'])
	m = sc.match(state.src, state.cursor)
	if m:
	return self.parse_method(m, state)

	def parse_ref_link(self, m: Match, state: BlockState) -> Optional[int]:
	"""Parse link references and save the link information into ``state.env``.

	Here is an example of a link reference:

	.. code-block:: markdown

	a [link][example]

	[example]: https://example.com "Optional title"

	This method will save the link reference into ``state.env`` as::

	state.env['ref_links']['example'] = {
	'url': 'https://example.com',
	'title': "Optional title",
	}
	"""
	end_pos = state.append_paragraph()
	if end_pos:
	return end_pos

	label = m.group('reflink_1')
	key = unikey(label)
	if not key:
	return

	href, href_pos = parse_link_href(state.src, m.end(), block=True)
	if href is None:
	return

	_blank = self.BLANK_LINE.search(state.src, href_pos)
	if _blank:
	max_pos = _blank.start()
	else:
	max_pos = state.cursor_max

	title, title_pos = parse_link_title(state.src, href_pos, max_pos)
	if title_pos:
	m = _BLANK_TO_LINE.match(state.src, title_pos)
	if m:
	title_pos = m.end()
	else:
	title_pos = None
	title = None

	if title_pos is None:
	m = _BLANK_TO_LINE.match(state.src, href_pos)
	if m:
	href_pos = m.end()
	else:
	href_pos = None
	href = None

	end_pos = title_pos or href_pos
	if not end_pos:
	return

	if key not in state.env['ref_links']:
	href = unescape_char(href)
	data = {'url': escape_url(href), 'label': label}
	if title:
	data['title'] = title
	state.env['ref_links'][key] = data
	return end_pos

	def extract_block_quote(self, m: Match, state: BlockState) -> Tuple[str, int]:
	"""Extract text and cursor end position of a block quote."""

	# cleanup at first to detect if it is code block
	text = m.group('quote_1') + '\n'
	text = expand_leading_tab(text, 3)
	text = _BLOCK_QUOTE_TRIM.sub('', text)

	sc = self.compile_sc(['blank_line', 'indent_code', 'fenced_code'])
	require_marker = bool(sc.match(text))

	state.cursor = m.end() + 1

	end_pos = None
	if require_marker:
	m = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
	if m:
	quote = m.group(0)
	quote = _BLOCK_QUOTE_LEADING.sub('', quote)
	quote = expand_leading_tab(quote, 3)
	quote = _BLOCK_QUOTE_TRIM.sub('', quote)
	text += quote
	state.cursor = m.end()
	else:
	prev_blank_line = False
	break_sc = self.compile_sc([
	'blank_line', 'thematic_break', 'fenced_code',
	'list', 'block_html',
	])
	while state.cursor < state.cursor_max:
	m = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
	if m:
	quote = m.group(0)
	quote = _BLOCK_QUOTE_LEADING.sub('', quote)
	quote = expand_leading_tab(quote, 3)
	quote = _BLOCK_QUOTE_TRIM.sub('', quote)
	text += quote
	state.cursor = m.end()
	if not quote.strip():
	prev_blank_line = True
	else:
	prev_blank_line = bool(_LINE_BLANK_END.search(quote))
	continue

	if prev_blank_line:
	# CommonMark Example 249
	# because of laziness, a blank line is needed between
	# a block quote and a following paragraph
	break

	m = break_sc.match(state.src, state.cursor)
	if m:
	end_pos = self.parse_method(m, state)
	if end_pos:
	break

	# lazy continuation line
	pos = state.find_line_end()
	line = state.get_text(pos)
	line = expand_leading_tab(line, 3)
	text += line
	state.cursor = pos

	# according to CommonMark Example 6, the second tab should be
	# treated as 4 spaces
	return expand_tab(text), end_pos

	def parse_block_quote(self, m: Match, state: BlockState) -> int:
	"""Parse token for block quote. Here is an example of the syntax:

	.. code-block:: markdown

	> a block quote starts
	> with right arrows
	"""
	text, end_pos = self.extract_block_quote(m, state)
	# scan children state
	child = state.child_state(text)
	if state.depth() >= self.max_nested_level - 1:
	rules = list(self.block_quote_rules)
	rules.remove('block_quote')
	else:
	rules = self.block_quote_rules

	self.parse(child, rules)
	token = {'type': 'block_quote', 'children': child.tokens}
	if end_pos:
	state.prepend_token(token)
	return end_pos
	state.append_token(token)
	return state.cursor

	def parse_list(self, m: Match, state: BlockState) -> int:
	"""Parse tokens for ordered and unordered list."""
	return parse_list(self, m, state)

	def parse_block_html(self, m: Match, state: BlockState) -> Optional[int]:
	return self.parse_raw_html(m, state)

	def parse_raw_html(self, m: Match, state: BlockState) -> Optional[int]:
	marker = m.group(0).strip()

	# rule 2
	if marker == '<!--':
	return _parse_html_to_end(state, '-->', m.end())

	# rule 3
	if marker == '<?':
	return _parse_html_to_end(state, '?>', m.end())

	# rule 5
	if marker == '<![CDATA[':
	return _parse_html_to_end(state, ']]>', m.end())

	# rule 4
	if marker.startswith('<!'):
	return _parse_html_to_end(state, '>', m.end())

	close_tag = None
	open_tag = None
	if marker.startswith('</'):
	close_tag = marker[2:].lower()
	# rule 6
	if close_tag in BLOCK_TAGS:
	return _parse_html_to_newline(state, self.BLANK_LINE)
	else:
	open_tag = marker[1:].lower()
	# rule 1
	if open_tag in PRE_TAGS:
	end_tag = '</' + open_tag + '>'
	return _parse_html_to_end(state, end_tag, m.end())
	# rule 6
	if open_tag in BLOCK_TAGS:
	return _parse_html_to_newline(state, self.BLANK_LINE)

	# Blocks of type 7 may not interrupt a paragraph.
	end_pos = state.append_paragraph()
	if end_pos:
	return end_pos

	# rule 7
	start_pos = m.end()
	end_pos = state.find_line_end()
	if (open_tag and _OPEN_TAG_END.match(state.src, start_pos, end_pos)) or \
	(close_tag and _CLOSE_TAG_END.match(state.src, start_pos, end_pos)):
	return _parse_html_to_newline(state, self.BLANK_LINE)

	def parse(self, state: BlockState, rules: Optional[List[str]]=None) -> None:
	sc = self.compile_sc(rules)

	while state.cursor < state.cursor_max:
	m = sc.search(state.src, state.cursor)
	if not m:
	break

	end_pos = m.start()
	if end_pos > state.cursor:
	text = state.get_text(end_pos)
	state.add_paragraph(text)
	state.cursor = end_pos

	end_pos = self.parse_method(m, state)
	if end_pos:
	state.cursor = end_pos
	else:
	end_pos = state.find_line_end()
	text = state.get_text(end_pos)
	state.add_paragraph(text)
	state.cursor = end_pos

	if state.cursor < state.cursor_max:
	text = state.src[state.cursor:]
	state.add_paragraph(text)
	state.cursor = state.cursor_max


	def _parse_html_to_end(state, end_marker, start_pos):
	marker_pos = state.src.find(end_marker, start_pos)
	if marker_pos == -1:
	text = state.src[state.cursor:]
	end_pos = state.cursor_max
	else:
	text = state.get_text(marker_pos)
	state.cursor = marker_pos
	end_pos = state.find_line_end()
	text += state.get_text(end_pos)

	state.append_token({'type': 'block_html', 'raw': text})
	return end_pos


	def _parse_html_to_newline(state, newline):
	m = newline.search(state.src, state.cursor)
	if m:
	end_pos = m.start()
	text = state.get_text(end_pos)
	else:
	text = state.src[state.cursor:]
	end_pos = state.cursor_max

	state.append_token({'type': 'block_html', 'raw': text})
	return end_pos