import re

from .core import BlockState
from .util import (
    strip_end,
    expand_tab,
    expand_leading_tab,
)


LIST_PATTERN = (
    r'^(?P<list_1> {0,3})'
    r'(?P<list_2>[\*\+-]|\d{1,9}[.)])'
    r'(?P<list_3>[ \t]*|[ \t].+)$'
)
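# For a line like '  1. item' this captures list_1='  ' (up to three spaces of
# indentation), list_2='1.' (the bullet or ordered marker), and list_3=' item'
# (everything after the marker, possibly empty).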


_LINE_HAS_TEXT = re.compile(r'( *)\S')


def parse_list(block, m: re.Match, state: BlockState) -> int:
    """Parse tokens for ordered and unordered lists."""
    text = m.group('list_3')
    if not text.strip():
        # CommonMark: an empty list item cannot interrupt a paragraph
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

    marker = m.group('list_2')
    ordered = len(marker) > 1
    depth = state.depth()
    token = {
        'type': 'list',
        'children': [],
        'tight': True,
        'bullet': marker[-1],
        'attrs': {
            'depth': depth,
            'ordered': ordered,
        },
    }
    if ordered:
        start = int(marker[:-1])
        if start != 1:
            # CommonMark: only an ordered list starting with 1 can
            # interrupt a paragraph
            end_pos = state.append_paragraph()
            if end_pos:
                return end_pos
            token['attrs']['start'] = start

    state.cursor = m.end() + 1
    groups = (m.group('list_1'), marker, text)

    if depth >= block.max_nested_level - 1:
        rules = list(block.list_rules)
        rules.remove('list')
    else:
        rules = block.list_rules

    bullet = _get_list_bullet(marker[-1])
    while groups:
        groups = _parse_list_item(block, bullet, groups, token, state, rules)

    end_pos = token.pop('_end_pos', None)
    _transform_tight_list(token)
    if end_pos:
        index = token.pop('_tok_index')
        state.tokens.insert(index, token)
        return end_pos

    state.append_token(token)
    return state.cursor
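# For input such as '- foo\n- bar\n' the appended token looks roughly like:
# {'type': 'list', 'tight': True, 'bullet': '-',
#  'attrs': {'depth': 0, 'ordered': False},
#  'children': [{'type': 'list_item', 'children': [...]},
#               {'type': 'list_item', 'children': [...]}]}
# where each item's children are produced by block.parse() on the item's text.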


def _transform_tight_list(token):
    if token['tight']:
        # tight list items are rendered without paragraph wrapping,
        # so demote their paragraphs to plain block text
        for list_item in token['children']:
            for tok in list_item['children']:
                if tok['type'] == 'paragraph':
                    tok['type'] = 'block_text'
                elif tok['type'] == 'list':
                    _transform_tight_list(tok)
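# Loose lists are left untouched; nested list tokens are handled by the
# recursive call, each according to its own 'tight' flag.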


def _parse_list_item(block, bullet, groups, token, state, rules):
    spaces, marker, text = groups

    leading_width = len(spaces) + len(marker)
    text, continue_width = _compile_continue_width(text, leading_width)
    item_pattern = _compile_list_item_pattern(bullet, leading_width)
    # patterns that can end this item: a sibling item, a new list,
    # or another block
    pairs = [
        ('thematic_break', block.specification['thematic_break']),
        ('fenced_code', block.specification['fenced_code']),
        ('axt_heading', block.specification['axt_heading']),
        ('block_quote', block.specification['block_quote']),
        ('block_html', block.specification['block_html']),
        ('list', block.specification['list']),
    ]
    if leading_width < 3:
        _repl_w = str(leading_width)
        pairs = [(n, p.replace('3', _repl_w, 1)) for n, p in pairs]

    pairs.insert(1, ('list_item', item_pattern))
    regex = '|'.join(r'(?P<%s>(?<=\n)%s)' % pair for pair in pairs)
    sc = re.compile(regex, re.M)

    src = ''
    next_group = None
    prev_blank_line = False
    pos = state.cursor

    continue_space = ' ' * continue_width
    while pos < state.cursor_max:
        pos = state.find_line_end()
        line = state.get_text(pos)
        if block.BLANK_LINE.match(line):
            src += '\n'
            prev_blank_line = True
            state.cursor = pos
            continue

        line = expand_leading_tab(line)
        if line.startswith(continue_space):
            if prev_blank_line and not text and not src.strip():
                # CommonMark: a list item can begin with at most one blank line
                break

            src += line
            prev_blank_line = False
            state.cursor = pos
            continue

        m = sc.match(state.src, state.cursor)
        if m:
            tok_type = m.lastgroup
            if tok_type == 'list_item':
                if prev_blank_line:
                    token['tight'] = False
                next_group = (
                    m.group('listitem_1'),
                    m.group('listitem_2'),
                    m.group('listitem_3')
                )
                state.cursor = m.end() + 1
                break

            if tok_type == 'list':
                break

            tok_index = len(state.tokens)
            end_pos = block.parse_method(m, state)
            if end_pos:
                token['_tok_index'] = tok_index
                token['_end_pos'] = end_pos
                break

        if prev_blank_line and not line.startswith(continue_space):
            # the previous line is blank and this one is not indented
            # enough to continue the item
            break

        src += line
        state.cursor = pos

    text += _clean_list_item_text(src, continue_width)
    child = state.child_state(strip_end(text))

    block.parse(child, rules)

    if token['tight'] and _is_loose_list(child.tokens):
        token['tight'] = False

    token['children'].append({
        'type': 'list_item',
        'children': child.tokens,
    })
    if next_group:
        return next_group
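# Returns the (spaces, marker, text) groups of the next sibling item when its
# marker was found, so parse_list keeps looping; returns None implicitly when
# the list ends, which stops the loop in parse_list.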


def _get_list_bullet(c):
    if c == '.':
        bullet = r'\d{0,9}\.'
    elif c == ')':
        bullet = r'\d{0,9}\)'
    elif c == '*':
        bullet = r'\*'
    elif c == '+':
        bullet = r'\+'
    else:
        bullet = '-'
    return bullet
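# e.g. _get_list_bullet('.') == r'\d{0,9}\.'; the returned pattern matches the
# next item's marker, so switching marker families ('.', ')', '-', '*', '+')
# starts a new list rather than a new item.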


def _compile_list_item_pattern(bullet, leading_width):
    if leading_width > 3:
        leading_width = 3
    return (
        r'^(?P<listitem_1> {0,' + str(leading_width) + '})'
        r'(?P<listitem_2>' + bullet + ')'
        r'(?P<listitem_3>[ \t]*|[ \t][^\n]+)$'
    )
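# e.g. _compile_list_item_pattern(r'\*', 2) returns
# r'^(?P<listitem_1> {0,2})(?P<listitem_2>\*)(?P<listitem_3>[ \t]*|[ \t][^\n]+)$'
# so a sibling marker may be indented by at most leading_width spaces (capped at 3).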


def _compile_continue_width(text, leading_width):
    text = expand_leading_tab(text, 3)
    text = expand_tab(text)

    m2 = _LINE_HAS_TEXT.match(text)
    if m2:
        # when the item text starts with an indented code block, the
        # content column is one space after the marker
        if text.startswith('     '):
            space_width = 1
        else:
            space_width = len(m2.group(1))

        text = text[space_width:] + '\n'
    else:
        space_width = 1
        text = ''

    continue_width = leading_width + space_width
    return text, continue_width
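# Worked examples with leading_width=2 (a one-character marker indented by one
# space):
#   _compile_continue_width(' item', 2)           -> ('item\n', 3)
#   _compile_continue_width('   item', 2)         -> ('item\n', 5)
#   _compile_continue_width(' ' * 6 + 'code', 2)  -> ('     code\n', 3)
# The last input starts with five or more spaces, so only one space is
# consumed and the remainder stays as an indented code block inside the item.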


def _clean_list_item_text(src, continue_width):
    # strip the continuation indent from each collected line
    rv = []
    trim_space = ' ' * continue_width
    lines = src.split('\n')
    for line in lines:
        if line.startswith(trim_space):
            line = line.replace(trim_space, '', 1)
            # expand any tabs that remain after the indent was removed
            line = expand_tab(line)
            rv.append(line)
        else:
            rv.append(line)

    return '\n'.join(rv)
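# e.g. _clean_list_item_text('  bar\n\n  baz\n', 2) == 'bar\n\nbaz\n':
# continuation lines lose the two-space indent and blank lines pass through.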


def _is_loose_list(tokens):
    paragraph_count = 0
    for tok in tokens:
        if tok['type'] == 'blank_line':
            return True
        if tok['type'] == 'paragraph':
            paragraph_count += 1
            if paragraph_count > 1:
                return True
    return False
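# e.g. an item whose source was 'foo\n\nbar' parses into two paragraph tokens,
# so the enclosing list becomes loose; a single-paragraph item with no
# blank_line token keeps the list tight.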