|
|
|
""" |
|
Shim module between Bleach and html5lib. This makes it easier to upgrade the |
|
html5lib library without having to change a lot of code. |
|
""" |
|
|
|
import re |
|
import string |
|
import warnings |
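
# Quiet html5lib's "sanitizer is deprecated" warning coming from the vendored
# copy; Bleach deliberately builds on that sanitizer.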
|
|
|
|
|
|
|
warnings.filterwarnings( |
|
"ignore", |
|
message="html5lib's sanitizer is deprecated", |
|
category=DeprecationWarning, |
|
module="bleach._vendor.html5lib", |
|
) |
|
|
|
from bleach._vendor.html5lib import ( |
|
HTMLParser, |
|
getTreeWalker, |
|
) |
|
from bleach._vendor.html5lib import ( |
|
constants, |
|
) |
|
from bleach._vendor.html5lib.constants import ( |
|
namespaces, |
|
prefixes, |
|
) |
|
from bleach._vendor.html5lib.constants import ( |
|
_ReparseException as ReparseException, |
|
) |
|
from bleach._vendor.html5lib.filters.base import ( |
|
Filter, |
|
) |
|
from bleach._vendor.html5lib.filters.sanitizer import ( |
|
allowed_protocols, |
|
allowed_css_properties, |
|
allowed_svg_properties, |
|
attr_val_is_uri, |
|
svg_attr_val_allows_ref, |
|
svg_allow_local_href, |
|
) |
|
from bleach._vendor.html5lib.filters.sanitizer import ( |
|
Filter as SanitizerFilter, |
|
) |
|
from bleach._vendor.html5lib._inputstream import ( |
|
HTMLInputStream, |
|
) |
|
from bleach._vendor.html5lib.serializer import ( |
|
escape, |
|
HTMLSerializer, |
|
) |
|
from bleach._vendor.html5lib._tokenizer import ( |
|
attributeMap, |
|
HTMLTokenizer, |
|
) |
|
from bleach._vendor.html5lib._trie import ( |
|
Trie, |
|
) |
|
|
|
|
|
|
|
#: Map of entity name to its expanded form, from html5lib's entity table
ENTITIES = constants.entities
|
|
|
|
|
#: Trie of entity names, used to match possible entities one character at a time
ENTITIES_TRIE = Trie(ENTITIES)
|
|
|
|
|
#: Token types that represent tags
TAG_TOKEN_TYPES = {
|
constants.tokenTypes["StartTag"], |
|
constants.tokenTypes["EndTag"], |
|
constants.tokenTypes["EmptyTag"], |
|
} |
|
TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"] |
|
TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"] |
|
TAG_TOKEN_TYPE_CHARACTERS = constants.tokenTypes["Characters"] |
|
TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"] |
|
|
|
|
|
|
|
|
|
#: Set of HTML tag names, including some obsolete ones such as keygen and param
HTML_TAGS = frozenset(
|
( |
|
"a", |
|
"abbr", |
|
"address", |
|
"area", |
|
"article", |
|
"aside", |
|
"audio", |
|
"b", |
|
"base", |
|
"bdi", |
|
"bdo", |
|
"blockquote", |
|
"body", |
|
"br", |
|
"button", |
|
"canvas", |
|
"caption", |
|
"cite", |
|
"code", |
|
"col", |
|
"colgroup", |
|
"data", |
|
"datalist", |
|
"dd", |
|
"del", |
|
"details", |
|
"dfn", |
|
"dialog", |
|
"div", |
|
"dl", |
|
"dt", |
|
"em", |
|
"embed", |
|
"fieldset", |
|
"figcaption", |
|
"figure", |
|
"footer", |
|
"form", |
|
"h1", |
|
"h2", |
|
"h3", |
|
"h4", |
|
"h5", |
|
"h6", |
|
"head", |
|
"header", |
|
"hgroup", |
|
"hr", |
|
"html", |
|
"i", |
|
"iframe", |
|
"img", |
|
"input", |
|
"ins", |
|
"kbd", |
|
"keygen", |
|
"label", |
|
"legend", |
|
"li", |
|
"link", |
|
"map", |
|
"mark", |
|
"menu", |
|
"meta", |
|
"meter", |
|
"nav", |
|
"noscript", |
|
"object", |
|
"ol", |
|
"optgroup", |
|
"option", |
|
"output", |
|
"p", |
|
"param", |
|
"picture", |
|
"pre", |
|
"progress", |
|
"q", |
|
"rp", |
|
"rt", |
|
"ruby", |
|
"s", |
|
"samp", |
|
"script", |
|
"section", |
|
"select", |
|
"slot", |
|
"small", |
|
"source", |
|
"span", |
|
"strong", |
|
"style", |
|
"sub", |
|
"summary", |
|
"sup", |
|
"table", |
|
"tbody", |
|
"td", |
|
"template", |
|
"textarea", |
|
"tfoot", |
|
"th", |
|
"thead", |
|
"time", |
|
"title", |
|
"tr", |
|
"track", |
|
"u", |
|
"ul", |
|
"var", |
|
"video", |
|
"wbr", |
|
) |
|
) |
|
|
|
|
|
|
|
|
|
|
|
#: Set of block-level HTML tag names
HTML_TAGS_BLOCK_LEVEL = frozenset(
|
( |
|
"address", |
|
"article", |
|
"aside", |
|
"blockquote", |
|
"details", |
|
"dialog", |
|
"dd", |
|
"div", |
|
"dl", |
|
"dt", |
|
"fieldset", |
|
"figcaption", |
|
"figure", |
|
"footer", |
|
"form", |
|
"h1", |
|
"h2", |
|
"h3", |
|
"h4", |
|
"h5", |
|
"h6", |
|
"header", |
|
"hgroup", |
|
"hr", |
|
"li", |
|
"main", |
|
"nav", |
|
"ol", |
|
"p", |
|
"pre", |
|
"section", |
|
"table", |
|
"ul", |
|
) |
|
) |
|
|
|
|
|
class InputStreamWithMemory: |
|
"""Wraps an HTMLInputStream to remember characters since last < |
|
|
|
This wraps existing HTMLInputStream classes to keep track of the stream |
|
since the last < which marked an open tag state. |
|
|
|
""" |
|
|
|
def __init__(self, inner_stream): |
|
self._inner_stream = inner_stream |
|
self.reset = self._inner_stream.reset |
|
self.position = self._inner_stream.position |
|
self._buffer = [] |
|
|
|
@property |
|
def errors(self): |
|
return self._inner_stream.errors |
|
|
|
@property |
|
def charEncoding(self): |
|
return self._inner_stream.charEncoding |
|
|
|
@property |
|
def changeEncoding(self): |
|
return self._inner_stream.changeEncoding |
|
|
|
def char(self): |
|
c = self._inner_stream.char() |
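
        # char() returns EOF (None) at the end of the stream, so only buffer
        # real characters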
|
|
|
if c: |
|
self._buffer.append(c) |
|
return c |
|
|
|
def charsUntil(self, characters, opposite=False): |
|
chars = self._inner_stream.charsUntil(characters, opposite=opposite) |
|
self._buffer.extend(list(chars)) |
|
return chars |
|
|
|
def unget(self, char): |
|
if self._buffer: |
|
self._buffer.pop(-1) |
|
return self._inner_stream.unget(char) |
|
|
|
def get_tag(self): |
|
"""Returns the stream history since last '<' |
|
|
|
        Since the buffer starts at the last '<' as seen by tagOpenState(),
|
we know that everything from that point to when this method is called |
|
is the "tag" that is being tokenized. |
|
|
|
""" |
|
return "".join(self._buffer) |
|
|
|
def start_tag(self): |
|
"""Resets stream history to just '<' |
|
|
|
This gets called by tagOpenState() which marks a '<' that denotes an |
|
open tag. Any time we see that, we reset the buffer. |
|
|
|
""" |
|
self._buffer = ["<"] |
|
|
|
|
|
class BleachHTMLTokenizer(HTMLTokenizer): |
|
"""Tokenizer that doesn't consume character entities""" |
|
|
|
def __init__(self, consume_entities=False, **kwargs): |
|
super().__init__(**kwargs) |
|
|
|
self.consume_entities = consume_entities |
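
        # Wrap the stream with one that remembers the history since the
        # last "<", so the original text of a tag can be recovered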
|
|
|
|
|
self.stream = InputStreamWithMemory(self.stream) |
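
        # Keep track of the last token emitted; used when stripping
        # disallowed block-level tags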
|
|
|
|
|
self.emitted_last_token = None |
|
|
|
def __iter__(self): |
|
last_error_token = None |
|
|
|
for token in super().__iter__(): |
|
if last_error_token is not None: |
|
if ( |
|
last_error_token["data"] == "invalid-character-in-attribute-name" |
|
and token["type"] in TAG_TOKEN_TYPES |
|
and token.get("data") |
|
): |
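                    # token["data"] is an html5lib attributeMap of attr name
                    # to attr value; drop attribute names that contain ', "
                    # or <, since those characters are invalid in names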
|
|
|
|
|
|
|
|
|
|
|
|
|
token["data"] = attributeMap( |
|
(attr_name, attr_value) |
|
for attr_name, attr_value in token["data"].items() |
|
if ( |
|
'"' not in attr_name |
|
and "'" not in attr_name |
|
and "<" not in attr_name |
|
) |
|
) |
|
last_error_token = None |
|
yield token |
|
|
|
elif ( |
|
last_error_token["data"] == "expected-closing-tag-but-got-char" |
|
and self.parser.tags is not None |
|
and token["data"].lower().strip() not in self.parser.tags |
|
): |
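                    # The parser hit tag-like text it couldn't parse (e.g.
                    # "</ foo") whose name isn't an allowed tag, so reemit
                    # the raw text from the stream as character data and let
                    # the sanitizer escape it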
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
token["data"] = self.stream.get_tag() |
|
token["type"] = TAG_TOKEN_TYPE_CHARACTERS |
|
|
|
last_error_token = None |
|
yield token |
|
|
|
elif token["type"] == TAG_TOKEN_TYPE_PARSEERROR: |
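                    # Two parse errors in a row: emit the one we were
                    # holding and hold on to this one instead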
|
|
|
|
|
yield last_error_token |
|
last_error_token = token |
|
|
|
else: |
|
yield last_error_token |
|
yield token |
|
last_error_token = None |
|
|
|
continue |
|
|
|
|
|
|
|
            # Hold on to parse errors until the next token arrives, since the
            # pair may need rewriting (see above)
            if token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
|
last_error_token = token |
|
continue |
|
|
|
yield token |
|
|
|
if last_error_token: |
|
if last_error_token["data"] == "eof-in-tag-name": |
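                # The text ended with "<" followed by characters; treat the
                # truncated tag name as character data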
|
|
|
|
|
|
|
|
|
yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()} |
|
elif last_error_token["data"] in ( |
|
"eof-in-attribute-name", |
|
"eof-in-attribute-value-no-quotes", |
|
): |
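                # The text ended inside an attribute of a tag that never
                # closed; treat the whole tag-like run as character data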
|
|
|
|
|
|
|
|
|
|
|
yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()} |
|
else: |
|
yield last_error_token |
|
|
|
def consumeEntity(self, allowedChar=None, fromAttribute=False): |
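        # If this tokenizer is set to consume entities, the superclass
        # behavior is what we want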
|
|
|
|
|
if self.consume_entities: |
|
return super().consumeEntity(allowedChar, fromAttribute) |
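
        # Otherwise we don't consume or convert entities. By the time this
        # is called, the tokenizer has already consumed the "&", so put it
        # back into the token stream as-is.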
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if fromAttribute: |
|
self.currentToken["data"][-1][1] += "&" |
|
|
|
else: |
|
self.tokenQueue.append({"type": TAG_TOKEN_TYPE_CHARACTERS, "data": "&"}) |
|
|
|
def tagOpenState(self): |
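        # This state marks a "<" that starts a tag (or a parse error), so
        # reset the stream history to just that "<"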
|
|
|
|
|
|
|
|
|
self.stream.start_tag() |
|
return super().tagOpenState() |
|
|
|
def emitCurrentToken(self): |
|
token = self.currentToken |
|
|
|
if ( |
|
self.parser.tags is not None |
|
and token["type"] in TAG_TOKEN_TYPES |
|
and token["name"].lower() not in self.parser.tags |
|
): |
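            # This is a start/end/empty tag whose name isn't in the allowed
            # list, so it gets stripped or escaped; either way it becomes a
            # Characters token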
|
|
|
|
|
|
|
if self.parser.strip: |
|
if ( |
|
self.emitted_last_token |
|
and token["type"] == TAG_TOKEN_TYPE_START |
|
and token["name"].lower() in HTML_TAGS_BLOCK_LEVEL |
|
): |
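                    # When stripping a block-level start tag, emit a newline
                    # so the words on either side don't run together, e.g.
                    # the "<p>" in "<p>first<p>second"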
|
|
|
|
|
|
|
new_data = "\n" |
|
else: |
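                    # Everything else being stripped is dropped entirely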
|
|
|
|
|
new_data = "" |
|
|
|
else: |
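                # We're escaping the tag, and tokenizing has normalized away
                # the original text, so recover the exact string from the
                # stream history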
|
|
|
|
|
|
|
|
|
|
|
new_data = self.stream.get_tag() |
|
|
|
new_token = {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": new_data} |
|
|
|
self.currentToken = self.emitted_last_token = new_token |
|
self.tokenQueue.append(new_token) |
|
self.state = self.dataState |
|
return |
|
|
|
self.emitted_last_token = self.currentToken |
|
super().emitCurrentToken() |
|
|
|
|
|
class BleachHTMLParser(HTMLParser): |
|
"""Parser that uses BleachHTMLTokenizer""" |
|
|
|
def __init__(self, tags, strip, consume_entities, **kwargs): |
|
""" |
|
:arg tags: set of allowed tags--everything else is either stripped or |
|
escaped; if None, then this doesn't look at tags at all |
|
:arg strip: whether to strip disallowed tags (True) or escape them (False); |
|
if tags=None, then this doesn't have any effect |
|
:arg consume_entities: whether to consume entities (default behavior) or |
|
leave them as is when tokenizing (BleachHTMLTokenizer-added behavior) |
|
|
|
""" |
|
self.tags = ( |
|
frozenset((tag.lower() for tag in tags)) if tags is not None else None |
|
) |
|
self.strip = strip |
|
self.consume_entities = consume_entities |
|
super().__init__(**kwargs) |
|
|
|
def _parse( |
|
self, stream, innerHTML=False, container="div", scripting=True, **kwargs |
|
): |
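        # Override HTMLParser._parse so we can swap in our own tokenizer.
        # scripting=True parses <noscript> as though JavaScript is enabled,
        # matching what browsers do.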
|
|
|
|
|
|
|
|
|
|
|
|
|
self.innerHTMLMode = innerHTML |
|
self.container = container |
|
self.scripting = scripting |
|
self.tokenizer = BleachHTMLTokenizer( |
|
stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs |
|
) |
|
self.reset() |
|
|
|
try: |
|
self.mainLoop() |
|
except ReparseException: |
|
self.reset() |
|
self.mainLoop() |
|
|
|
|
|
def convert_entity(value): |
|
"""Convert an entity (minus the & and ; part) into what it represents |
|
|
|
This handles numeric, hex, and text entities. |
|
|
|
:arg value: the string (minus the ``&`` and ``;`` part) to convert |
|
|
|
:returns: unicode character or None if it's an ambiguous ampersand that |
|
doesn't match a character entity |
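
    Illustrative examples (assuming html5lib's standard entity table)::

        >>> convert_entity("#62")
        '>'
        >>> convert_entity("#x3E")
        '>'
        >>> convert_entity("gt")
        '>'
        >>> convert_entity("xx") is None
        True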
|
|
|
""" |
|
if value[0] == "#": |
|
if len(value) < 2: |
|
return None |
|
|
|
if value[1] in ("x", "X"): |
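            # hex-encoded code point, e.g. "#x3E"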
|
|
|
int_as_string, base = value[2:], 16 |
|
else: |
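            # decimal code point, e.g. "#62"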
|
|
|
int_as_string, base = value[1:], 10 |
|
|
|
if int_as_string == "": |
|
return None |
|
|
|
code_point = int(int_as_string, base) |
|
if 0 < code_point < 0x110000: |
|
return chr(code_point) |
|
else: |
|
return None |
|
|
|
return ENTITIES.get(value, None) |
|
|
|
|
|
def convert_entities(text): |
|
"""Converts all found entities in the text |
|
|
|
:arg text: the text to convert entities in |
|
|
|
:returns: unicode text with converted entities |
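
    Illustrative example (assuming html5lib's standard entity table)::

        >>> convert_entities("&lt;b&gt; &amp; more")
        '<b> & more'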
|
|
|
""" |
|
if "&" not in text: |
|
return text |
|
|
|
new_text = [] |
|
for part in next_possible_entity(text): |
|
if not part: |
|
continue |
|
|
|
if part.startswith("&"): |
|
entity = match_entity(part) |
|
if entity is not None: |
|
converted = convert_entity(entity) |
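
                # If it's not an ambiguous ampersand, replace it with the
                # unicode character; otherwise leave the entity as-is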
|
|
|
|
|
|
|
if converted is not None: |
|
new_text.append(converted) |
|
remainder = part[len(entity) + 2 :] |
|
                    if remainder:
|
new_text.append(remainder) |
|
continue |
|
|
|
new_text.append(part) |
|
|
|
return "".join(new_text) |
|
|
|
|
|
def match_entity(stream): |
|
"""Returns first entity in stream or None if no entity exists |
|
|
|
Note: For Bleach purposes, entities must start with a "&" and end with a |
|
";". This ignores ambiguous character entities that have no ";" at the end. |
|
|
|
:arg stream: the character stream |
|
|
|
:returns: the entity string without "&" or ";" if it's a valid character |
|
entity; ``None`` otherwise |
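
    Illustrative examples (assuming html5lib's standard entity table)::

        >>> match_entity("&amp; etc")
        'amp'
        >>> match_entity("&xx;") is None
        True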
|
|
|
""" |
|
|
|
if stream[0] != "&": |
|
raise ValueError('Stream should begin with "&"') |
|
|
|
stream = stream[1:] |
|
|
|
stream = list(stream) |
|
possible_entity = "" |
|
end_characters = "<&=;" + string.whitespace |
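
    # Handle numeric entities such as "#62" and "#x3E"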
|
|
|
|
|
if stream and stream[0] == "#": |
|
possible_entity = "#" |
|
stream.pop(0) |
|
|
|
if stream and stream[0] in ("x", "X"): |
|
allowed = "0123456789abcdefABCDEF" |
|
possible_entity += stream.pop(0) |
|
else: |
|
allowed = "0123456789" |
|
|
|
|
|
|
|
while stream and stream[0] not in end_characters: |
|
c = stream.pop(0) |
|
if c not in allowed: |
|
break |
|
possible_entity += c |
|
|
|
if possible_entity and stream and stream[0] == ";": |
|
return possible_entity |
|
return None |
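
    # Handle named entities by walking the trie of entity names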
|
|
|
|
|
while stream and stream[0] not in end_characters: |
|
c = stream.pop(0) |
|
possible_entity += c |
|
if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity): |
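            # Not a prefix of any entity name, so there's no entity here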
|
|
|
|
|
return None |
|
|
|
if possible_entity and stream and stream[0] == ";": |
|
return possible_entity |
|
|
|
return None |
|
|
|
|
|
AMP_SPLIT_RE = re.compile("(&)") |
|
|
|
|
|
def next_possible_entity(text): |
|
"""Takes a text and generates a list of possible entities |
|
|
|
:arg text: the text to look at |
|
|
|
:returns: generator where each part (except the first) starts with an |
|
"&" |
|
|
|
""" |
|
for i, part in enumerate(AMP_SPLIT_RE.split(text)): |
|
if i == 0: |
|
yield part |
|
elif i % 2 == 0: |
|
yield "&" + part |
|
|
|
|
|
class BleachHTMLSerializer(HTMLSerializer): |
|
"""HTMLSerializer that undoes & -> & in attributes and sets |
|
escape_rcdata to True |
|
""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
escape_rcdata = True |
|
|
|
def escape_base_amp(self, stoken): |
|
"""Escapes just bare & in HTML attribute values""" |
|
|
|
|
|
|
|
|
|
|
|
        stoken = stoken.replace("&amp;", "&")
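
        # Then escape all bare "&" that aren't part of an unambiguous
        # character entity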
|
|
|
|
|
|
|
for part in next_possible_entity(stoken): |
|
if not part: |
|
continue |
|
|
|
if part.startswith("&"): |
|
entity = match_entity(part) |
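
                # Only leave in entities that are unambiguous; ambiguous
                # ampersands get escaped below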
|
|
|
|
|
if entity is not None and convert_entity(entity) is not None: |
|
yield f"&{entity};" |
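
                    # Skip past the entity: len(entity) + 2 covers the
                    # leading "&" and the trailing ";"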
|
|
|
|
|
|
|
part = part[len(entity) + 2 :] |
|
if part: |
|
yield part |
|
continue |
|
|
|
            yield part.replace("&", "&amp;")
|
|
|
def serialize(self, treewalker, encoding=None): |
|
"""Wrap HTMLSerializer.serialize and conver & to & in attribute values |
|
|
|
        Note that this converts & to &amp; in attribute values where the & isn't
|
already part of an unambiguous character entity. |
|
|
|
""" |
|
in_tag = False |
|
after_equals = False |
|
|
|
for stoken in super().serialize(treewalker, encoding): |
|
if in_tag: |
|
if stoken == ">": |
|
in_tag = False |
|
|
|
elif after_equals: |
|
if stoken != '"': |
|
yield from self.escape_base_amp(stoken) |
|
|
|
after_equals = False |
|
continue |
|
|
|
elif stoken == "=": |
|
after_equals = True |
|
|
|
yield stoken |
|
else: |
|
if stoken.startswith("<"): |
|
in_tag = True |
|
yield stoken |
|
|