Upload folder using huggingface_hub

d1ceb73 verified 11 months ago

23.2 kB

	"""
	babel.plural
	~~~~~~~~~~~~

	CLDR Plural support. See UTS #35.

	:copyright: (c) 2013-2024 by the Babel Team.
	:license: BSD, see LICENSE for more details.
	"""
	from __future__ import annotations

	import decimal
	import re
	from collections.abc import Iterable, Mapping
	from typing import TYPE_CHECKING, Any, Callable

	if TYPE_CHECKING:
	from typing_extensions import Literal

	# Plural category tags accepted as rule keys. The tuple order is also the
	# order used when rendering a rule's repr and when assigning gettext indices.
	_plural_tags = ('zero', 'one', 'two', 'few', 'many', 'other')
	# Tag returned by the generated functions when no explicit rule matches.
	_fallback_tag = 'other'


	def extract_operands(source: float | decimal.Decimal) -> tuple[decimal.Decimal | int, int, int, int, int, int, Literal[0], Literal[0]]:
	    """Extract operands from a decimal, a float or an int, according to `CLDR rules`_.

	    The result is an 8-tuple (n, i, v, w, f, t, c, e), where those symbols are as follows:

	    ====== ===============================================================
	    Symbol Value
	    ------ ---------------------------------------------------------------
	    n      absolute value of the source number (integer and decimals).
	    i      integer digits of n.
	    v      number of visible fraction digits in n, with trailing zeros.
	    w      number of visible fraction digits in n, without trailing zeros.
	    f      visible fractional digits in n, with trailing zeros.
	    t      visible fractional digits in n, without trailing zeros.
	    c      compact decimal exponent value: exponent of the power of 10 used in compact decimal formatting.
	    e      currently, synonym for ‘c’. however, may be redefined in the future.
	    ====== ===============================================================

	    .. _`CLDR rules`: https://www.unicode.org/reports/tr35/tr35-61/tr35-numbers.html#Operands

	    :param source: A real number
	    :type source: int|float|decimal.Decimal
	    :return: A n-i-v-w-f-t-c-e tuple; ``n`` is an ``int`` for integral input
	             and a ``decimal.Decimal`` otherwise, ``c`` and ``e`` are always 0
	    :rtype: tuple[decimal.Decimal|int, int, int, int, int, int, int, int]
	    """
	    n = abs(source)
	    i = int(n)
	    if isinstance(n, float):
	        if i == n:
	            # Integral floats carry no visible fraction digits.
	            n = i
	        else:
	            # Go through the string representation rather than the lossless
	            # float -> Decimal conversion: the exact binary expansion of a
	            # float can run to 53 or more digits, which is almost never what
	            # the caller means.  Callers who do want that can pass in a
	            # pre-converted `Decimal` of the desired accuracy.
	            n = decimal.Decimal(str(n))

	    if isinstance(n, decimal.Decimal):
	        dec_tuple = n.as_tuple()
	        exp = dec_tuple.exponent
	        if exp < 0:
	            # The coefficient omits leading zeros, so a value such as 0.01
	            # has digits (1,) with exponent -2.  Left-pad to -exp characters
	            # so the zeros directly after the decimal point are counted.
	            trailing = ''.join(str(d) for d in dec_tuple.digits[exp:]).rjust(-exp, '0')
	        else:
	            trailing = ''
	        no_trailing = trailing.rstrip('0')
	        v = len(trailing)
	        w = len(no_trailing)
	        f = int(trailing or 0)
	        t = int(no_trailing or 0)
	    else:
	        v = w = f = t = 0
	    c = e = 0  # TODO: c and e are not supported
	    return n, i, v, w, f, t, c, e


	class PluralRule:
	    """Represents a set of language pluralization rules.  The constructor
	    accepts a list of (tag, expr) tuples or a dict of `CLDR rules`_. The
	    resulting object is callable and accepts one parameter with a positive or
	    negative number (both integer and float) for the number that indicates the
	    plural form for a string and returns the tag for the format:

	    >>> rule = PluralRule({'one': 'n is 1'})
	    >>> rule(1)
	    'one'
	    >>> rule(2)
	    'other'

	    Currently the CLDR defines these tags: zero, one, two, few, many and
	    other where other is an implicit default.  Rules should be mutually
	    exclusive; for a given numeric value, only one rule should apply (i.e.
	    the condition should only be true for one of the plural rule elements).

	    .. _`CLDR rules`: https://www.unicode.org/reports/tr35/tr35-33/tr35-numbers.html#Language_Plural_Rules
	    """

	    __slots__ = ('abstract', '_func')

	    def __init__(self, rules: Mapping[str, str] | Iterable[tuple[str, str]]) -> None:
	        """Initialize the rule instance.

	        :param rules: a list of ``(tag, expr)`` tuples with the rules
	                      conforming to UTS #35 or a dict with the tags as keys
	                      and expressions as values.
	        :raise RuleError: if the expression is malformed
	        :raise ValueError: if a tag is unknown or is defined more than once
	        """
	        if isinstance(rules, Mapping):
	            rules = rules.items()
	        found = set()
	        # List of (tag, ast) pairs; rules whose expression parses to nothing
	        # (for example, samples only) are left out.
	        self.abstract: list[tuple[str, Any]] = []
	        # Sorting makes the stored order independent of the input order.
	        for key, expr in sorted(rules):
	            if key not in _plural_tags:
	                raise ValueError(f"unknown tag {key!r}")
	            elif key in found:
	                raise ValueError(f"tag {key!r} defined twice")
	            found.add(key)
	            ast = _Parser(expr).ast
	            if ast:
	                self.abstract.append((key, ast))

	    def __repr__(self) -> str:
	        rules = self.rules
	        args = ", ".join([f"{tag}: {rules[tag]}" for tag in _plural_tags if tag in rules])
	        return f"<{type(self).__name__} {args!r}>"

	    @classmethod
	    def parse(cls, rules: Mapping[str, str] | Iterable[tuple[str, str]] | PluralRule) -> PluralRule:
	        """Create a `PluralRule` instance for the given rules.  If the rules
	        are a `PluralRule` object, that object is returned.

	        :param rules: the rules as list or dict, or a `PluralRule` object
	        :raise RuleError: if the expression is malformed
	        """
	        if isinstance(rules, PluralRule):
	            return rules
	        return cls(rules)

	    @property
	    def rules(self) -> Mapping[str, str]:
	        """The `PluralRule` as a dict of unicode plural rules.

	        >>> rule = PluralRule({'one': 'n is 1'})
	        >>> rule.rules
	        {'one': 'n is 1'}
	        """
	        _compile = _UnicodeCompiler().compile
	        return {tag: _compile(ast) for tag, ast in self.abstract}

	    @property
	    def tags(self) -> frozenset[str]:
	        """A set of explicitly defined tags in this rule.  The implicit default
	        ``'other'`` rules is not part of this set unless there is an explicit
	        rule for it.
	        """
	        return frozenset(i[0] for i in self.abstract)

	    def __getstate__(self) -> list[tuple[str, Any]]:
	        # Only the parsed rules are pickled; the compiled function is not.
	        return self.abstract

	    def __setstate__(self, abstract: list[tuple[str, Any]]) -> None:
	        self.abstract = abstract

	    def __call__(self, n: float | decimal.Decimal) -> str:
	        # `_func` is a slot with no default, so `hasattr` doubles as the
	        # "not compiled yet" check; the function is built on first call.
	        if not hasattr(self, '_func'):
	            self._func = to_python(self)
	        return self._func(n)


	def to_javascript(rule: Mapping[str, str] | Iterable[tuple[str, str]] | PluralRule) -> str:
	    """Convert a list/dict of rules or a `PluralRule` object into a JavaScript
	    function.  This function depends on no external library:

	    >>> to_javascript({'one': 'n is 1'})
	    "(function(n) { return (n == 1) ? 'one' : 'other'; })"

	    Implementation detail: The function generated will probably evaluate
	    expressions involved into range operations multiple times.  This has the
	    advantage that external helper functions are not required and is not a
	    big performance hit for these simple calculations.

	    :param rule: the rules as list or dict, or a `PluralRule` object
	    :raise RuleError: if the expression is malformed
	    """
	    to_js = _JavaScriptCompiler().compile
	    result = ['(function(n) { return ']
	    # Each rule becomes one arm of a chained ternary; the fallback tag closes it.
	    for tag, ast in PluralRule.parse(rule).abstract:
	        result.append(f"{to_js(ast)} ? {tag!r} : ")
	    result.append(f"{_fallback_tag!r}; }})")
	    return ''.join(result)


	def to_python(rule: Mapping[str, str] | Iterable[tuple[str, str]] | PluralRule) -> Callable[[float | decimal.Decimal], str]:
	    """Convert a list/dict of rules or a `PluralRule` object into a regular
	    Python function.  This is useful in situations where you need a real
	    function and don't care about the actual rule object:

	    >>> func = to_python({'one': 'n is 1', 'few': 'n in 2..4'})
	    >>> func(1)
	    'one'
	    >>> func(3)
	    'few'
	    >>> func = to_python({'one': 'n in 1,11', 'few': 'n in 3..10,13..19'})
	    >>> func(11)
	    'one'
	    >>> func(15)
	    'few'

	    :param rule: the rules as list or dict, or a `PluralRule` object
	    :raise RuleError: if the expression is malformed
	    """
	    # Names the generated source is allowed to reference.
	    namespace = {
	        'IN': in_range_list,
	        'WITHIN': within_range_list,
	        'MOD': cldr_modulo,
	        'extract_operands': extract_operands,
	    }
	    to_python_func = _PythonCompiler().compile
	    result = [
	        'def evaluate(n):',
	        ' n, i, v, w, f, t, c, e = extract_operands(n)',
	    ]
	    for tag, ast in PluralRule.parse(rule).abstract:
	        # the str() call is to coerce the tag to the native string.  It's
	        # a limited ascii restricted set of tags anyways so that is fine.
	        result.append(f" if ({to_python_func(ast)}): return {str(tag)!r}")
	    result.append(f" return {_fallback_tag!r}")
	    # NOTE: the source executed here is produced by the compiler from the
	    # parsed AST, never copied verbatim from the rule text.
	    code = compile('\n'.join(result), '<rule>', 'exec')
	    exec(code, namespace)
	    return namespace['evaluate']


	def to_gettext(rule: Mapping[str, str] | Iterable[tuple[str, str]] | PluralRule) -> str:
	    """The plural rule as gettext expression.  The gettext expression is
	    technically limited to integers and returns indices rather than tags.

	    >>> to_gettext({'one': 'n is 1', 'two': 'n is 2'})
	    'nplurals=3; plural=((n == 1) ? 0 : (n == 2) ? 1 : 2);'

	    :param rule: the rules as list or dict, or a `PluralRule` object
	    :raise RuleError: if the expression is malformed
	    """
	    rule = PluralRule.parse(rule)

	    # The fallback tag always takes part, even without an explicit rule.
	    used_tags = rule.tags | {_fallback_tag}
	    _compile = _GettextCompiler().compile
	    # Indices follow the canonical tag order, restricted to the tags in use.
	    _get_index = [tag for tag in _plural_tags if tag in used_tags].index

	    result = [f"nplurals={len(used_tags)}; plural=("]
	    for tag, ast in rule.abstract:
	        result.append(f"{_compile(ast)} ? {_get_index(tag)} : ")
	    result.append(f"{_get_index(_fallback_tag)});")
	    return ''.join(result)


	def in_range_list(num: float | decimal.Decimal, range_list: Iterable[Iterable[float | decimal.Decimal]]) -> bool:
	    """Integer range list test.  This is the callback for the "in" operator
	    of the UTS #35 pluralization rule language:

	    >>> in_range_list(1, [(1, 3)])
	    True
	    >>> in_range_list(3, [(1, 3)])
	    True
	    >>> in_range_list(3, [(1, 3), (5, 8)])
	    True
	    >>> in_range_list(1.2, [(1, 4)])
	    False
	    >>> in_range_list(10, [(1, 4)])
	    False
	    >>> in_range_list(10, [(1, 4), (6, 8)])
	    False

	    :param num: the number to test
	    :param range_list: an iterable of inclusive ``(min, max)`` pairs
	    :return: `True` if `num` has no fractional part and lies in any range
	    """
	    # Values with a fractional part never match "in"; the range check is
	    # shared with the "within" operator.
	    return num == int(num) and within_range_list(num, range_list)


	def within_range_list(num: float | decimal.Decimal, range_list: Iterable[Iterable[float | decimal.Decimal]]) -> bool:
	    """Float range test.  This is the callback for the "within" operator
	    of the UTS #35 pluralization rule language:

	    >>> within_range_list(1, [(1, 3)])
	    True
	    >>> within_range_list(1.0, [(1, 3)])
	    True
	    >>> within_range_list(1.2, [(1, 4)])
	    True
	    >>> within_range_list(8.8, [(1, 4), (7, 15)])
	    True
	    >>> within_range_list(10, [(1, 4)])
	    False
	    >>> within_range_list(10.5, [(1, 4), (20, 30)])
	    False

	    :param num: the number to test
	    :param range_list: an iterable of inclusive ``(min, max)`` pairs
	    :return: `True` if `num` lies inside at least one of the ranges
	    """
	    return any(min_ <= num <= max_ for min_, max_ in range_list)


	def cldr_modulo(a: float, b: float) -> float:
	    """Javaish modulo.  The result carries the sign of the dividend, whereas
	    Python's ``%`` follows the sign of the divisor:

	    >>> cldr_modulo(-3, 5)
	    -3
	    >>> cldr_modulo(-3, -5)
	    -3
	    >>> cldr_modulo(3, 5)
	    3
	    """
	    dividend_negative = a < 0
	    # Work on magnitudes, then put the dividend's sign back on the result.
	    dividend = -a if dividend_negative else a
	    divisor = -b if b < 0 else b
	    remainder = dividend % divisor
	    return -remainder if dividend_negative else remainder


	class RuleError(Exception):
	    """Raised if a rule is malformed.

	    The tokenizer and the parser in this module raise it when a plural rule
	    string cannot be tokenized or does not follow the rule grammar.
	    """


	# Operand names that may appear on the left-hand side of a relation.
	_VARS = {
	    'n',  # absolute value of the source number.
	    'i',  # integer digits of n.
	    'v',  # number of visible fraction digits in n, with trailing zeros.*
	    'w',  # number of visible fraction digits in n, without trailing zeros.*
	    'f',  # visible fraction digits in n, with trailing zeros.*
	    't',  # visible fraction digits in n, without trailing zeros.*
	    'c',  # compact decimal exponent value: exponent of the power of 10 used in compact decimal formatting.
	    'e',  # currently, synonym for `c`.  however, may be redefined in the future.
	}

	# Tokenizer table, tried in order at each position.  A token type of None
	# means the match is consumed without producing a token (whitespace).
	_RULES: list[tuple[str | None, re.Pattern[str]]] = [
	    (None, re.compile(r'\s+', re.UNICODE)),
	    # sorted() keeps the character class deterministic; sets are unordered.
	    ('word', re.compile(fr'\b(and|or|is|(?:with)?in|not|mod|[{"".join(sorted(_VARS))}])\b')),
	    ('value', re.compile(r'\d+')),
	    ('symbol', re.compile(r'%|,|!=|=')),
	    ('ellipsis', re.compile(r'\.{2,3}|\u2026', re.UNICODE)),  # U+2026: ELLIPSIS
	]


	def tokenize_rule(s: str) -> list[tuple[str, str]]:
	    """Split a plural rule string into ``(type, text)`` tokens.

	    Everything from the first ``@`` onwards is discarded before tokenizing.
	    The list is returned in reverse order, so the next token to consume is
	    always the last element.

	    :param s: the rule expression
	    :return: the tokens, last-to-first
	    :raise RuleError: if no tokenizer pattern matches at some position
	    """
	    text = s.partition('@')[0]
	    tokens: list[tuple[str, str]] = []
	    cursor = 0
	    limit = len(text)
	    while cursor < limit:
	        for kind, pattern in _RULES:
	            found = pattern.match(text, cursor)
	            if found is None:
	                continue
	            cursor = found.end()
	            if kind:
	                tokens.append((kind, found.group()))
	            break
	        else:
	            raise RuleError(f"malformed CLDR pluralization rule.  Got unexpected {text[cursor]!r}")
	    tokens.reverse()
	    return tokens


	def test_next_token(
	tokens: list[tuple[str, str]],
	type_: str,
	value: str \| None = None,
	) -> list[tuple[str, str]] \| bool:
	return tokens and tokens[-1][0] == type_ and \
	(value is None or tokens[-1][1] == value)


	def skip_token(tokens: list[tuple[str, str]], type_: str, value: str | None = None) -> tuple[str, str] | None:
	    """Consume and return the next token if it matches, else return `None`.

	    :param tokens: the remaining tokens; the matched token is popped off
	                   the end
	    :param type_: the token type to look for
	    :param value: if given, the token text must equal it as well
	    :return: the removed ``(type, text)`` token, or `None` if the next token
	             does not match (in which case `tokens` is left untouched)
	    """
	    if test_next_token(tokens, type_, value):
	        return tokens.pop()
	    return None


	def value_node(value: int) -> tuple[Literal['value'], tuple[int]]:
	    """Build the AST node for an integer literal: ``('value', (value,))``."""
	    operands = (value,)
	    return ('value', operands)


	def ident_node(name: str) -> tuple[str, tuple[()]]:
	    """Build the AST node for an operand name: ``(name, ())``."""
	    no_operands = ()
	    return (name, no_operands)


	def range_list_node(
	    range_list: Iterable[Iterable[float | decimal.Decimal]],
	) -> tuple[Literal['range_list'], Iterable[Iterable[float | decimal.Decimal]]]:
	    """Build the AST node for a range list: ``('range_list', range_list)``.

	    :param range_list: the ranges, stored in the node as given
	    """
	    return 'range_list', range_list


	def negate(rv: tuple[Any, ...]) -> tuple[Literal['not'], tuple[tuple[Any, ...]]]:
	    """Wrap an AST node in a ``'not'`` node: ``('not', (rv,))``."""
	    wrapped = (rv,)
	    return ('not', wrapped)


	class _Parser:
	    """Internal parser.  This class can translate a single rule into an abstract
	    tree of tuples. It implements the following grammar::

	        condition     = and_condition ('or' and_condition)*
	                        ('@integer' samples)?
	                        ('@decimal' samples)?
	        and_condition = relation ('and' relation)*
	        relation      = is_relation | in_relation | within_relation
	        is_relation   = expr 'is' ('not')? value
	        in_relation   = expr (('not')? 'in' | '=' | '!=') range_list
	        within_relation = expr ('not')? 'within' range_list
	        expr          = operand (('mod' | '%') value)?
	        operand       = 'n' | 'i' | 'f' | 't' | 'v' | 'w' | 'c' | 'e'
	        range_list    = (range | value) (',' range_list)*
	        value         = digit+
	        digit         = 0|1|2|3|4|5|6|7|8|9
	        range         = value'..'value
	        samples       = sampleRange (',' sampleRange)* (',' ('…'|'...'))?
	        sampleRange   = decimalValue '~' decimalValue
	        decimalValue  = value ('.' value)?

	    - Whitespace can occur between or around any of the above tokens.
	    - Rules should be mutually exclusive; for a given numeric value, only one
	      rule should apply (i.e. the condition should only be true for one of
	      the plural rule elements).
	    - The in and within relations can take comma-separated lists, such as:
	      'n in 3,5,7..15'.
	    - Samples are ignored.

	    The translator parses the expression on instantiation into an attribute
	    called `ast`.
	    """

	    def __init__(self, string):
	        """Tokenize and parse `string`; the result is stored in `self.ast`.

	        :raise RuleError: if the rule is malformed or has trailing tokens
	        """
	        self.tokens = tokenize_rule(string)
	        if not self.tokens:
	            # If the pattern is only samples, it's entirely possible
	            # no stream of tokens whatsoever is generated.
	            self.ast = None
	            return
	        self.ast = self.condition()
	        if self.tokens:
	            raise RuleError(f"Expected end of rule, got {self.tokens[-1][1]!r}")

	    def expect(self, type_, value=None, term=None):
	        """Consume and return the next token, which must match.

	        :param term: description used in the error message; defaults to the
	                     repr of `value`, or of `type_` when no value is given
	        :raise RuleError: if the next token does not match or none is left
	        """
	        token = skip_token(self.tokens, type_, value)
	        if token is not None:
	            return token
	        if term is None:
	            term = repr(type_ if value is None else value)
	        if not self.tokens:
	            raise RuleError(f"expected {term} but end of rule reached")
	        raise RuleError(f"expected {term} but got {self.tokens[-1][1]!r}")

	    def condition(self):
	        """Parse ``and_condition ('or' and_condition)*`` (left-associative)."""
	        op = self.and_condition()
	        while skip_token(self.tokens, 'word', 'or'):
	            op = 'or', (op, self.and_condition())
	        return op

	    def and_condition(self):
	        """Parse ``relation ('and' relation)*`` (left-associative)."""
	        op = self.relation()
	        while skip_token(self.tokens, 'word', 'and'):
	            op = 'and', (op, self.relation())
	        return op

	    def relation(self):
	        """Parse one relation: ``is``, ``in``, ``within``, ``=`` or ``!=``."""
	        left = self.expr()
	        if skip_token(self.tokens, 'word', 'is'):
	            # The optional 'not' has to be consumed before the value is read.
	            op = 'isnot' if skip_token(self.tokens, 'word', 'not') else 'is'
	            return op, (left, self.value())
	        negated = skip_token(self.tokens, 'word', 'not')
	        method = 'in'
	        if skip_token(self.tokens, 'word', 'within'):
	            method = 'within'
	        elif not skip_token(self.tokens, 'word', 'in'):
	            if negated:
	                raise RuleError('Cannot negate operator based rules.')
	            return self.newfangled_relation(left)
	        rv = 'relation', (method, left, self.range_list())
	        return negate(rv) if negated else rv

	    def newfangled_relation(self, left):
	        """Parse the symbolic ``=`` / ``!=`` form; both map onto ``in``."""
	        if skip_token(self.tokens, 'symbol', '='):
	            negated = False
	        elif skip_token(self.tokens, 'symbol', '!='):
	            negated = True
	        else:
	            raise RuleError('Expected "=" or "!=" or legacy relation')
	        rv = 'relation', ('in', left, self.range_list())
	        return negate(rv) if negated else rv

	    def range_or_value(self):
	        """Parse ``value`` or ``value..value`` into a ``(low, high)`` pair.

	        A single value yields the same node for both ends.
	        """
	        left = self.value()
	        if skip_token(self.tokens, 'ellipsis'):
	            return left, self.value()
	        return left, left

	    def range_list(self):
	        """Parse a comma-separated list of ranges or values."""
	        range_list = [self.range_or_value()]
	        while skip_token(self.tokens, 'symbol', ','):
	            range_list.append(self.range_or_value())
	        return range_list_node(range_list)

	    def expr(self):
	        """Parse an operand with an optional ``mod value`` / ``% value``."""
	        word = skip_token(self.tokens, 'word')
	        if word is None or word[1] not in _VARS:
	            raise RuleError('Expected identifier variable')
	        name = word[1]
	        if skip_token(self.tokens, 'word', 'mod'):
	            return 'mod', ((name, ()), self.value())
	        elif skip_token(self.tokens, 'symbol', '%'):
	            return 'mod', ((name, ()), self.value())
	        return ident_node(name)

	    def value(self):
	        """Parse an integer literal into a value node."""
	        return value_node(int(self.expect('value')[1]))


	def _binary_compiler(tmpl):
	    """Compiler factory for the `_Compiler`.

	    Returns a method that compiles both operands with ``self.compile`` and
	    substitutes the two results into `tmpl`.
	    """
	    def compile_binary(self, left, right):
	        lhs = self.compile(left)
	        rhs = self.compile(right)
	        return tmpl % (lhs, rhs)

	    return compile_binary


	def _unary_compiler(tmpl):
	    """Compiler factory for the `_Compiler`.

	    Returns a method that compiles its operand with ``self.compile`` and
	    substitutes the result into `tmpl`.
	    """
	    def compile_unary(self, x):
	        compiled = self.compile(x)
	        return tmpl % compiled

	    return compile_unary


	def compile_zero(x):
	    """Compile any operand to the literal ``'0'``; the argument is ignored."""
	    return '0'


	class _Compiler:
	    """The compilers are able to transform the expressions into multiple
	    output formats.

	    An AST node is an ``(op, args)`` tuple; `compile` dispatches it to the
	    ``compile_<op>`` method.  The operators defined here use C-style
	    syntax; subclasses override them and supply `compile_relation`.
	    """

	    def compile(self, arg):
	        """Compile one ``(op, args)`` node into its string form."""
	        op, args = arg
	        return getattr(self, f"compile_{op}")(*args)

	    # Operand nodes carry no arguments, so as methods these lambdas receive
	    # only the instance (bound to ``x``).
	    compile_n = lambda x: 'n'
	    compile_i = lambda x: 'i'
	    compile_v = lambda x: 'v'
	    compile_w = lambda x: 'w'
	    compile_f = lambda x: 'f'
	    compile_t = lambda x: 't'
	    compile_c = lambda x: 'c'
	    compile_e = lambda x: 'e'
	    compile_value = lambda x, v: str(v)
	    compile_and = _binary_compiler('(%s && %s)')
	    compile_or = _binary_compiler('(%s || %s)')
	    compile_not = _unary_compiler('(!%s)')
	    compile_mod = _binary_compiler('(%s %% %s)')
	    compile_is = _binary_compiler('(%s == %s)')
	    compile_isnot = _binary_compiler('(%s != %s)')

	    def compile_relation(self, method, expr, range_list):
	        """Compile an ``in``/``within`` relation; provided by subclasses."""
	        raise NotImplementedError()


	class _PythonCompiler(_Compiler):
	    """Compiles an expression to Python."""

	    compile_and = _binary_compiler('(%s and %s)')
	    compile_or = _binary_compiler('(%s or %s)')
	    compile_not = _unary_compiler('(not %s)')
	    compile_mod = _binary_compiler('MOD(%s, %s)')

	    def compile_relation(self, method, expr, range_list):
	        """Render a relation as ``IN(expr, [(lo, hi),...])`` or ``WITHIN(...)``."""
	        pairs = []
	        for low, high in range_list[1]:
	            pairs.append(f"({self.compile(low)}, {self.compile(high)})")
	        helper = method.upper()
	        operand = self.compile(expr)
	        return f"{helper}({operand}, [{','.join(pairs)}])"


	class _GettextCompiler(_Compiler):
	    """Compile into a gettext plural expression."""

	    # A gettext plural expression only has the variable ``n``: ``i`` is
	    # mapped onto it, and the fraction operands compile to zero.
	    compile_i = _Compiler.compile_n
	    compile_v = compile_zero
	    compile_w = compile_zero
	    compile_f = compile_zero
	    compile_t = compile_zero
	    # ``c`` and ``e`` are also compiled to zero so the output never refers
	    # to a variable other than ``n``.
	    compile_c = compile_zero
	    compile_e = compile_zero

	    def compile_relation(self, method, expr, range_list):
	        """Render a relation as ``||``-joined equality and range tests.

	        `method` is not used here: ``in`` and ``within`` compile to the
	        same expression.
	        """
	        compiled_expr = self.compile(expr)
	        clauses = []
	        for low, high in range_list[1]:
	            if low == high:
	                # Single value: a plain equality test is enough.
	                clauses.append(f"({compiled_expr} == {self.compile(low)})")
	            else:
	                lower = self.compile(low)
	                upper = self.compile(high)
	                clauses.append(f"({compiled_expr} >= {lower} && {compiled_expr} <= {upper})")
	        return f"({' || '.join(clauses)})"


	class _JavaScriptCompiler(_GettextCompiler):
	    """Compiles the expression to plain JavaScript."""

	    # XXX: presently javascript does not support any of the
	    # fraction support and basically only deals with integers.
	    def compile_i(self):
	        return 'parseInt(n, 10)'

	    compile_v = compile_zero
	    compile_w = compile_zero
	    compile_f = compile_zero
	    compile_t = compile_zero

	    def compile_relation(self, method, expr, range_list):
	        """Render a relation, adding an integer guard for ``in``."""
	        base = super().compile_relation(method, expr, range_list)
	        if method != 'in':
	            return base
	        operand = self.compile(expr)
	        return f"(parseInt({operand}, 10) == {operand} && {base})"


	class _UnicodeCompiler(_Compiler):
	    """Returns a unicode pluralization rule again."""

	    # XXX: this currently spits out the old syntax instead of the new
	    # one.  We can change that, but it will break a whole bunch of stuff
	    # for users I suppose.

	    compile_is = _binary_compiler('%s is %s')
	    compile_isnot = _binary_compiler('%s is not %s')
	    compile_and = _binary_compiler('%s and %s')
	    compile_or = _binary_compiler('%s or %s')
	    compile_mod = _binary_compiler('%s mod %s')

	    def compile_not(self, relation):
	        """Render a negated relation by re-emitting it with ``not``."""
	        relation_args = relation[1]
	        return self.compile_relation(*relation_args, negated=True)

	    def compile_relation(self, method, expr, range_list, negated=False):
	        """Render ``expr [not] method a,b..c``; single values stay bare."""
	        rendered = [
	            self.compile(low) if low == high else f"{self.compile(low)}..{self.compile(high)}"
	            for low, high in range_list[1]
	        ]
	        negation = ' not' if negated else ''
	        return f"{self.compile(expr)}{negation} {method} {','.join(rendered)}"