Spaces:
Sleeping
Sleeping
""" | |
This is a python implementation of wcwidth() and wcswidth(). | |
https://github.com/jquast/wcwidth | |
from Markus Kuhn's C code, retrieved from: | |
http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c | |
This is an implementation of wcwidth() and wcswidth() (defined in | |
IEEE Std 1002.1-2001) for Unicode. | |
http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html | |
http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html | |
In fixed-width output devices, Latin characters all occupy a single | |
"cell" position of equal width, whereas ideographic CJK characters | |
occupy two such cells. Interoperability between terminal-line | |
applications and (teletype-style) character terminals using the | |
UTF-8 encoding requires agreement on which character should advance | |
the cursor by how many cell positions. No established formal | |
standards exist at present on which Unicode character shall occupy | |
how many cell positions on character terminals. These routines are | |
a first attempt of defining such behavior based on simple rules | |
applied to data provided by the Unicode Consortium. | |
For some graphical characters, the Unicode standard explicitly | |
defines a character-cell width via the definition of the East Asian | |
FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes. | |
In all these cases, there is no ambiguity about which width a | |
terminal shall use. For characters in the East Asian Ambiguous (A) | |
class, the width choice depends purely on a preference of backward | |
compatibility with either historic CJK or Western practice. | |
Choosing single-width for these characters is easy to justify as | |
the appropriate long-term solution, as the CJK practice of | |
displaying these characters as double-width comes from historic | |
implementation simplicity (8-bit encoded characters were displayed | |
single-width and 16-bit ones double-width, even for Greek, | |
Cyrillic, etc.) and not any typographic considerations. | |
Much less clear is the choice of width for the Not East Asian | |
(Neutral) class. Existing practice does not dictate a width for any | |
of these characters. It would nevertheless make sense | |
typographically to allocate two character cells to characters such | |
as for instance EM SPACE or VOLUME INTEGRAL, which cannot be | |
represented adequately with a single-width glyph. The following | |
routines at present merely assign a single-cell width to all | |
neutral characters, in the interest of simplicity. This is not | |
entirely satisfactory and should be reconsidered before | |
establishing a formal standard in this area. At the moment, the | |
decision which Not East Asian (Neutral) characters should be | |
represented by double-width glyphs cannot yet be answered by | |
applying a simple rule from the Unicode database content. Setting | |
up a proper standard for the behavior of UTF-8 character terminals | |
will require a careful analysis not only of each Unicode character, | |
but also of each presentation form, something the author of these | |
routines has avoided to do so far. | |
http://www.unicode.org/unicode/reports/tr11/ | |
Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c | |
""" | |
from __future__ import division | |
# std imports | |
import os | |
import sys | |
import warnings | |
# local | |
from .table_vs16 import VS16_NARROW_TO_WIDE | |
from .table_wide import WIDE_EASTASIAN | |
from .table_zero import ZERO_WIDTH | |
from .unicode_versions import list_versions | |
try: | |
# std imports | |
from functools import lru_cache | |
except ImportError: | |
# lru_cache was added in Python 3.2 | |
# 3rd party | |
from backports.functools_lru_cache import lru_cache | |
# global cache | |
_PY3 = sys.version_info[0] >= 3 | |
def _bisearch(ucs, table): | |
""" | |
Auxiliary function for binary search in interval table. | |
:arg int ucs: Ordinal value of unicode character. | |
:arg list table: List of starting and ending ranges of ordinal values, | |
in form of ``[(start, end), ...]``. | |
:rtype: int | |
:returns: 1 if ordinal value ucs is found within lookup table, else 0. | |
""" | |
lbound = 0 | |
ubound = len(table) - 1 | |
if ucs < table[0][0] or ucs > table[ubound][1]: | |
return 0 | |
while ubound >= lbound: | |
mid = (lbound + ubound) // 2 | |
if ucs > table[mid][1]: | |
lbound = mid + 1 | |
elif ucs < table[mid][0]: | |
ubound = mid - 1 | |
else: | |
return 1 | |
return 0 | |
def wcwidth(wc, unicode_version='auto'): | |
r""" | |
Given one Unicode character, return its printable length on a terminal. | |
:param str wc: A single Unicode character. | |
:param str unicode_version: A Unicode version number, such as | |
``'6.0.0'``. A list of version levels suported by wcwidth | |
is returned by :func:`list_versions`. | |
Any version string may be specified without error -- the nearest | |
matching version is selected. When ``latest`` (default), the | |
highest Unicode version level is used. | |
:return: The width, in cells, necessary to display the character of | |
Unicode string character, ``wc``. Returns 0 if the ``wc`` argument has | |
no printable effect on a terminal (such as NUL '\0'), -1 if ``wc`` is | |
not printable, or has an indeterminate effect on the terminal, such as | |
a control character. Otherwise, the number of column positions the | |
character occupies on a graphic terminal (1 or 2) is returned. | |
:rtype: int | |
See :ref:`Specification` for details of cell measurement. | |
""" | |
ucs = ord(wc) if wc else 0 | |
# small optimization: early return of 1 for printable ASCII, this provides | |
# approximately 40% performance improvement for mostly-ascii documents, with | |
# less than 1% impact to others. | |
if 32 <= ucs < 0x7f: | |
return 1 | |
# C0/C1 control characters are -1 for compatibility with POSIX-like calls | |
if ucs and ucs < 32 or 0x07F <= ucs < 0x0A0: | |
return -1 | |
_unicode_version = _wcmatch_version(unicode_version) | |
# Zero width | |
if _bisearch(ucs, ZERO_WIDTH[_unicode_version]): | |
return 0 | |
# 1 or 2 width | |
return 1 + _bisearch(ucs, WIDE_EASTASIAN[_unicode_version]) | |
def wcswidth(pwcs, n=None, unicode_version='auto'): | |
""" | |
Given a unicode string, return its printable length on a terminal. | |
:param str pwcs: Measure width of given unicode string. | |
:param int n: When ``n`` is None (default), return the length of the entire | |
string, otherwise only the first ``n`` characters are measured. This | |
argument exists only for compatibility with the C POSIX function | |
signature. It is suggested instead to use python's string slicing | |
capability, ``wcswidth(pwcs[:n])`` | |
:param str unicode_version: An explicit definition of the unicode version | |
level to use for determination, may be ``auto`` (default), which uses | |
the Environment Variable, ``UNICODE_VERSION`` if defined, or the latest | |
available unicode version, otherwise. | |
:rtype: int | |
:returns: The width, in cells, needed to display the first ``n`` characters | |
of the unicode string ``pwcs``. Returns ``-1`` for C0 and C1 control | |
characters! | |
See :ref:`Specification` for details of cell measurement. | |
""" | |
# this 'n' argument is a holdover for POSIX function | |
_unicode_version = None | |
end = len(pwcs) if n is None else n | |
width = 0 | |
idx = 0 | |
last_measured_char = None | |
while idx < end: | |
char = pwcs[idx] | |
if char == u'\u200D': | |
# Zero Width Joiner, do not measure this or next character | |
idx += 2 | |
continue | |
if char == u'\uFE0F' and last_measured_char: | |
# on variation selector 16 (VS16) following another character, | |
# conditionally add '1' to the measured width if that character is | |
# known to be converted from narrow to wide by the VS16 character. | |
if _unicode_version is None: | |
_unicode_version = _wcversion_value(_wcmatch_version(unicode_version)) | |
if _unicode_version >= (9, 0, 0): | |
width += _bisearch(ord(last_measured_char), VS16_NARROW_TO_WIDE["9.0.0"]) | |
last_measured_char = None | |
idx += 1 | |
continue | |
# measure character at current index | |
wcw = wcwidth(char, unicode_version) | |
if wcw < 0: | |
# early return -1 on C0 and C1 control characters | |
return wcw | |
if wcw > 0: | |
# track last character measured to contain a cell, so that | |
# subsequent VS-16 modifiers may be understood | |
last_measured_char = char | |
width += wcw | |
idx += 1 | |
return width | |
def _wcversion_value(ver_string): | |
""" | |
Integer-mapped value of given dotted version string. | |
:param str ver_string: Unicode version string, of form ``n.n.n``. | |
:rtype: tuple(int) | |
:returns: tuple of digit tuples, ``tuple(int, [...])``. | |
""" | |
retval = tuple(map(int, (ver_string.split('.')))) | |
return retval | |
def _wcmatch_version(given_version): | |
""" | |
Return nearest matching supported Unicode version level. | |
If an exact match is not determined, the nearest lowest version level is | |
returned after a warning is emitted. For example, given supported levels | |
``4.1.0`` and ``5.0.0``, and a version string of ``4.9.9``, then ``4.1.0`` | |
is selected and returned: | |
>>> _wcmatch_version('4.9.9') | |
'4.1.0' | |
>>> _wcmatch_version('8.0') | |
'8.0.0' | |
>>> _wcmatch_version('1') | |
'4.1.0' | |
:param str given_version: given version for compare, may be ``auto`` | |
(default), to select Unicode Version from Environment Variable, | |
``UNICODE_VERSION``. If the environment variable is not set, then the | |
latest is used. | |
:rtype: str | |
:returns: unicode string, or non-unicode ``str`` type for python 2 | |
when given ``version`` is also type ``str``. | |
""" | |
# Design note: the choice to return the same type that is given certainly | |
# complicates it for python 2 str-type, but allows us to define an api that | |
# uses 'string-type' for unicode version level definitions, so all of our | |
# example code works with all versions of python. | |
# | |
# That, along with the string-to-numeric and comparisons of earliest, | |
# latest, matching, or nearest, greatly complicates this function. | |
# Performance is somewhat curbed by memoization. | |
_return_str = not _PY3 and isinstance(given_version, str) | |
if _return_str: | |
# avoid list-comprehension to work around a coverage issue: | |
# https://github.com/nedbat/coveragepy/issues/753 | |
unicode_versions = list(map(lambda ucs: ucs.encode(), list_versions())) | |
else: | |
unicode_versions = list_versions() | |
latest_version = unicode_versions[-1] | |
if given_version in (u'auto', 'auto'): | |
given_version = os.environ.get( | |
'UNICODE_VERSION', | |
'latest' if not _return_str else latest_version.encode()) | |
if given_version in (u'latest', 'latest'): | |
# default match, when given as 'latest', use the most latest unicode | |
# version specification level supported. | |
return latest_version if not _return_str else latest_version.encode() | |
if given_version in unicode_versions: | |
# exact match, downstream has specified an explicit matching version | |
# matching any value of list_versions(). | |
return given_version if not _return_str else given_version.encode() | |
# The user's version is not supported by ours. We return the newest unicode | |
# version level that we support below their given value. | |
try: | |
cmp_given = _wcversion_value(given_version) | |
except ValueError: | |
# submitted value raises ValueError in int(), warn and use latest. | |
warnings.warn("UNICODE_VERSION value, {given_version!r}, is invalid. " | |
"Value should be in form of `integer[.]+', the latest " | |
"supported unicode version {latest_version!r} has been " | |
"inferred.".format(given_version=given_version, | |
latest_version=latest_version)) | |
return latest_version if not _return_str else latest_version.encode() | |
# given version is less than any available version, return earliest | |
# version. | |
earliest_version = unicode_versions[0] | |
cmp_earliest_version = _wcversion_value(earliest_version) | |
if cmp_given <= cmp_earliest_version: | |
# this probably isn't what you wanted, the oldest wcwidth.c you will | |
# find in the wild is likely version 5 or 6, which we both support, | |
# but it's better than not saying anything at all. | |
warnings.warn("UNICODE_VERSION value, {given_version!r}, is lower " | |
"than any available unicode version. Returning lowest " | |
"version level, {earliest_version!r}".format( | |
given_version=given_version, | |
earliest_version=earliest_version)) | |
return earliest_version if not _return_str else earliest_version.encode() | |
# create list of versions which are less than our equal to given version, | |
# and return the tail value, which is the highest level we may support, | |
# or the latest value we support, when completely unmatched or higher | |
# than any supported version. | |
# | |
# function will never complete, always returns. | |
for idx, unicode_version in enumerate(unicode_versions): | |
# look ahead to next value | |
try: | |
cmp_next_version = _wcversion_value(unicode_versions[idx + 1]) | |
except IndexError: | |
# at end of list, return latest version | |
return latest_version if not _return_str else latest_version.encode() | |
# Maybe our given version has less parts, as in tuple(8, 0), than the | |
# next compare version tuple(8, 0, 0). Test for an exact match by | |
# comparison of only the leading dotted piece(s): (8, 0) == (8, 0). | |
if cmp_given == cmp_next_version[:len(cmp_given)]: | |
return unicode_versions[idx + 1] | |
# Or, if any next value is greater than our given support level | |
# version, return the current value in index. Even though it must | |
# be less than the given value, its our closest possible match. That | |
# is, 4.1 is returned for given 4.9.9, where 4.1 and 5.0 are available. | |
if cmp_next_version > cmp_given: | |
return unicode_version | |
assert False, ("Code path unreachable", given_version, unicode_versions) # pragma: no cover | |