Spaces:
Running
Running
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
# Licensed under WTFPL or the Unlicense or CC0. | |
# This uses Python 3, but it's easy to port to Python 2 by changing | |
# strings to u'xx'. | |
import itertools | |
import re | |
def _num2chinese(num: str, big=False, simp=True, o=False, twoalt=False) -> str: | |
"""Convert numerical arabic numbers (0->9) to chinese hanzi numbers (〇 -> 九) | |
Args: | |
num (str): arabic number to convert | |
big (bool, optional): use financial characters. Defaults to False. | |
simp (bool, optional): use simplified characters instead of tradictional characters. Defaults to True. | |
o (bool, optional): use 〇 for 'zero'. Defaults to False. | |
twoalt (bool, optional): use 两/兩 for 'two' when appropriate. Defaults to False. | |
Raises: | |
ValueError: if number is more than 1e48 | |
ValueError: if 'e' exposent in number | |
Returns: | |
str: converted number as hanzi characters | |
""" | |
# check num first | |
nd = str(num) | |
if abs(float(nd)) >= 1e48: | |
raise ValueError("number out of range") | |
if "e" in nd: | |
raise ValueError("scientific notation is not supported") | |
c_symbol = "正负点" if simp else "正負點" | |
if o: # formal | |
twoalt = False | |
if big: | |
c_basic = "零壹贰叁肆伍陆柒捌玖" if simp else "零壹貳參肆伍陸柒捌玖" | |
c_unit1 = "拾佰仟" | |
c_twoalt = "贰" if simp else "貳" | |
else: | |
c_basic = "〇一二三四五六七八九" if o else "零一二三四五六七八九" | |
c_unit1 = "十百千" | |
if twoalt: | |
c_twoalt = "两" if simp else "兩" | |
else: | |
c_twoalt = "二" | |
c_unit2 = "万亿兆京垓秭穰沟涧正载" if simp else "萬億兆京垓秭穰溝澗正載" | |
revuniq = lambda l: "".join(k for k, g in itertools.groupby(reversed(l))) | |
nd = str(num) | |
result = [] | |
if nd[0] == "+": | |
result.append(c_symbol[0]) | |
elif nd[0] == "-": | |
result.append(c_symbol[1]) | |
if "." in nd: | |
integer, remainder = nd.lstrip("+-").split(".") | |
else: | |
integer, remainder = nd.lstrip("+-"), None | |
if int(integer): | |
splitted = [integer[max(i - 4, 0) : i] for i in range(len(integer), 0, -4)] | |
intresult = [] | |
for nu, unit in enumerate(splitted): | |
# special cases | |
if int(unit) == 0: # 0000 | |
intresult.append(c_basic[0]) | |
continue | |
if nu > 0 and int(unit) == 2: # 0002 | |
intresult.append(c_twoalt + c_unit2[nu - 1]) | |
continue | |
ulist = [] | |
unit = unit.zfill(4) | |
for nc, ch in enumerate(reversed(unit)): | |
if ch == "0": | |
if ulist: # ???0 | |
ulist.append(c_basic[0]) | |
elif nc == 0: | |
ulist.append(c_basic[int(ch)]) | |
elif nc == 1 and ch == "1" and unit[1] == "0": | |
# special case for tens | |
# edit the 'elif' if you don't like | |
# 十四, 三千零十四, 三千三百一十四 | |
ulist.append(c_unit1[0]) | |
elif nc > 1 and ch == "2": | |
ulist.append(c_twoalt + c_unit1[nc - 1]) | |
else: | |
ulist.append(c_basic[int(ch)] + c_unit1[nc - 1]) | |
ustr = revuniq(ulist) | |
if nu == 0: | |
intresult.append(ustr) | |
else: | |
intresult.append(ustr + c_unit2[nu - 1]) | |
result.append(revuniq(intresult).strip(c_basic[0])) | |
else: | |
result.append(c_basic[0]) | |
if remainder: | |
result.append(c_symbol[2]) | |
result.append("".join(c_basic[int(ch)] for ch in remainder)) | |
return "".join(result) | |
def _number_replace(match) -> str:
    """Replacement callback that rewrites a matched number in hanzi.

    Args:
        match (re.Match): regex match whose whole matched text is the number

    Returns:
        str: the matched text converted by ``_num2chinese`` with its
            default options
    """
    return _num2chinese(match.group())
def replace_numbers_to_characters_in_text(text: str) -> str:
    """Rewrite every run of ASCII digits in a text as Chinese numerals.

    Each maximal run of the digits 0-9 is handed to ``_number_replace`` and
    substituted by what it returns; everything else is left as is.

    Args:
        text (str): text that may contain Arabic numbers

    Returns:
        str: the text with each digit run replaced
    """
    digit_run = re.compile(r"[0-9]+")
    return digit_run.sub(_number_replace, text)