test-rtechs's picture
Upload 68 files
607e564 verified
raw
history blame
122 kB
#!/usr/bin/env python
"""
Written by Ulf Hermjakob, USC/ISI March-April 2024
uroman is a universal romanizer. It converts text in any script to the Latin alphabet.
This script is a Python reimplementation of an earlier Perl script, with some improvements.
The tool has been tested on 250 languages, with 100 or more sentences each.
This script is still under development and large-scale testing. Feedback welcome.
This script provides token-size caching (for faster runtimes).
Output formats include
(1) best romanization string
(2) best romanization edges ("best path"; incl. start and end positions with respect to the original string)
(3) best romanization with alternatives (as applicable for ambiguous romanization)
(4) best romanization full lattice (all edges, including superseded sub-edges)
See below for 'sample calls' under main()
"""
from __future__ import annotations
import argparse
from collections import defaultdict
# from memory_profiler import profile
import datetime
from enum import Enum
from fractions import Fraction
import gc
import json
import math
import os
import pathlib
from pathlib import Path
import pstats
import regex
import sys
from typing import List, Tuple
import unicodedata as ud
# Command-line flag that switches on profiling support.
PROFILE_FLAG = "--profile"  # also used in argparse processing
if PROFILE_FLAG in sys.argv:
    # cProfile is imported only when the profiling flag is present on the command line.
    import cProfile
# UTILITIES
def timer(func):
    """Decorator that prints the start time, end time and duration of every call to func.

    The wrapped function's result is returned unchanged. The wrapper keeps the metadata
    (__name__, __doc__, ...) of the decorated function.
    """
    import functools  # local import: only needed when a function gets decorated

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = datetime.datetime.now()
        print(f"Calling: {func.__name__}{args}")
        print(f"Start time: {start_time:%A, %B %d, %Y at %H:%M}")
        result = func(*args, **kwargs)
        end_time = datetime.datetime.now()
        time_diff = (end_time-start_time).total_seconds()
        print(f"End time: {end_time:%A, %B %d, %Y at %H:%M}")
        print(f"Duration: {time_diff} seconds")
        return result
    return wrapper
def slot_value_in_double_colon_del_list(line: str, slot: str, default: str | list | None = None) -> str | list | None:
    """Return the value that follows '::<slot>' in a double-colon delimited line.

    Example: for the line '::s1 of course ::s2 ::cost 0.3' the slot 'cost' yields '0.3',
    the slot 's1' yields 'of course' and the slot 's2' yields the empty string.
    If the slot does not occur in the line, default is returned.
    """
    slot_pattern = fr'(?:.*\s)?::{slot}(|\s+\S.*?)(?:\s+::\S.*|\s*)$'
    found = regex.match(slot_pattern, line)
    if found is None:
        return default
    # The captured value still carries its leading whitespace.
    return found.group(1).strip()
def has_value_in_double_colon_del_list(line: str, slot: str) -> bool:
    """Return True if the slot occurs in the line (even with an empty value), False otherwise."""
    slot_value = slot_value_in_double_colon_del_list(line, slot)
    # A missing slot yields the default None; a present slot always yields a string.
    return isinstance(slot_value, str)
def dequote_string(s: str) -> str:
    """Strip one pair of matching quotes ('...', "..." or “...”) plus surrounding whitespace.

    Anything that is not a string, or not wrapped in a matching quote pair, is returned unchanged.
    """
    if not isinstance(s, str):
        return s
    quoted = regex.match(r'''\s*(['"“])(.*)(['"”])\s*$''', s)
    if quoted is None:
        return s
    opening, inner, closing = quoted.groups()
    if opening + closing in ("''", '""', '“”'):
        return inner
    return s
def last_chr(s: str) -> str:
    """Return the last character of s, or the empty string if s is empty."""
    # The original 'else' branch evaluated '' without returning it, so None leaked out.
    return s[-1] if s else ''
def ud_numeric(char: str) -> int | float | None:
    """Robust version of ud.numeric: return the numeric value of char, or None if there is none.

    Whole-number values are returned as int, all others as float.
    """
    try:
        numeric_value = ud.numeric(char)
    except (ValueError, TypeError):
        return None
    if numeric_value.is_integer():
        return int(numeric_value)
    return numeric_value
def robust_str_to_num(num_s: str, filename: str = None, line_number: int | None = None, silent: bool = False) \
        -> int | float | None:
    """Convert num_s to a number without raising.

    Strings containing a '.' are parsed as float, other strings as int. Arguments that already
    are int or float are returned as is. Everything else yields None. For strings that cannot be
    parsed, a message (with optional line number and filename) is written to stderr unless silent.
    """
    if isinstance(num_s, (float, int)):
        return num_s
    if not isinstance(num_s, str):
        return None
    try:
        return float(num_s) if "." in num_s else int(num_s)
    except ValueError:
        pass
    if not silent:
        message_parts = [f'Cannot convert "{num_s}" to a number']
        if line_number:
            message_parts.append(f' line: {line_number}')
        if filename:
            message_parts.append(f' file: {filename}')
        message_parts.append('\n')
        sys.stderr.write(''.join(message_parts))
    return None
def first_non_none(*args):
    """Return the first argument that is not None, or None if there is no such argument."""
    return next((candidate for candidate in args if candidate is not None), None)
def any_not_none(*args) -> bool:
    """Return True if at least one argument is not None (falsy values such as 0 or False count)."""
    return any(candidate is not None for candidate in args)
def add_non_none_to_dict(d: dict, key: str, value) -> None:
    """Store value under key in d, unless value is None (in which case d is left untouched)."""
    if value is None:
        return
    d[key] = value
def fraction_char2fraction(fraction_char: str, fraction_value: float | None = None,
                           uroman: Uroman | None = None) -> Fraction | None:
    """Map a fraction character such as '½' to a Fraction.

    First the Unicode decomposition of the character is consulted ('<fraction>1⁄2').
    If that does not yield a fraction and both uroman and fraction_value are provided,
    uroman.unicode_float2fraction(fraction_value) is used as fallback.
    Returns None if no fraction can be determined.
    """
    # Rebuild the decomposition as text: hex code points become characters, tags are kept as is.
    decomp_pieces = []
    for decomp_elem in ud.decomposition(fraction_char).split():
        try:
            decomp_pieces.append(chr(int(decomp_elem, 16)))
        except ValueError:
            decomp_pieces.append(decomp_elem)
    decomp_text = ''.join(decomp_pieces)

    result = None
    fraction_match = regex.match(r'<fraction>(\d+)⁄(\d+)$', decomp_text)
    if fraction_match:
        try:
            result = Fraction(int(fraction_match.group(1)), int(fraction_match.group(2)))
        except ValueError:
            result = None
    if (result is None) and uroman and fraction_value:
        num_denom = uroman.unicode_float2fraction(fraction_value)
        if num_denom:
            try:
                result = Fraction(num_denom[0], num_denom[1])
            except ValueError:
                result = None
    return result
def chr_name(char: str) -> str:
    """Robust version of ud.name: return the Unicode name of char, or '' if it has none.

    See the related Uroman.chr_name() which additionally covers names not included in UnicodeData.txt
    """
    name = ''
    try:
        name = ud.name(char)
    except (ValueError, TypeError):
        pass
    return name
def args_get(key: str, args: argparse.Namespace | None = None):
    """Return the value of key in the argparse namespace, or None if args or the key is missing."""
    if args and (key in args):
        return vars(args)[key]
    return None
class DictClass:
    """Lightweight attribute container that is filled from keyword arguments.

    Underscores in keyword names are replaced by hyphens (use_only_at_start_of_word is stored
    as 'use-only-at-start-of-word'). Values equal to None, [] or False are not stored at all;
    note that 0 compares equal to False and is therefore dropped as well.
    Items are read with obj[key], which yields None for keys that were not stored.
    An instance is truthy if it stores at least one item.
    """

    def __init__(self, **kw_args):
        for arg_name, value in kw_args.items():
            if value in (None, [], False):
                continue  # empty values are not stored
            self.__dict__[arg_name.replace('_', '-')] = value

    def __repr__(self):
        return str(self.__dict__)

    def __getitem__(self, key, default=None):
        return self.__dict__.get(key, default)

    def __bool__(self):
        return bool(self.__dict__)
class RomRule(DictClass):
    # A single romanization rule. All behavior is inherited from DictClass, so attribute names
    # are stored with hyphens instead of underscores (e.g. 't-alts') and empty values are omitted.
    # key: source string
    # typical attributes: s (source), t (target), prov (provenance), lcodes (language codes)
    #     t_alts=t_alts (target alternatives), use_only_at_start_of_word, dont_use_at_start_of_word,
    #     use_only_at_end_of_word, dont_use_at_end_of_word, use_only_for_whole_word
    pass
class Script(DictClass):
    # Description of a script. All behavior is inherited from DictClass, so attribute names
    # are stored with hyphens instead of underscores (e.g. 'script-name') and empty values are omitted.
    # key: lower case script_name
    # typical attributes: script_name, direction, abugida_default_vowels, alt_script_names, languages
    pass
class RomFormat(Enum):
    """Output format of romanization

    STR:     simple string
    EDGES:   list of edges (includes character offsets in original string)
    ALTS:    lattice including alternative edges
    LATTICE: lattice including alternative and superseded edges
    """
    STR = 'str'
    EDGES = 'edges'
    ALTS = 'alts'
    LATTICE = 'lattice'

    def __str__(self):
        # Render as the plain value (e.g. 'edges'), not as 'RomFormat.EDGES'.
        return str(self.value)
class Uroman:
"""This class loads and maintains uroman data independent of any specific text corpus.
Typically, only a single instance will be used. (In contrast to multiple lattice instances, one per text.)
Methods include some testing. And finally methods to romanize a string (romanize_string()) or an entire file
(romanize_file())."""
def __init__(self, data_dir: Path, **args): # args: load_log, rebuild_ud_props
self.data_dir = data_dir
self.rom_rules = defaultdict(list)
self.scripts = defaultdict(Script)
self.dict_bool = defaultdict(bool)
self.dict_str = defaultdict(str)
self.dict_int = defaultdict(int)
self.dict_num = defaultdict(lambda: None) # values are int (most common), float, or str ("1/2")
# num_props key: txt
# values: {"txt": "\u137b", "rom": "100", "value": 100, "type": "base", "mult": 1, "script": "Ethiopic"}
self.num_props = defaultdict(dict)
self.dict_set = defaultdict(set)
self.float2fraction = {} # caching
gc.disable()
self.load_resource_files(data_dir, args.get('load_log', False),
args.get('rebuild_ud_props', False),
args.get('rebuild_num_props', False))
gc.enable()
self.hangul_rom = {}
self.rom_cache = {} # key: (s, lcode) value: t
self.stats = defaultdict(int) # stats, e.g. for unprocessed numbers
self.abugida_cache = {} # key: (script, char_rom) value: (base_rom, base_rom_plus_abugida_vowel, modified rom)
def second_rom_filter(self, c: str, rom: str, name: str | None) -> Tuple[str | None, str]:
"""Much of this code will eventually move the old Perl code to generate cleaner primary data"""
if rom and (' ' in rom):
if name is None:
name = self.chr_name(c)
if "MYANMAR VOWEL SIGN KAYAH" in name:
if m := regex.search(r'kayah\s+(\S+)\s*$', rom):
return m.group(1), name
if "MENDE KIKAKUI SYLLABLE" in name:
if m := regex.search(r'm\d+\s+(\S+)\s*$', rom):
return m.group(1), name
if regex.search(r'\S\s+\S', rom):
return c, name
return None, name
def load_rom_file(self, filename: str, provenance: str, file_format: str = None, load_log: bool = True):
    """Reads in and processes the 3 main romanization data files: (1) romanization-auto-table.txt
    which was automatically generated from UnicodeData.txt (2) UnicodeDataOverwrite.txt that "corrects"
    some entries in romanization-auto-table.txt and (3) romanization-table.txt which was largely manually
    created and allows complex romanization rules, some for specific languages, some for specific contexts.

    filename:    path of the data file (read as UTF-8); a missing file is reported on stderr and skipped
    provenance:  tag stored in each rule's 'prov' attribute
    file_format: 'u2r' for lines keyed by hex code point (::u ... ::r ...),
                 anything else for lines keyed by source string (::s ... ::t ...)
    load_log:    if True, the number of loaded entries is reported on stderr
    After the file has been read, Thai cancellation-mark rules are added where no rule exists yet."""
    n_entries = 0
    try:
        f = open(filename, encoding='utf-8')  # data files contain many non-ASCII characters
    except FileNotFoundError:
        sys.stderr.write(f'Cannot open file {filename}\n')
        return
    with f:
        for line_number, line in enumerate(f, 1):
            if line.startswith('#'):
                continue
            if regex.match(r'^\s*$', line):  # blank line
                continue
            line = regex.sub(r'\s{2,}#.*$', '', line)
            if file_format == 'u2r':
                t_at_end_of_syllable = None
                u = dequote_string(slot_value_in_double_colon_del_list(line, 'u'))
                try:
                    s = chr(int(u, 16))
                except (ValueError, TypeError):
                    # ValueError: not a valid hex code point; TypeError: line has no ::u slot (u is None)
                    continue
                t = dequote_string(slot_value_in_double_colon_del_list(line, 'r'))
                if name := slot_value_in_double_colon_del_list(line, 'name'):
                    self.dict_str[('name', s)] = name
                if pic := slot_value_in_double_colon_del_list(line, 'pic'):
                    self.dict_str[('pic', s)] = pic
                if tone_mark := slot_value_in_double_colon_del_list(line, 'tone-mark'):
                    self.dict_str[('tone-mark', s)] = tone_mark
                if syllable_info := slot_value_in_double_colon_del_list(line, 'syllable-info'):
                    self.dict_str[('syllable-info', s)] = syllable_info
            else:
                s = dequote_string(slot_value_in_double_colon_del_list(line, 's'))
                t = dequote_string(slot_value_in_double_colon_del_list(line, 't'))
                t_at_end_of_syllable = dequote_string(slot_value_in_double_colon_del_list(line,
                                                                                          't-end-of-syllable'))
            # The ::num slot is parsed once; values that are not plain numbers (e.g. "1/2")
            # are reported once (with file and line) and kept as string in dict_num.
            num_s = slot_value_in_double_colon_del_list(line, 'num')
            num = robust_str_to_num(num_s, filename, line_number, silent=False)
            if num_s is not None:
                self.dict_num[s] = (num_s if (num is None) else num)
            is_minus_sign = has_value_in_double_colon_del_list(line, 'is-minus-sign')
            is_plus_sign = has_value_in_double_colon_del_list(line, 'is-plus-sign')
            is_decimal_point = has_value_in_double_colon_del_list(line, 'is-decimal-point')
            is_large_power = has_value_in_double_colon_del_list(line, 'is-large-power')
            fraction_connector = slot_value_in_double_colon_del_list(line, 'fraction-connector')
            percentage_marker = slot_value_in_double_colon_del_list(line, 'percentage-marker')
            int_frac_connector = slot_value_in_double_colon_del_list(line, 'int-frac-connector')
            lcode_s = slot_value_in_double_colon_del_list(line, 'lcode')
            lcodes = regex.split(r'[,;]\s*', lcode_s) if lcode_s else []
            use_only_at_start_of_word = has_value_in_double_colon_del_list(line, 'use-only-at-start-of-word')
            dont_use_at_start_of_word = has_value_in_double_colon_del_list(line, 'dont-use-at-start-of-word')
            use_only_at_end_of_word = has_value_in_double_colon_del_list(line, 'use-only-at-end-of-word')
            dont_use_at_end_of_word = has_value_in_double_colon_del_list(line, 'dont-use-at-end-of-word')
            use_only_for_whole_word = has_value_in_double_colon_del_list(line, 'use-only-for-whole-word')
            t_alt_s = slot_value_in_double_colon_del_list(line, 't-alt')
            t_alts = regex.split(r'[,;]\s*', t_alt_s) if t_alt_s else []
            t_alts = list(map(dequote_string, t_alts))
            t_mod, _ = self.second_rom_filter(s, t, None)
            if t_mod and (t_mod != t):
                t = t_mod
            if s is not None:
                # Explicit mapping instead of eval() on variable names.
                bool_flags = {'is-large-power': is_large_power,
                              'is-minus-sign': is_minus_sign,
                              'is-plus-sign': is_plus_sign,
                              'is-decimal-point': is_decimal_point}
                for bool_key, bool_value in bool_flags.items():
                    if bool_value:
                        self.dict_bool[(bool_key, s)] = True
                # NOTE(review): the is_* flags are booleans (never None), so this condition
                # always holds; kept as is to preserve which rules get registered.
                if any_not_none(t, num, is_minus_sign, is_plus_sign, is_decimal_point, is_large_power,
                                fraction_connector, percentage_marker, int_frac_connector):
                    self.register_s_prefix(s)
                    n_entries += 1
                    restrictions = [lcodes, use_only_at_start_of_word, dont_use_at_start_of_word,
                                    use_only_at_end_of_word, dont_use_at_end_of_word, use_only_for_whole_word]
                    n_restrictions = len([restr for restr in restrictions if restr])
                    provenance2 = provenance
                    if (t is None) and (num is not None) and (provenance2 == "rom"):
                        provenance2 = "num"
                    new_rom_rule = RomRule(s=s, t=t, prov=provenance2, lcodes=lcodes, t_alts=t_alts, num=num,
                                           use_only_at_start_of_word=use_only_at_start_of_word,
                                           dont_use_at_start_of_word=dont_use_at_start_of_word,
                                           use_only_at_end_of_word=use_only_at_end_of_word,
                                           dont_use_at_end_of_word=dont_use_at_end_of_word,
                                           use_only_for_whole_word=use_only_for_whole_word,
                                           t_at_end_of_syllable=t_at_end_of_syllable,
                                           n_restr=n_restrictions,
                                           is_minus_sign=is_minus_sign,
                                           is_plus_sign=is_plus_sign,
                                           is_decimal_point=is_decimal_point,
                                           fraction_connector=fraction_connector,
                                           percentage_marker=percentage_marker,
                                           int_frac_connector=int_frac_connector,
                                           is_large_power=is_large_power)
                    old_rom_rules = self.rom_rules[s]
                    # An unrestricted rule replaces a single automatic ('ud') or overwrite ('ow') rule;
                    # otherwise the new rule is added as a further alternative.
                    if ((len(old_rom_rules) == 1) and (old_rom_rules[0]['prov'] in ('ud', 'ow'))
                            and not (lcodes or use_only_at_start_of_word or dont_use_at_start_of_word
                                     or use_only_at_end_of_word or dont_use_at_end_of_word
                                     or use_only_for_whole_word)):
                        self.rom_rules[s] = [new_rom_rule]  # overwrite
                    else:
                        self.rom_rules[s].append(new_rom_rule)
    # Thai
    thai_cancellation_mark = '\u0E4C'
    # cancellation applies to preceding letter incl. any vowel modifier letter (e.g. ศักดิ์สิทธิ์ -> saksit)
    for cp in range(0x0E01, 0x0E4C):  # Thai
        c = chr(cp)
        s = c + thai_cancellation_mark
        new_rom_rule = RomRule(s=s, t='', prov='auto cancel letter')
        if not self.rom_rules[s]:
            self.rom_rules[s] = [new_rom_rule]
            self.register_s_prefix(s)
    thai_consonants = list(map(chr, range(0x0E01, 0x0E2F)))
    thai_vowel_modifiers = ['\u0E31', '\u0E47'] + list(map(chr, range(0x0E33, 0x0E3B)))
    for c1 in thai_consonants:
        for v in thai_vowel_modifiers:
            s = c1 + v + thai_cancellation_mark
            new_rom_rule = RomRule(s=s, t='', prov='auto cancel syllable')
            if not self.rom_rules[s]:
                self.rom_rules[s] = [new_rom_rule]
                self.register_s_prefix(s)
    if load_log:
        sys.stderr.write(f'Loaded {n_entries} from {filename}\n')
def load_script_file(self, filename: str, load_log: bool = True):
    """Reads in (typically from Scripts.txt) information about various scripts such as Devanagari,
    incl. information such as the default abugida vowel letter (e.g. "a").

    Each script is registered in self.scripts under its lower-cased name and under its lower-cased
    alternative names. Duplicate names are reported on stderr and ignored. The file is read as UTF-8;
    a missing file is reported on stderr and skipped."""
    def split_list(list_s):
        """Splits a comma or semicolon separated slot value into a list ([] for a missing value)."""
        return regex.split(r'[,;]\s*', list_s) if list_s else []

    n_entries, max_n_script_name_components = 0, 0
    try:
        f = open(filename, encoding='utf-8')
    except FileNotFoundError:
        sys.stderr.write(f'Cannot open file {filename}\n')
        return
    with f:
        for line_number, line in enumerate(f, 1):
            if line.startswith('#'):
                continue
            if regex.match(r'^\s*$', line):  # blank line
                continue
            line = regex.sub(r'\s{2,}#.*$', '', line)
            script_name = slot_value_in_double_colon_del_list(line, 'script-name')
            if not script_name:
                continue
            lc_script_name = script_name.lower()
            if lc_script_name in self.scripts:
                sys.stderr.write(f'** Ignoring duplicate script "{script_name}" '
                                 f'in line {line_number} of {filename}\n')
                continue
            n_entries += 1
            direction = slot_value_in_double_colon_del_list(line, 'direction')
            abugida_default_vowels = split_list(slot_value_in_double_colon_del_list(line,
                                                                                    'abugida-default-vowel'))
            alt_script_names = split_list(slot_value_in_double_colon_del_list(line, 'alt-script-name'))
            languages = split_list(slot_value_in_double_colon_del_list(line, 'language'))
            new_script = Script(script_name=script_name, alt_script_names=alt_script_names,
                                languages=languages, direction=direction,
                                abugida_default_vowels=abugida_default_vowels)
            self.scripts[lc_script_name] = new_script
            for language in languages:
                self.dict_set[('scripts', language)].add(script_name)
            for alt_script_name in alt_script_names:
                lc_alt_script_name = alt_script_name.lower()
                if lc_alt_script_name in self.scripts:
                    # Report the offending alternative name (not the primary script name).
                    sys.stderr.write(f'** Ignoring duplicate alternative script name "{alt_script_name}" '
                                     f'in line {line_number} of {filename}\n')
                else:
                    self.scripts[lc_alt_script_name] = new_script
            n_script_name_components = len(script_name.split())
            if n_script_name_components > max_n_script_name_components:
                max_n_script_name_components = n_script_name_components
    if max_n_script_name_components:
        self.dict_int['max_n_script_name_components'] = max_n_script_name_components
    if load_log:
        sys.stderr.write(f'Loaded {n_entries} script descriptions from {filename}'
                         f' (max_n_scripts_name_components: {max_n_script_name_components})\n')
def extract_script_name(self, script_name_plus: str, full_char_name: str = None) -> str | None:
"""Using info from Scripts.txt, this script selects the script name from a Unicode,
e.g. given "OLD HUNGARIAN CAPITAL LETTER A", extract "Old Hungarian"."""
if full_char_name and script_name_plus == full_char_name:
return None
while script_name_plus:
if script_name_plus.lower() in self.scripts:
if script := self.scripts[script_name_plus.lower()]:
if script_name := script['script-name']:
return script_name
script_name_plus = regex.sub(r'\s*\S*\s*$', '', script_name_plus)
return None
def load_unicode_data_props(self, filename: str, load_log: bool = True):
    """Loads Unicode derived data from (1) UnicodeDataProps.txt, (2) UnicodeDataPropsHangul.txt
    and UnicodeDataPropsCJK.txt with a list of valid script-specific characters.

    For each line with a ::script-name, the characters listed under ::char and ::numeral are mapped
    to that script, and the characters under ::vowel-sign, ::medial-consonant-sign and ::sign-virama
    are flagged accordingly. The file is read as UTF-8; a missing file is reported on stderr and skipped."""
    n_script, n_script_char, n_script_vowel_sign, n_script_medial_consonant_sign, n_script_virama = 0, 0, 0, 0, 0
    try:
        f = open(filename, encoding='utf-8')  # the file consists mostly of non-ASCII characters
    except FileNotFoundError:
        sys.stderr.write(f'Cannot open file {filename}\n')
        return
    with f:
        for line_number, line in enumerate(f, 1):
            if line.startswith('#'):
                continue
            if regex.match(r'^\s*$', line):  # blank line
                continue
            line = regex.sub(r'\s{2,}#.*$', '', line)
            if script_name := slot_value_in_double_colon_del_list(line, 'script-name'):
                n_script += 1
                for char_slot in ('char', 'numeral'):
                    for char in slot_value_in_double_colon_del_list(line, char_slot, []):
                        self.dict_str[('script', char)] = script_name
                        n_script_char += 1
                for char in slot_value_in_double_colon_del_list(line, 'vowel-sign', []):
                    self.dict_bool[('is-vowel-sign', char)] = True
                    n_script_vowel_sign += 1
                for char in slot_value_in_double_colon_del_list(line, 'medial-consonant-sign', []):
                    self.dict_bool[('is-medial-consonant-sign', char)] = True
                    n_script_medial_consonant_sign += 1
                for char in slot_value_in_double_colon_del_list(line, 'sign-virama', []):
                    self.dict_bool[('is-virama', char)] = True
                    n_script_virama += 1
    if load_log:
        sys.stderr.write(f'Loaded from {filename} mappings of {n_script_char:,d} characters '
                         f'to {n_script} script{"" if n_script == 1 else "s"}')
        if n_script_vowel_sign or n_script_virama or n_script_medial_consonant_sign:
            sys.stderr.write(f', with a total of {n_script_vowel_sign} vowel signs, '
                             f'{n_script_medial_consonant_sign} medial consonant signs '
                             f'and {n_script_virama} viramas')
        sys.stderr.write('.\n')
def load_num_props(self, filename: str, load_log: bool = True):
"""Loads Unicode derived data from (1) UnicodeDataProps.txt, (2) UnicodeDataPropsHangul.txt
and UnicodeDataPropsCJK.txt with a list of valid script-specific characters."""
n_entries = 0
try:
f = open(filename)
except FileNotFoundError:
sys.stderr.write(f'Cannot open file {filename}\n')
return
with f:
for line_number, line in enumerate(f, 1):
if line.startswith('#'):
continue
if regex.match(r'^\s*$', line): # blank line
continue
d = json.loads(line)
if isinstance(d, dict):
if txt := d.get('txt'):
self.num_props[txt] = d
n_entries += 1
else:
sys.stderr.write(f'Missing txt in l.{line_number} in file {filename}: {line.strip()}\n')
for bool_key in ('is-large-power',):
if d.get(bool_key):
self.dict_bool[(bool_key, txt)] = True
else:
sys.stderr.write(f'json in l.{line_number} in file {filename} not a dict: {line.strip()}\n')
if load_log:
sys.stderr.write(f'Loaded {n_entries} entries from {filename}\n')
@staticmethod
def de_accent_pinyin(s: str) -> str:
"""De-accents a string from "liú" to "liu" and "ü" to "u" (to help process file Chinese_to_Pinyin.txt)."""
result = ''
for char in s:
if decomp := ud.decomposition(char).split():
try:
decomp_chars = [chr(int(x, 16)) for x in decomp]
letters = [x for x in decomp_chars if ud.category(x).startswith('L')]
except ValueError:
sys.stderr.write(f'Cannot decode {decomp}\n')
continue
if len(letters) == 1:
result += letters[0]
else:
sys.stderr.write(f'Cannot decode {decomp} (expected 1 letter)\n')
else:
result += char
result = result.replace('ü', 'u')
return result
def register_s_prefix(self, s: str):
for prefix_len in range(1, len(s) + 1):
self.dict_bool[('s-prefix', s[:prefix_len])] = True
def load_chinese_pinyin_file(self, filename: str, load_log: bool = True):
    """Loads file Chinese_to_Pinyin.txt which maps Chinese characters to their Latin form.

    Each non-comment, non-blank line must consist of exactly two whitespace-separated fields
    (Chinese text, pinyin). The de-accented pinyin is added as a romanization rule with
    provenance 'rom pinyin'. Other lines are reported on stderr and skipped.
    The file is read as UTF-8; a missing file is reported on stderr and skipped."""
    n_entries = 0
    try:
        f = open(filename, encoding='utf-8')  # the file consists of Chinese characters and accented pinyin
    except FileNotFoundError:
        sys.stderr.write(f'Cannot open file {filename}\n')
        return
    with f:
        for line_number, line in enumerate(f, 1):
            if line.startswith('#'):
                continue
            if regex.match(r'^\s*$', line):  # blank line
                continue
            try:
                chinese, pinyin = line.rstrip().split()
            except ValueError:  # not exactly two fields
                sys.stderr.write(f'Cannot process line {line_number} in file {filename}: {line}')
                continue
            rom = self.de_accent_pinyin(pinyin)
            new_rom_rule = RomRule(s=chinese, t=rom, prov='rom pinyin', lcodes=[])
            self.rom_rules[chinese].append(new_rom_rule)
            self.register_s_prefix(chinese)
            n_entries += 1
    if load_log:
        # These are pinyin entries, not script descriptions.
        sys.stderr.write(f'Loaded {n_entries} pinyin entries from {filename}\n')
@staticmethod
def add_char_to_rebuild_unicode_data_dict(d: dict, script_name: str, prop_class: str, char: str):
d['script-names'].add(script_name)
key = (script_name, prop_class)
if key in d:
d[key].append(char)
else:
d[key] = [char]
def rebuild_unicode_data_props(self, out_filename: str, cjk: str = None, hangul: str = None):
    """This functions rebuilds UnicodeDataProps*.txt This might be useful when a new UnicodeData.txt
    version is released, or additional information is extracted from Unicode to UnicodeDataProps.txt
    Regular users normally never have to call this function.

    out_filename: output file for all scripts other than CJK and Hangul
    cjk:          output file for script 'CJK' (defaults to out_filename)
    hangul:       output file for script 'Hangul' (defaults to out_filename)
    Output files are written as UTF-8. Files that cannot be opened are reported on stderr and skipped."""
    d = {'script-names': set()}
    n_script_refs = 0
    prop_classes = {'char'}
    # Each entry is a name component or a tuple of synonymous name components;
    # the first component of an entry determines the property class.
    prop_name_groups = ('VOWEL SIGN',
                        ('MEDIAL CONSONANT SIGN', 'CONSONANT SIGN MEDIAL', 'CONSONANT SIGN SHAN MEDIAL',
                         'CONSONANT SIGN MON MEDIAL'),
                        ('SIGN VIRAMA', 'SIGN ASAT', 'AL-LAKUNA', 'SIGN COENG', 'SIGN PAMAAEH',
                         'CHARACTER PHINTHU'),
                        ('NUMERAL', 'NUMBER', 'DIGIT', 'FRACTION'))
    for codepoint in range(0xF0000 + 1):
        c = chr(codepoint)
        if not (char_name := self.chr_name(c)):
            continue
        for prop_name_comp2 in prop_name_groups:
            if prop_name_comp2 and isinstance(prop_name_comp2, tuple):
                prop_list = prop_name_comp2
            else:
                prop_list = (prop_name_comp2,)
            prop_class = prop_list[0].lower().replace(' ', '-')
            for prop_name_comp in prop_list:
                prop_classes.add(prop_class)
                # The script name candidate is what precedes the property name component.
                script_name_cand = regex.sub(fr'\s+{prop_name_comp}\b.*$', '', char_name)
                if script_name := self.extract_script_name(script_name_cand, char_name):
                    self.add_char_to_rebuild_unicode_data_dict(d, script_name, prop_class, c)
        script_name_cand = regex.sub(r'\s+(CONSONANT|LETTER|LIGATURE|SIGN|SYLLABLE|SYLLABICS|VOWEL|'
                                     r'IDEOGRAPH|HIEROGLYPH|POINT|ACCENT|CHARACTER|TIPPI|ADDAK|IRI|URA|'
                                     r'SYMBOL GENITIVE|SYMBOL COMPLETED|SYMBOL LOCATIVE|SYMBOL AFOREMENTIONED|'
                                     r'AU LENGTH MARK)\b.*$', '',
                                     char_name)
        if script_name := self.extract_script_name(script_name_cand, char_name):
            self.add_char_to_rebuild_unicode_data_dict(d, script_name, 'char', c)
            n_script_refs += 1
    prop_classes = sorted(prop_classes)
    out_filenames = [x for x in [out_filename, cjk, hangul] if x]
    cjk2 = cjk if cjk else out_filename
    hangul2 = hangul if hangul else out_filename
    for out_file in out_filenames:
        try:
            # The output consists of arbitrary Unicode characters, so the encoding must not
            # depend on the locale.
            f_out = open(out_file, 'w', encoding='utf-8')
        except OSError:
            sys.stderr.write(f'Cannot write to file {out_file}\n')
            continue
        with f_out:
            for script_name in sorted(d['script-names']):
                # Each script goes to exactly one of the output files.
                if script_name == 'CJK':
                    if out_file != cjk2:
                        continue
                elif script_name == 'Hangul':
                    if out_file != hangul2:
                        continue
                else:
                    if out_file != out_filename:
                        continue
                prop_components = [f"::script-name {script_name}"]
                for prop_class in prop_classes:
                    key = (script_name, prop_class)
                    if key in d:
                        if chars := ''.join(d[key]):
                            if prop_class in ('char',):
                                prop_components.append(f"::n-{prop_class} {len(chars)}")
                            prop_components.append(f"::{prop_class} {chars}")
                f_out.write(f"{' '.join(prop_components)}\n")
    sys.stderr.write(f"Rebuilt {out_filenames} with {n_script_refs} characters "
                     f"for {len(d['script-names'])} scripts.\n")
def rebuild_num_props(self, out_filename: str, err_filename: str):
    """Rebuilds the numeric property file: one JSON dict per line for each character with a numeric value.

    All code points from 0 to 0xF0000 are visited. The numeric value is taken from ud_numeric()
    or, failing that, from self.num_value(). Characters without a numeric value are skipped, as are
    characters whose name marks them as superscript, subscript, circled, etc.
    Entries of type 'other-int' or 'other-num' are written to err_filename, all others to out_filename.
    A summary is written to stderr."""
    n_out, n_err = 0, 0
    with open(out_filename, 'w') as f_out, open(err_filename, 'w') as f_err:
        codepoint = -1
        while codepoint < 0xF0000:
            codepoint += 1
            char = chr(codepoint)
            num = first_non_none(ud_numeric(char),  # robust ud.numeric
                                 self.num_value(char))  # uroman table includes extra num values, e.g. for Egyptian
            if num is None:
                continue
            result_dict = {}
            orig_txt = char
            value: int | float | None = None  # non-fraction-value(3 1/2) = 3
            fraction: Fraction | None = None  # fraction(3 1/2) = Fraction(1, 2)
            num_base = None  # num_base(500) = 100
            base_multiplier = None  # base_multiplier(500) = 5
            script = None
            is_large_power = self.dict_bool[('is-large-power', char)]
            # num_base is typically a power of 10: 1, 10, 100, 1000, 10000, 100000, 1000000, ...
            # exceptions might include 12 for the 'dozen' in popular English 'two dozen and one' (2*12+1=25)
            # exceptions might include 20 for the 'score' in archaic English 'four score and seven' (4*20+7=87)
            # exceptions might include 20 for the 'vingt' as in standard French 'quatre-vingt-treize' (4*20+13=93)
            if script_name := self.chr_script_name(char):
                script = script_name
            elif char in '0123456789':
                script = 'ascii-digit'
            name = self.chr_name(char)
            exclude_from_number_processing = False
            # Characters of these presentation types are excluded from the output altogether.
            for scrypt_type in ('SUPERSCRIPT', 'SUBSCRIPT',
                                'CIRCLED', 'PARENTHESIZED', 'SEGMENTED', 'MATHEMATICAL', 'ROMAN NUMERAL',
                                'FULL STOP', 'COMMA'):
                if scrypt_type in name:
                    script = '*' + scrypt_type.lower().replace(' ', '-')
                    exclude_from_number_processing = True
                    break
            # Vulgar fractions are kept, with 'vulgar-fraction' as their script.
            for scrypt_type in ('VULGAR FRACTION',):
                if scrypt_type in name:
                    script = scrypt_type.lower().replace(' ', '-')
                    break
            if exclude_from_number_processing:
                continue
            if isinstance(num, int):
                value = num
                if 0 <= num <= 9:
                    num_base = 1
                    base_multiplier = num
                    if "DIGIT" in name:
                        num_type = 'digit'
                    else:
                        # Chinese numbers 零 (0), 一 (1), ... 九 (9) have numeric values,
                        # but are NOT (full) digits
                        num_type = 'digit-like'
                elif m := regex.match(r'([0-9]+?)(0*)$', str(num)):
                    # Split the number into leading digits and trailing zeros, e.g. 500 -> 5 and 100
                    base_multiplier = int(m.group(1))  # non_base_value(500) = 5
                    num_base = int('1' + m.group(2))
                    num_type = 'base' if base_multiplier == 1 else 'multi'
                else:
                    num_type = 'other-int'  # Do such cases exist?
            elif ("FRACTION" in name) and (fraction := fraction_char2fraction(char, num, self)):
                fraction = fraction
                num_type = 'fraction'
            else:
                num_type = 'other-num'  # Do such cases exist? Yes. Bengali currency numerators, ...
            value_s = '' if value is None else str(value)
            fraction_s = '' if fraction is None else f'{fraction.numerator}/{fraction.denominator}'
            fraction_list = None if fraction is None else [fraction.numerator, fraction.denominator]
            delimiter_s = ' ' if value_s and fraction_s else ''
            # The romanization falls back to the character itself if there is neither value nor fraction.
            rom = (value_s + delimiter_s + fraction_s) or orig_txt
            add_non_none_to_dict(result_dict, 'txt', orig_txt)
            add_non_none_to_dict(result_dict, 'rom', rom)
            add_non_none_to_dict(result_dict, 'value', value)
            add_non_none_to_dict(result_dict, 'fraction', fraction_list)
            add_non_none_to_dict(result_dict, 'type', num_type)
            if is_large_power:
                result_dict['is-large-power'] = True
            add_non_none_to_dict(result_dict, 'base', num_base)
            add_non_none_to_dict(result_dict, 'mult', base_multiplier)
            add_non_none_to_dict(result_dict, 'script', script)
            if num_type.startswith('other'):
                # Unclassified entries go to the reject file, together with the character name.
                add_non_none_to_dict(result_dict, 'name', name)
                f_err.write(json.dumps(result_dict) + '\n')
                n_err += 1
            else:
                if not script:
                    add_non_none_to_dict(result_dict, 'name', name)
                f_out.write(json.dumps(result_dict) + '\n')
                n_out += 1
    sys.stderr.write(f'Processed {codepoint} codepoints,\n   wrote {n_out} lines to {out_filename}\n'
                     f'     and {n_err} lines to {err_filename}\n')
def load_resource_files(self, data_dir: Path, load_log: bool = False,
rebuild_ud_props: bool = False, rebuild_num_props: bool = False):
"""Loads all resource files needed for romanization."""
data_dir = data_dir
if not isinstance(data_dir, pathlib.Path):
sys.stderr.write(f'Error: data_dir is of {type(data_dir)}, not a Path.\n'
f' Cannot load any resource files.\n')
return
self.load_rom_file(os.path.join(data_dir, "romanization-auto-table.txt"),
'ud', file_format='rom', load_log=load_log)
self.load_rom_file(os.path.join(data_dir, "UnicodeDataOverwrite.txt"),
'ow', file_format='u2r', load_log=load_log)
self.load_rom_file(os.path.join(data_dir, "romanization-table.txt"),
'man', file_format='rom', load_log=load_log)
self.load_chinese_pinyin_file(os.path.join(data_dir, "Chinese_to_Pinyin.txt"), load_log=load_log)
self.load_script_file(os.path.join(data_dir, "Scripts.txt"), load_log=load_log)
self.load_num_props(os.path.join(data_dir, "NumProps.jsonl"), load_log=load_log)
for base_file in ("UnicodeDataProps.txt", "UnicodeDataPropsCJK.txt", "UnicodeDataPropsHangul.txt"):
self.load_unicode_data_props(os.path.join(data_dir, base_file), load_log=load_log)
if rebuild_ud_props:
self.rebuild_unicode_data_props(os.path.join(data_dir, "UnicodeDataProps.txt"),
cjk=os.path.join(data_dir, "UnicodeDataPropsCJK.txt"),
hangul=os.path.join(data_dir, "UnicodeDataPropsHangul.txt"))
if rebuild_num_props:
self.rebuild_num_props(os.path.join(data_dir, "NumProps.jsonl"),
os.path.join(data_dir, "NumPropsRejects.jsonl"))
def unicode_hangul_romanization(self, s: str, pass_through_p: bool = False):
"""Special algorithmic solution to convert (Korean) Hangul characters to the Latin alphabet."""
if cached_rom := self.hangul_rom.get(s, None):
return cached_rom
leads = "g gg n d dd r m b bb s ss - j jj c k t p h".split()
vowels = "a ae ya yae eo e yeo ye o wa wai oe yo u weo we wi yu eu yi i".split()
tails = "- g gg gs n nj nh d l lg lm lb ls lt lp lh m b bs s ss ng j c k t p h".split()
result = ""
for c in s:
cp = ord(c)
if 0xAC00 <= cp <= 0xD7A3:
code = cp - 0xAC00
lead_index = int(code / (28 * 21))
vowel_index = int(code / 28) % 21
tail_index = code % 28
rom = leads[lead_index] + vowels[vowel_index] + tails[tail_index]
rom = rom.replace('-', '')
self.hangul_rom[c] = rom
result += rom
elif pass_through_p:
result += c
return result
@staticmethod
def char_is_nonspacing_mark(s) -> bool:
""" Checks whether a character is a nonspacing mark, e.g. combining accents, points, vowel signs"""
return (len(s) == 1) and (ud.category(s) == 'Mn')
@staticmethod
def char_is_format_char(s) -> bool:
""" Checks whether a character is a formatting character, e.g. a zero-with joiner/non-joiner"""
return (len(s) == 1) and (ud.category(s) == 'Cf')
@staticmethod
def char_is_space_separator(s) -> bool:
""" Checks whether a character is a space,
e.g. ' ', non-breakable space, en space, ideographic (Chinese) space, Ogham space mark
but excluding \t, \r, \n"""
return (len(s) == 1) and (ud.category(s) == 'Zs')
def chr_name(self, char: str) -> str:
try:
return ud.name(char)
except (ValueError, TypeError):
if name := self.dict_str[('name', char)]:
return name
return ''
def num_value(self, s: str) -> int | float | Fraction | None:
"""rom_rules include numeric values beyond UnicodeData.txt, e.g. for Egyptian numerals"""
for rom_rule in self.rom_rules[s]:
if (num := rom_rule['num']) is not None:
return num
return None
def rom_rule_value(self, s: str, key: str):
for rom_rule in self.rom_rules[s]:
if (value := rom_rule.get(key)) is not None:
return value
return None
def unicode_float2fraction(self, num: float, precision: float = 0.000001) -> Tuple[int, int] | None:
"""only for common unicode fractions"""
if chached_value := self.float2fraction.get(num, None):
return chached_value
for numerator in (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11):
for denominator in (2, 3, 4, 5, 6, 8, 12, 16, 20, 32, 40, 64, 80, 160, 320):
if abs(numerator / denominator - num) < precision:
result = numerator, denominator
self.float2fraction[num] = result
return result
return None
def chr_script_name(self, char: str) -> str:
    """Look up the script name recorded for char (letters, diacritics, numerals etc.)
    under the ('script', char) key of self.dict_str."""
    script_key = ('script', char)
    return self.dict_str[script_key]
def test_output_of_selected_scripts_and_rom_rules(self):
    """Low-level diagnostic that prints romanization information for a fixed set of samples:
    script records, romanization rules, script names, per-character properties,
    number edges and number properties."""
    lines = []
    for script_label in ("Oriya", "Chinese"):
        script_record = self.scripts[script_label.lower()]
        lines.append(f'SCRIPT {script_label} {script_record}')
    for sample in ('ƿ', 'β', 'и', 'μπ', '⠹', '亿', 'ちょ', 'и', '𓍧', '正', '分之', 'ऽ', 'ศ', 'ด์'):
        rules = self.rom_rules[sample]
        lines.append(f'DICT {sample} {rules}')
    for sample in ('ƿ', 'β', 'न', 'ु'):
        lines.append(f'SCRIPT-NAME {sample} {self.chr_script_name(sample)}')
    for sample in ('万', '\uF8F7', '\U00013368', '\U0001308B', '\u0E48', '\u0E40'):
        props = (('name', self.chr_name(sample)),
                 ('num', self.dict_num[sample]),
                 ('pic', self.dict_str[('pic', sample)]),
                 ('tone-mark', self.dict_str[('tone-mark', sample)]),
                 ('syllable-info', self.dict_str[('syllable-info', sample)]),
                 ('is-large-power', self.dict_bool[('is-large-power', sample)]))
        line = f'PROPS {sample}'
        for label, value in props:
            if not value:
                continue  # only properties with a truthy value are shown
            line += f' {label}: {value}'
            if label == 'num':
                line += f' ({type(value).__name__})'
        lines.append(line)
    mayan12 = '\U0001D2EC'
    egyptian600 = '𓍧'
    runic90 = '𐍁'
    klingon2 = '\uF8F2'
    numeral_samples = f'9九万萬百፲፱፻፸¾0²₂AⅫ⑫൵{runic90}{mayan12}{egyptian600}{klingon2}'
    for offset, c in enumerate(numeral_samples):
        lines.append(f'NUM-EDGE: {NumEdge(offset, offset+1, c, self)}')
    for sample in ('\u00bc', '\u0968'):
        lines.append(f'NUM-PROPS: {self.num_props[sample]}')
    print(''.join(line + '\n' for line in lines))
def test_romanization(self, **args):
    """A few full cases of romanization testing.

    Romanizes a handful of sample sentences (reported on stderr), then romanizes every
    code point from U+0000 through U+F0000 on its own and reports those whose romanization
    contains both whitespace and non-whitespace."""
    samples = [('ألاسكا', None), ('यह एक अच्छा अनुवाद है.', 'hin'), ('ちょっとまってください', 'kor'),
               ('Μπανγκαλόρ', 'ell'), ('Зеленський', 'ukr'), ('കേരളം', 'mal')]
    for sample in samples:
        text = sample[0]
        sample_lcode = sample[1] if len(sample) >= 2 else None
        romanized = self.romanize_string(text, lcode=sample_lcode, **args)
        sys.stderr.write(f'ROM {text} -> {romanized}\n')
    n_alerts = 0
    for codepoint in range(0xF0000 + 1):
        c = chr(codepoint)
        rom = self.romanize_string(c)
        has_space = regex.search(r'\s', rom)
        if has_space and regex.search(r'\S', rom):
            name = self.chr_name(c)
            sys.stderr.write(f'U+{codepoint:04X} {c} {name} {rom}\n')
            n_alerts += 1
    sys.stderr.write(f'{n_alerts} alerts for roms with spaces\n')
def romanize_file(self, input_filename: str | None = None, output_filename: str | None = None,
                  lcode: str | None = None, direct_input: List[str] = None, **args):
    """Apply romanization to an entire file, line by line.

    Input is taken from direct_input (a non-empty list of lines, used only when input_filename
    is None), else from the file input_filename, else from stdin. Output goes to the file
    output_filename, or to stdout when it is None. Problems opening files are reported on
    stderr and nothing is romanized.

    A line of the form '::lcode xxx <text>' overrides the language code for that line; the
    lcode marker is echoed as a prefix (string output) or as a leading meta edge (edge output).
    Language code (lcode) recommended.

    Recognized entries in args (all others are passed through to romanize_string):
      max_lines -- stop after this many input lines
      silent    -- suppress the progress dots written to stderr every 100 lines
    """
    f_in, f_out = None, None
    f_in_to_be_closed, f_out_to_be_closed = False, False
    try:
        if direct_input and (input_filename is None):
            f_in = direct_input  # list of lines
        elif isinstance(input_filename, str):
            try:
                # NOTE(review): opened with the platform default encoding; confirm whether
                # encoding='utf-8' should be forced for non-UTF-8 locales.
                f_in = open(input_filename)
                f_in_to_be_closed = True
            except OSError:
                sys.stderr.write(f'Error in romanize_file: Cannot open file {input_filename}\n')
                f_in = None
        elif input_filename is None:
            f_in = sys.stdin
        else:
            sys.stderr.write(f"Error in romanize_file: argument 'input_filename' {input_filename} "
                             f"is of wrong type: {type(input_filename)} (should be str)\n")
            f_in = None
        if isinstance(output_filename, str):
            try:
                f_out = open(str(output_filename), 'w')
                f_out_to_be_closed = True
            except OSError:
                sys.stderr.write(f'Error in romanize_file: Cannot write to file {output_filename}\n')
                f_out = None
        elif output_filename is None:
            f_out = sys.stdout
        else:
            sys.stderr.write(f"Error in romanize_file: argument 'output_filename' {output_filename} "
                             f"is of wrong type: {type(output_filename)} (should be str)\n")
            f_out = None
        if f_in and f_out:
            max_lines = args.get('max_lines', None)
            progress_dots_output = False
            for line_number, line in enumerate(f_in, 1):
                if m := regex.match(r'(::lcode\s+)([a-z]{3})(\s+)(.*?)\s*$', line):
                    lcode_kw, lcode2, space, snt = m.group(1, 2, 3, 4)
                    rom_result = self.romanize_string(snt, lcode2 or lcode, **args)
                    # Branch on the actual result type rather than on args['rom_format']:
                    # when rom_format is omitted, romanize_string defaults to string output.
                    if isinstance(rom_result, str):
                        lcode_prefix = f"{lcode_kw}{lcode2}{space}"
                        f_out.write(lcode_prefix + rom_result + '\n')
                    else:
                        lcode_prefix = f'[0, 0, "", "lcode: {lcode2}"]'  # meta edge with lcode info
                        prefixed_edges = [lcode_prefix] + rom_result
                        f_out.write(Edge.json_str(prefixed_edges) + '\n')
                else:
                    f_out.write(Edge.json_str(self.romanize_string(line.rstrip(), lcode, **args)) + '\n')
                if not args.get('silent'):
                    if line_number % 100 == 0:
                        if line_number % 1000 == 0:
                            sys.stderr.write(str(line_number))
                        else:
                            sys.stderr.write('.')
                        progress_dots_output = True
                        sys.stderr.flush()
                        gc.collect()
                if max_lines and line_number >= max_lines:
                    break
            if progress_dots_output:
                sys.stderr.write('\n')
                sys.stderr.flush()
    finally:
        # Close only what this method opened, even if romanization raised.
        if f_in_to_be_closed:
            f_in.close()
        if f_out_to_be_closed:
            f_out.close()
@staticmethod
def apply_any_offset_to_cached_rom_result(cached_rom_result: str | List[Edge], offset: int = 0) \
-> str | List[Edge]:
if isinstance(cached_rom_result, str):
return cached_rom_result
elif offset == 0:
return cached_rom_result
else:
return [Edge(edge.start + offset, edge.end + offset, edge.txt, edge.type) for edge in cached_rom_result]
def romanize_string_core(self, s: str, lcode: str | None, rom_format: RomFormat, cache_p: bool,
                         offset: int = 0, **args) -> str | List[Edge]:
    """Romanize one token; supports token-by-token romanization with caching for higher speed.

    With cache_p, results are looked up and stored in self.rom_cache under (s, lcode, rom_format).
    Edge results are returned with their positions shifted by offset; string results are
    returned as is."""
    cache_key = (s, lcode, rom_format)
    if cache_p:
        hit = self.rom_cache.get(cache_key, None)
        if hit is not None:
            return self.apply_any_offset_to_cached_rom_result(hit, offset)
    lat = Lattice(s, uroman=self, lcode=lcode)
    lat.pick_tibetan_vowel_edge(**args)
    lat.prep_braille(**args)
    lat.add_romanization(**args)
    lat.add_numbers(self, **args)
    lat.add_braille_numbers(**args)
    lat.add_rom_fall_back_singles(**args)
    if rom_format == RomFormat.LATTICE:
        edges = lat.all_edges(0, len(s))
        lat.add_alternatives(edges)
    else:
        edges = lat.best_rom_edge_path(0, len(s))
        if rom_format not in (RomFormat.EDGES, RomFormat.ALTS):
            # plain string output
            rom = lat.edge_path_to_surf(edges)
            del lat
            if cache_p:
                self.rom_cache[cache_key] = rom
            return rom
        if rom_format == RomFormat.ALTS:
            lat.add_alternatives(edges)
    if cache_p:
        self.rom_cache[cache_key] = edges
    return self.apply_any_offset_to_cached_rom_result(edges, offset)
def romanize_string(self, s: str, lcode: str | None = None, rom_format: RomFormat = RomFormat.STR, **args) \
        -> str | List[Edge]:
    """Main entry point for romanizing a string. Recommended argument: lcode (language code).

    Unless args['no_caching'] is set, the string is cut at delimiter runs that contain a
    space or an ideographic full stop, and every piece (tokens and delimiters alike) is
    romanized separately through the token cache, with edge offsets kept relative to s.
    Method returns a string or a list of edges (with start and end offsets)."""
    lcode = lcode or args.get('lcode', None)
    cache_p = not args.get('no_caching', False)
    if not cache_p:
        return self.romanize_string_core(s, lcode, rom_format, cache_p, 0, **args)
    # with caching: accumulate a string for STR output, a list of edges otherwise
    collected = '' if rom_format == RomFormat.STR else []
    remainder, position = s, 0
    while True:
        m3 = regex.match(r'(.*?)([.,; ]*[ 。][.,; ]*)(.*)$', remainder)
        if not m3:
            break
        token, separator, remainder = m3.group(1, 2, 3)
        for piece in (token, separator):
            collected += self.romanize_string_core(piece, lcode, rom_format, cache_p, position, **args)
            position += len(piece)
    collected += self.romanize_string_core(remainder, lcode, rom_format, cache_p, position, **args)
    return collected
class Edge:
    """An edge spans part of a sentence (start/end offsets) and carries one specific
    romanization for that span. There might be multiple edges for a given span.
    The edges in turn are part of the romanization lattice."""

    def __init__(self, start: int, end: int, s: str, annotation: str = None):
        self.start, self.end = start, end
        self.txt = s            # romanization text of the span
        self.type = annotation  # annotation such as 'rom'; may be None

    def __str__(self):
        return '[{}-{}] {} ({})'.format(self.start, self.end, self.txt, self.type)

    def __repr__(self):
        return self.__str__()

    def json(self) -> str:
        """Serialize as a JSON array: start - end - text - annotation."""
        fields = [self.start, self.end, self.txt, self.type]
        return json.dumps(fields)

    @staticmethod
    def json_str(rom_result: List[Edge] | str) -> str:
        """Render a romanization result for output: strings pass through unchanged;
        a list is rendered as '[' + items + ']', using Edge.json() for edges and str() for
        anything else (e.g. a pre-formatted meta edge)."""
        if isinstance(rom_result, str):
            return rom_result
        # NOTE(review): items are concatenated without a separator, so a multi-edge result
        # is not valid JSON; confirm what downstream consumers expect before changing.
        rendered = (item.json() if isinstance(item, Edge) else str(item) for item in rom_result)
        return '[' + ''.join(rendered) + ']'
class NumEdge(Edge):
    """Edge for a numeric token. Besides the Edge fields it tracks the numeric value,
    an optional fraction, base/multiplier information, script, and whether the edge is active."""

    def __init__(self, start: int, end: int, s: str, uroman: Uroman | None, active: bool = False):
        """For NumEdge, the s argument is in original language (not yet romanized).

        For a single-character span, numeric properties are taken from uroman.num_props
        (if uroman is given and has an entry for that character); such an edge becomes active."""
        # For speed, much of this processing should at some point be cached in data files.
        Edge.__init__(self, start, end, s)
        self.orig_txt, self.txt = s, s
        self.value, self.fraction, self.num_base, self.base_multiplier = None, None, None, None
        self.type, self.script, self.is_large_power, self.active = None, None, False, active
        self.n_decimals = None
        self.value_s = None  # precision for 3.14159265358979323846264338327950288419716939937510582097494
        # uroman is declared optional and s may be empty: only consult num_props when both exist
        if (start+1 == end) and s and (uroman is not None):
            char = s[0]
            if d := uroman.num_props.get(char):
                self.active = True
                self.value = d.get('value')
                fraction_list = d.get('fraction')
                self.fraction = Fraction(fraction_list[0], fraction_list[1]) if fraction_list else None
                self.num_base = d.get('base')
                self.base_multiplier = d.get('mult')
                self.type = d.get('type')
                self.script = d.get('script')
                self.is_large_power = d.get('is-large-power')
                self.update()

    def update(self,
               value: int | float | None = None,
               value_s: str | None = None,
               fraction: Fraction | None = None,
               n_decimals: int | None = None,
               num_base: int | None = None,
               base_multiplier: int | float | None = None,
               script: str | None = None,
               e_type: str | None = None,
               orig_txt: str | None = None) -> str:
        """Overwrite the fields for which a non-None argument is given, then rebuild and
        return self.txt: '<value> <fraction>' (either part may be missing), falling back
        to the original text when both parts are empty."""
        self.value = first_non_none(value, self.value)
        self.value_s = first_non_none(value_s, self.value_s)
        self.fraction = first_non_none(fraction, self.fraction)
        self.n_decimals = first_non_none(n_decimals, self.n_decimals)
        self.num_base = first_non_none(num_base, self.num_base)
        self.base_multiplier = first_non_none(base_multiplier, self.base_multiplier)
        self.script = first_non_none(script, self.script)
        self.type = first_non_none(e_type, self.type)
        self.orig_txt = first_non_none(orig_txt, self.orig_txt)
        if self.value_s is not None:
            # an explicit string form takes precedence over formatting the numeric value
            value_s = self.value_s
        elif self.value is None:
            value_s = ''
        elif isinstance(self.value, float) and (self.n_decimals is not None):
            value_s = first_non_none(self.value_s, f'{self.value:0.{self.n_decimals}f}')
        else:
            value_s = str(self.value)
        fraction_s = '' if self.fraction is None else f'{self.fraction.numerator}/{self.fraction.denominator}'
        delimiter_s = ' ' if value_s and fraction_s else ''
        self.txt = (value_s + delimiter_s + fraction_s) or self.orig_txt
        return self.txt

    def __str__(self):
        """Debug rendering; inactive edges are prefixed with ' *'. Optional parts
        (LP, B:, V:, VS:, F:, S:) appear only when they carry information."""
        if self.num_base is not None:
            if self.base_multiplier is not None:
                b_clause = f'{self.base_multiplier}*{self.num_base}'
            else:
                b_clause = str(self.num_base)
        else:
            b_clause = None
        return (('' if self.active else ' *')
                + f'[{self.start}-{self.end}] {self.orig_txt} R:{self.txt} T:{self.type}'
                + (' LP' if self.is_large_power else '')
                + (f' B:{b_clause}' if (b_clause is not None) else '')
                + (f' V:{self.value}' if ((self.value is not None) and (str(self.value) != self.txt)) else '')
                + (f' VS:{self.value_s}' if ((self.value_s is not None) and (self.value_s != self.txt)) else '')
                + (f' F:.{self.n_decimals}f' if self.n_decimals else '')
                + (f' S:{self.script}' if self.script else ''))
class Lattice:
"""Lattice for a specific romanization instance. Has edges."""
def __init__(self, s: str, uroman: Uroman, lcode: str = None):
    """Set up an empty lattice over s and record which scripts occur in it."""
    self.uroman = uroman
    self.s, self.lcode = s, lcode
    self.max_vertex = len(s)
    # keys used below: (start, end) -> edges, (start, 'right') -> ends, (end, 'left') -> starts
    self.lattice = defaultdict(set)
    self.props = {}
    self.simple_top_rom_cache = {}
    self.contains_script = defaultdict(bool)
    self.check_for_scripts()
def check_for_scripts(self):
    """Flag in self.contains_script every script name that occurs in self.s;
    'Braille' is additionally flagged for any character in U+2800..U+28FF."""
    script_of = self.uroman.chr_script_name
    for char in self.s:
        self.contains_script[script_of(char)] = True
    if regex.search(r'[\u2800-\u28FF]', self.s):
        self.contains_script['Braille'] = True
def add_edge(self, edge: Edge):
    """Insert edge into the lattice and index its end points in both directions."""
    begin, finish = edge.start, edge.end
    lattice = self.lattice
    lattice[(begin, finish)].add(edge)
    lattice[(begin, 'right')].add(finish)
    lattice[(finish, 'left')].add(begin)
def __str__(self):
edges = []
for start in range(self.max_vertex):
for end in self.lattice[(start, 'right')]:
for edge in self.lattice[(start, end)]:
edges.append(f'[{start}-{end}] {edge.txt} ({edge.type})')
return ' '.join(edges)
@staticmethod
def char_is_braille(c: str) -> bool:
return 0x2800 <= ord(c[0]) <= 0x28FF
# Character-class helpers (used e.g. by the Tibetan heuristics)
def char_is_subjoined_letter(self, c: str) -> bool:
    """Return True iff the character name of c contains 'SUBJOINED LETTER'."""
    char_name = self.uroman.chr_name(c)
    return "SUBJOINED LETTER" in char_name
def char_is_regular_letter(self, c: str) -> bool:
    """Return True iff the character name of c contains 'LETTER' but not 'SUBJOINED'."""
    char_name = self.uroman.chr_name(c)
    if "SUBJOINED" in char_name:
        return False
    return "LETTER" in char_name
def char_is_letter(self, c: str) -> bool:
    """Return True iff the character name of c contains 'LETTER'."""
    char_name = self.uroman.chr_name(c)
    return "LETTER" in char_name
def char_is_vowel_sign(self, c: str) -> bool:
    """Return the 'is-vowel-sign' flag recorded for c in uroman.dict_bool."""
    flag_key = ('is-vowel-sign', c)
    return self.uroman.dict_bool[flag_key]
def char_is_letter_or_vowel_sign(self, c: str) -> bool:
    """Return a truthy value iff c is a letter (by character name) or is flagged as a vowel sign."""
    if self.char_is_letter(c):
        return True
    return self.char_is_vowel_sign(c)
def is_at_start_of_word(self, position: int) -> bool:
    """Tell whether position starts a word, judged by the edges that end at position.

    It does not start a word if some such edge has a romanization ending in an alphabetic
    character (or in an apostrophe, when the character at position is Braille).
    The finding is memoized in self.props under ('preceded_by_alpha', position)."""
    char_here_is_braille = self.char_is_braille(self.s[position])
    memo_key = ('preceded_by_alpha', position)
    if (known := self.props.get(memo_key, None)) in (True, False):
        return not known
    for left_vertex in self.lattice[(position, 'left')]:
        for edge in self.lattice[(left_vertex, position)]:
            if edge.txt == '':
                continue  # an empty romanization says nothing about the preceding letter
            final_char = edge.txt[-1]
            if final_char.isalpha() or (char_here_is_braille and (final_char in ["'"])):
                self.props[memo_key] = True
                return False
    self.props[memo_key] = False
    return True
def is_at_end_of_word(self, position: int) -> bool:
    """Tell whether a word ends at position, judged by the rules applicable to what follows.

    Nonspacing NUKTA marks at position are skipped first. The word does not end here if some
    span starting there has a rule (other than a use-only-at-start-of-word rule) whose
    romanization contains a letter. The finding is memoized in self.props under
    ('followed_by_alpha', position)."""
    memo_key = ('followed_by_alpha', position)
    if (known := self.props.get(memo_key, None)) in (True, False):
        return not known
    uroman = self.uroman
    begin = position
    while (begin+1 < self.max_vertex) \
            and uroman.char_is_nonspacing_mark(self.s[begin]) \
            and ('NUKTA' in uroman.chr_name(self.s[begin])):
        begin += 1
    for stop in range(begin + 1, self.max_vertex + 1):
        span = self.s[begin:stop]
        if not uroman.dict_bool[('s-prefix', span)]:
            break  # no rule source string starts with this span, so longer spans cannot match
        for rule in uroman.rom_rules[span]:
            target = rule['t']
            if rule['use-only-at-start-of-word']:
                continue
            if regex.search(r'\pL', target):
                self.props[memo_key] = True
                return False
    self.props[memo_key] = False
    return True
def is_at_end_of_syllable(self, position: int) -> Tuple[bool, str]:
    """Decide whether a syllable ends at position (at least initially for Thai).

    position is the end offset of the span under consideration, so s[position] is the next
    character and s[position-2] the character before the span's last character.
    Returns (verdict, rationale), where rationale is a short tag naming the rule that fired.
    The checks below are order-dependent: the first one that applies decides."""
    prev_char = self.s[position-2] if position >= 2 else None
    # char = self.s[position-1] if position >= 1 else None
    next_char = self.s[position] if position < self.max_vertex else None
    # A tone mark right after the span is skipped: look at the character after it instead.
    if self.uroman.dict_str[('tone-mark', next_char)]:
        adj_position = position + 1
        next_char = self.s[adj_position] if adj_position < self.max_vertex else None
        # print('TONE-MARK', position, next_char)
    else:
        adj_position = position
    next_char2 = self.s[adj_position + 1] if adj_position + 1 < self.max_vertex else None
    if prev_char is None:
        return False, 'start-of-string'
    if not regex.search(r'(?:\pL|\pM)$', prev_char):  # start of token
        return False, 'start-of-token'
    if self.uroman.dict_str[('syllable-info', prev_char)] == 'written-pre-consonant-spoken-post-consonant':
        return False, 'pre-post-vowel-on-left'
    if self.uroman.dict_str[('syllable-info', next_char)] == 'written-pre-consonant-spoken-post-consonant':
        return True, 'pre-post-vowel-on-right'
    if adj_position >= self.max_vertex:  # end of string
        return True, 'end-of-string'
    # if not self.char_is_letter_or_vowel_sign(next_char):  # end of token
    if not regex.match(r'(?:\pL|\pM)', next_char):  # end of token
        return True, 'end-of-token'
    if position > 0:
        left_edge = self.best_left_neighbor_edge(position-1)
        if left_edge and regex.search(r'[bcdfghjklmnpqrstvxz]$', left_edge.txt):
            return False, 'consonant-to-the-left'
    # Romanization of what follows: prefer a two-character span, then one character, else "?".
    next_char_rom = first_non_none(self.simple_top_romanization_candidate_for_span(adj_position,
                                                                                   adj_position + 2,
                                                                                   simple_search=True),
                                   self.simple_top_romanization_candidate_for_span(adj_position,
                                                                                   adj_position + 1,
                                                                                   simple_search=True),
                                   "?")
    if not regex.match(r"[aeiou]", next_char_rom.lower()):  # followed by consonant
        return True, f'not-followed-by-vowel {next_char_rom}'
    if (next_char == '\u0E2D') and (next_char2 is not None):  # THAI CHARACTER O ANG
        next_char2_rom = first_non_none(self.simple_top_romanization_candidate_for_span(adj_position+1,
                                                                                        adj_position+2,
                                                                                        simple_search=True),
                                        "?")
        if regex.match(r"[aeiou]", next_char2_rom.lower()):
            return True, 'o-ang-followed-by-vowel'  # In that context Thai char. "o ang" is considered a consonant
    return False, 'not-at-syllable-end-by-default'
def romanization_by_first_rule(self, s) -> str | None:
    """Return the target ('t') of the first romanization rule listed for s,
    or None when no rule is listed."""
    rules = self.uroman.rom_rules[s]
    if len(rules) == 0:
        return None
    return rules[0]['t']
def expand_rom_with_special_chars(self, rom: str, start: int, end: int, **args) \
        -> Tuple[str, int, int, str | None]:
    """Apply context-dependent romanization heuristics to an existing or preliminary edge.

    Given the romanization rom of span start..end, this may modify rom and/or widen the span
    to absorb a neighbouring character (Braille capital sign, consonant doubler, Thai
    pre-consonant vowel, Coptic grave accent, Japanese small y / vowel lengthener, virama).

    Returns (rom, start, end, annotation) where annotation is 'rom exp' (span or text was
    expanded), 'rom del' (romanization deleted) or None. An empty rom is returned unchanged.
    Recognized entry in args: 'ablation' (a value containing 'nocap' disables the
    capitalization normalization)."""
    orig_start = start
    uroman = self.uroman
    full_string = self.s
    annot = None
    if rom == '':
        return rom, start, end, None
    prev_char = (full_string[start-1] if start >= 1 else '')
    first_char = full_string[start]
    last_char = full_string[end-1]
    next_char = (full_string[end] if end < len(full_string) else '')
    # \u2820 is the Braille character indicating that the next letter is upper case
    if (prev_char == '\u2820') and regex.match(r'[a-z]', rom):
        return rom[0].upper() + rom[1:], start-1, end, 'rom exp'
    # Normalize multi-upper case THessalonike -> Thessalonike, but don't change THESSALONIKE
    if start+1 == end and rom.isupper() and next_char.islower():
        ablation = args.get('ablation', '')  # VERBOSE
        if not ('nocap' in ablation):
            rom = rom.capitalize()
    # Japanese small tsu (and Gurmukhi addak) used as consonant doubler:
    # NOTE(review): the script comparison below compares prev_char with itself and is thus
    # always true; presumably another character was intended. Left as is, since narrowing
    # it could change existing output. TODO confirm.
    if (prev_char and prev_char in 'っッ\u0A71') \
            and (uroman.chr_script_name(prev_char) == uroman.chr_script_name(prev_char)) \
            and (m_double_consonant := regex.match(r'(ch|[bcdfghjklmnpqrstwz])', rom)):
        # expansion might additionally apply to the right, so no early return here
        if prev_char in 'っッ':  # for Japanese, per Hepburn, use tch
            rom = m_double_consonant.group(1).replace('ch', 't') + rom
        else:
            rom = m_double_consonant.group(1).replace('ch', 'c') + rom
        start = start-1
        first_char = full_string[start]
        prev_char = (full_string[start-1] if start >= 1 else '')
    # Thai
    if uroman.chr_script_name(first_char) == 'Thai':
        if (start+1 == end) and regex.match(r'[bcdfghjklmnpqrstvwxyz]+$', rom):
            if uroman.dict_str[('syllable-info', prev_char)] == 'written-pre-consonant-spoken-post-consonant':
                # Look for a vowel written around the consonant: <prefix>–<suffix> rule patterns
                for vowel_prefix_len in [1]:
                    if vowel_prefix_len <= start:
                        for vowel_suffix_len in [3, 2, 1]:
                            if end + vowel_suffix_len <= len(full_string):
                                pattern = (full_string[start-vowel_prefix_len: start]
                                           + '–'
                                           + full_string[end:end+vowel_suffix_len])
                                if uroman.rom_rules[pattern]:
                                    vowel_rom_rule = uroman.rom_rules[pattern][0]
                                    vowel_rom = vowel_rom_rule['t']
                                    return rom + vowel_rom, start-vowel_prefix_len, end+vowel_suffix_len, 'rom exp'
        if (uroman.chr_script_name(prev_char) == 'Thai') \
                and (uroman.dict_str[('syllable-info', prev_char)]
                     == 'written-pre-consonant-spoken-post-consonant') \
                and regex.match(r'[bcdfghjklmnpqrstvwxyz]', rom) \
                and (vowel_rom := self.romanization_by_first_rule(prev_char)):
            return rom + vowel_rom, start-1, end, 'rom exp'
        # THAI CHARACTER O ANG
        if (first_char == '\u0E2D') and (end - start == 1):
            prev_script = uroman.chr_script_name(prev_char)
            next_script = uroman.chr_script_name(next_char)
            prev_rom = self.find_rom_edge_path_backwards(0, start, 1, return_str=True)
            next_rom = self.romanization_by_first_rule(next_char)
            # delete THAI CHARACTER O ANG unless it is surrounded on both sides by a Thai consonant
            if not ((prev_script == 'Thai') and (next_script == 'Thai')
                    and regex.match(r'[bcdfghjklmnpqrstvwxz]+$', prev_rom)
                    and regex.match(r'[bcdfghjklmnpqrstvwxz]+$', next_rom)):
                return '', start, end, 'rom del'
    # Coptic: consonant + grace-accent = e + consonant
    if next_char and (next_char == "\u0300") and (uroman.chr_script_name(last_char) == "Coptic")\
            and (not self.simple_top_romanization_candidate_for_span(orig_start, end+1)):
        rom = 'e' + rom
        end = end+1
        last_char = full_string[end - 1]
        next_char = (full_string[end] if end < len(full_string) else '')
        annot = 'rom exp'
    # Japanese small y: ki + small ya = kya etc.
    if (next_char and next_char in 'ゃゅょャュョ') \
            and (uroman.chr_script_name(last_char) == uroman.chr_script_name(next_char)) \
            and regex.search(r'([bcdfghjklmnpqrstvwxyz]i$)', rom) \
            and (y_rom := self.romanization_by_first_rule(next_char)) \
            and (not self.simple_top_romanization_candidate_for_span(orig_start, end+1)) \
            and (not self.simple_top_romanization_candidate_for_span(start, end+1)):
        rom = rom[:-1] + y_rom
        end = end+1
        last_char = full_string[end - 1]
        next_char = (full_string[end] if end < len(full_string) else '')
        annot = 'rom exp'
    # Japanese vowel lengthener (U+30FC)
    last_rom_char = last_chr(rom)
    if (next_char == 'ー') \
            and (uroman.chr_script_name(last_char) in ('Hiragana', 'Katakana')) \
            and (last_rom_char in 'aeiou'):
        return rom + last_rom_char, start, end+1, 'rom exp'
    # Virama (in Indian languages)
    if self.uroman.dict_bool[('is-virama', next_char)]:
        return rom, start, end + 1, "rom exp"
    # Drop a leading/trailing space of the romanization at the string boundary or next to a space
    if rom.startswith(' ') and ((start == 0) or (prev_char == ' ')):
        rom = rom[1:]
    # end is an exclusive offset, so the span reaches the end of the string at len(full_string)
    if rom.endswith(' ') and ((end == len(full_string)) or (next_char == ' ')):
        rom = rom[:-1]
    return rom, start, end, annot
def prep_braille(self, **_args) -> None:
    """For strings containing Braille, mark the characters of all-caps words.

    Two consecutive U+2820 signs switch all-caps mode on; it stays on until the next
    Braille space (U+2800). Each character seen while it is on gets
    self.props[('is-upper', index)] = True."""
    if not self.contains_script['Braille']:
        return
    caps_sign = '\u2820'  # characters in following word are upper case
    braille_space = '\u2800'
    in_caps_word = False
    previous = None
    for index, char in enumerate(self.s):
        if (previous == caps_sign) and (char == caps_sign):
            in_caps_word = True
        elif in_caps_word:
            if char == braille_space:
                in_caps_word = False
            else:
                self.props[('is-upper', index)] = True
        previous = char
def pick_tibetan_vowel_edge(self, **args) -> None:
    """For strings containing Tibetan, decide per syllable which letter carries the vowel.

    The string is cut into maximal runs of Tibetan letters/vowel signs (treated as syllables).
    For each position i of a run, self.props[('edge-vowel', i)] is set to True/False, and
    self.props[('edge-delete', i)] is set to True for letters to be dropped. If a run has
    no explicit vowel carrier, candidate positions are scored by how plausible the resulting
    consonant prefix/suffix is, and the cheapest position is chosen.
    With args['verbose'], the choice made for such runs is reported on stderr."""
    if not self.contains_script['Tibetan']:
        return None
    verbose = bool(args.get('verbose'))
    s = self.s
    uroman = self.uroman
    tibetan_syllable = []  # list of runs; each run is a list of string positions
    tibetan_letter_positions = []
    for start in range(self.max_vertex):
        c = s[start]
        if (uroman.chr_script_name(c) == 'Tibetan') and self.char_is_letter_or_vowel_sign(c):
            tibetan_letter_positions.append(start)
        else:
            if tibetan_letter_positions:
                tibetan_syllable.append(tibetan_letter_positions)
                tibetan_letter_positions = []
    if tibetan_letter_positions:
        tibetan_syllable.append(tibetan_letter_positions)
    for tibetan_letter_positions in tibetan_syllable:
        vowel_pos = None
        orig_txt = ''
        roms = []  # per-letter romanizations of this run
        subjoined_letter_positions = []
        first_letter_position = tibetan_letter_positions[0]
        for i in tibetan_letter_positions:
            c = s[i]
            orig_txt += c
            rom = first_non_none(self.simple_top_romanization_candidate_for_span(i, i+1), "?")
            self.props[('edge-vowel', i)] = None  # None = undecided so far
            if self.char_is_vowel_sign(c) or (rom and regex.match(r"[aeiou]+$", rom)):
                vowel_pos = i
                self.props[('edge-vowel', i)] = True
                # delete any syllable initial ' before vowel
                if roms == ["'"]:
                    self.props[('edge-delete', i-1)] = True
            elif self.char_is_subjoined_letter(c):
                subjoined_letter_positions.append(i)
                if i > first_letter_position:
                    if c == "\u0FB0":
                        vowel_pos = i-1
                        self.props[('edge-vowel', i-1)] = True
                    else:
                        self.props[('edge-vowel', i-1)] = False
                # strip a final 'a' after a consonant from the letter's romanization
                rom = regex.sub(r'([bcdfghjklmnpqrstvwxyz].*)a$', r'\1', rom)
            elif c == "\u0F60":  # Tibetan letter -a (')
                self.props[('edge-vowel', i)] = False
                if i > first_letter_position:
                    vowel_pos = i-1
                    self.props[('edge-vowel', i-1)] = True
                if i == tibetan_letter_positions[-1]:
                    self.props[('edge-delete', i)] = True
                if roms and not (roms[-1] in "aeiou"):
                    rom = "a'"
                else:
                    rom = "'"
            else:
                rom = regex.sub(r'([bcdfghjklmnpqrstvwxyz].*)a$', r'\1', rom)
            roms.append(rom)
        if vowel_pos is not None:
            # An explicit vowel carrier was found: every still-undecided letter gets no vowel.
            for i in tibetan_letter_positions:
                if self.props.get(('edge-vowel', i)) is None:
                    self.props[('edge-vowel', i)] = False
        else:
            # No explicit carrier: score each position as the place for the default vowel.
            best_cost, best_vowel_pos, best_pre, best_post = math.inf, None, None, None
            n_letters = len(tibetan_letter_positions)
            for i in tibetan_letter_positions:
                rel_pos = i - first_letter_position
                # romanization before/after the candidate vowel slot
                pre, post = ''.join(roms[:rel_pos+1]), ''.join(roms[rel_pos+1:])
                if self.props.get(('edge-vowel', i)) is False:
                    cost = 20
                    if cost < best_cost:
                        best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post
                elif n_letters == 1:
                    cost = 0
                    if cost < best_cost:
                        best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post
                elif n_letters == 2:
                    cost = 0 if i == 0 else 0.1
                    if cost < best_cost:
                        best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post
                else:
                    good_suffix = regex.match(r"(?:|[bcdfghjklmnpqrstvwxz]|bh|bs|ch|cs|dd|ddh|"
                                              r"dh|dz|dzh|gh|gr|gs|kh|khs|kss|n|nn|nt|ms|ng|ngs|ns|ph|"
                                              r"rm|sh|ss|th|ts|tsh|tt|tth|zh|zhs)'?$", post)
                    good_prefix = regex.match(r"'?(?:.|bd|br|brg|brgy|bs|bsh|bst|bt|bts|by|bz|bzh|"
                                              r"ch|db|dby|dk|dm|dp|dpy|dr|"
                                              r"gl|gn|gr|gs|gt|gy|gzh|kh|khr|khy|kr|ky|ld|lh|lt|mkh|mny|mth|mtsh|"
                                              r"ny|ph|phr|phy|rgy|rk|el|rn|rny|rt|rts|"
                                              r"sk|skr|sky|sl|sm|sn|sny|sp|spy|sr|st|th|ts|tsh)$", pre)
                    # True when all letters from two positions after the candidate on are subjoined
                    subjoined_suffix = all([x in subjoined_letter_positions
                                            for x in tibetan_letter_positions[rel_pos+2:]])
                    # print('GOOD', good_suffix, good_prefix, subjoined_suffix, f'{pre}a{post}',
                    #       subjoined_letter_positions, tibetan_letter_positions[rel_pos+2:])
                    if good_suffix and good_prefix:
                        cost = len(pre) * 0.1
                    elif good_suffix:
                        cost = len(pre)
                    elif subjoined_suffix and good_prefix:
                        cost = len(pre) * 0.3
                    elif subjoined_suffix:
                        cost = len(pre) * 0.5
                    else:
                        cost = math.inf
                    if cost < best_cost:
                        best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post
            if best_vowel_pos is not None:
                for i in tibetan_letter_positions:
                    if self.props.get(('edge-vowel', i)) is None:
                        value = (i == best_vowel_pos)
                        self.props[('edge-vowel', i)] = value
            if verbose:
                best_cost = best_cost if isinstance(best_cost, int) else round(best_cost, 2)
                sys.stderr.write(f'Tib. best cost: "{best_pre}a{best_post}" o:{orig_txt} c:{round(best_cost, 2)}'
                                 f' p:{best_vowel_pos} {tibetan_letter_positions}\n')
def add_default_abugida_vowel(self, rom: str, start: int, end: int, annotation: str = '') -> str:
    """Adds an abugida vowel (e.g. "a") where needed. Important for many languages in South Asia.

    rom is the romanization of span start..end. Depending on the script of the span's first
    character and on the neighbouring characters, the result is the bare consonant form
    (base_rom), the consonant plus default vowel (base_rom_plus_vowel), '' (Tibetan letters
    marked for deletion), or rom itself. The checks are order-dependent; the first one that
    applies decides. Any exception raised along the way makes the method return rom."""
    uroman = self.uroman
    s = self.s
    try:
        first_s_char = s[start]
        last_s_char = s[end-1]
        script_name = uroman.chr_script_name(first_s_char)
        script = self.uroman.scripts[script_name.lower()]
        if not (abugida_default_vowels := script['abugida-default-vowels']):
            return rom
        # NOTE(review): the script object is used as part of a dict key, which requires it
        # to be hashable; if it were not, the TypeError would be swallowed by the except below.
        key = (script, rom)
        if key in uroman.abugida_cache:
            base_rom, base_rom_plus_vowel, mod_rom = uroman.abugida_cache[key]
            rom = mod_rom
        else:
            vowels_regex1 = '|'.join(abugida_default_vowels)  # e.g. 'a' or 'a|o'
            vowels_regex2 = '|'.join(map(lambda x: x + '+', abugida_default_vowels))  # e.g. 'a+' or 'a+|o+'
            if m := regex.match(fr'([cfghkmnqrstxy]?y)({vowels_regex2})-?$', rom):
                base_rom = m.group(1)
                base_rom_plus_vowel = base_rom + m.group(2)
            elif m := regex.match(fr'([bcdfghjklmnpqrstvwxyz]+)({vowels_regex1})-?$', rom):
                base_rom = m.group(1)
                base_rom_plus_vowel = base_rom + m.group(2)
                if rom.endswith('-') and (start+1 == end) and rom[0].isalpha():
                    rom = rom[:-1]
            else:
                base_rom = rom
                base_rom_plus_vowel = base_rom + abugida_default_vowels[0]
            # Only pure consonant clusters (or the Tibetan "'") are eligible for a default vowel.
            if (not regex.match(r"[bcdfghjklmnpqrstvwxyz]+$", base_rom)
                    and (not ((script_name == 'Tibetan') and (base_rom == "'")))):
                base_rom, base_rom_plus_vowel = None, None
            uroman.abugida_cache[key] = (base_rom, base_rom_plus_vowel, rom)
        if base_rom is None:
            return rom
        if 'tail' in annotation:
            return rom
        prev_s_char = s[start-1] if start >= 1 else ''
        next_s_char = s[end] if len(s) > end else ''
        next2_s_char = s[end+1] if len(s) > end+1 else ''
        if script_name == 'Tibetan':
            # Tibetan relies on the per-position 'edge-delete'/'edge-vowel' entries in self.props
            if self.props.get(('edge-delete', start)):
                return ''
            elif self.props.get(('edge-vowel', start)):
                return base_rom_plus_vowel
            else:
                return base_rom
        if (next_s_char and ((base_rom in "bcdfghklmnpqrstvwz") or (base_rom in ["ng"]))
                and (next_s_char in "យ")):  # Khmer yo
            return base_rom
        if self.uroman.dict_bool[('is-vowel-sign', next_s_char)]:
            return base_rom
        if self.uroman.dict_bool[('is-medial-consonant-sign', next_s_char)]:
            return base_rom
        if self.char_is_subjoined_letter(next_s_char):
            return base_rom
        if self.uroman.char_is_nonspacing_mark(next_s_char) \
                and self.uroman.dict_bool[('is-vowel-sign', next2_s_char)]:
            return base_rom
        if self.uroman.dict_bool[('is-virama', next_s_char)]:
            return base_rom
        if self.uroman.char_is_nonspacing_mark(next_s_char) \
                and self.uroman.dict_bool[('is-virama', next2_s_char)]:
            return base_rom
        if self.uroman.dict_bool[('is-virama', prev_s_char)]:
            return base_rom_plus_vowel
        if self.is_at_start_of_word(start) and not regex.search('r[aeiou]', rom):
            return base_rom_plus_vowel
        # delete many final schwas from most Devanagari languages (except: Sanskrit)
        if self.is_at_end_of_word(end):
            if (script_name in ("Devanagari",)) and (self.lcode not in ('san',)):  # Sanskrit
                return rom
            else:
                return base_rom_plus_vowel
        if uroman.chr_script_name(prev_s_char) != script_name:
            return base_rom_plus_vowel
        if 'VOCALIC' in self.uroman.chr_name(last_s_char):
            return base_rom
        if uroman.chr_script_name(next_s_char) == script_name:
            return base_rom_plus_vowel
    except Exception:
        # NOTE(review): best-effort fallback; this also hides programming errors in the block above.
        return rom
    else:
        pass
    # print('ABUGIDA', rom, start, script_name, script, abugida_default_vowels, prev_s_char, next_s_char)
    return rom
def cand_is_valid(self, rom_rule: RomRule, start: int, end: int, rom: str) -> bool:
    """Tell whether rom_rule may be applied to span start..end.

    A rule is rejected when rom is None, when one of its word-position restrictions
    (start/end/whole word) is violated, or when it is limited to language codes that
    do not include self.lcode."""
    if rom is None:
        return False
    at_start = self.is_at_start_of_word
    at_end = self.is_at_end_of_word
    if rom_rule['dont-use-at-start-of-word'] and at_start(start):
        return False
    if rom_rule['use-only-at-start-of-word'] and not at_start(start):
        return False
    if rom_rule['dont-use-at-end-of-word'] and at_end(end):
        return False
    if rom_rule['use-only-at-end-of-word'] and not at_end(end):
        return False
    if rom_rule['use-only-for-whole-word'] and ((not at_start(start)) or (not at_end(end))):
        return False
    allowed_lcodes = rom_rule['lcodes']
    if allowed_lcodes and (self.lcode not in allowed_lcodes):
        return False
    return True
# @profile
def simple_sorted_romanization_candidates_for_span(self, start, end) -> List[str]:
    """Return the romanizations of all valid rules for s[start:end], ordered by descending
    (number of restrictions, romanization). Empty if the span is not flagged as 's-prefix'."""
    span = self.s[start:end]
    if not self.uroman.dict_bool[('s-prefix', span)]:
        return []
    ranked = sorted(((rule['n-restr'] or 0, rule['t'])
                     for rule in self.uroman.rom_rules[span]
                     if self.cand_is_valid(rule, start, end, rule['t'])),
                    reverse=True)
    return [target for _n_restr, target in ranked]
def simple_top_romanization_candidate_for_span(self, start, end, simple_search: bool = False) -> str | None:
    """Return the romanization of the best valid rule for s[start:end], or None.

    "Best" is the first valid rule with the highest number of restrictions. Unless
    simple_search is set, a rule's end-of-syllable variant replaces its regular romanization
    when the span ends a syllable, and the outcome is stored in self.simple_top_rom_cache.
    Cached non-None results are returned directly (also for simple_search calls).
    Spans reaching outside the string yield None."""
    if (start < 0) or (end > self.max_vertex):
        return None
    span_key = (start, end)
    cached = self.simple_top_rom_cache.get(span_key)
    if cached is not None:
        return cached
    valid_rules = [rule for rule in self.uroman.rom_rules[self.s[start:end]]
                   if self.cand_is_valid(rule, start, end, rule['t'])]
    # max() keeps the first of several equally restricted rules
    top_rule = max(valid_rules, key=lambda rule: rule['n-restr'] or 0, default=None)
    top_cand = None if top_rule is None else top_rule['t']
    if simple_search:
        return top_cand
    if top_rule:
        syllable_final_variant = top_rule['t-at-end-of-syllable']
        if syllable_final_variant is not None:
            ends_syllable, _rationale = self.is_at_end_of_syllable(end)
            if ends_syllable:
                top_cand = syllable_final_variant
    self.simple_top_rom_cache[span_key] = top_cand
    return top_cand
def decomp_rom(self, char_position: int) -> str | None:
    """Romanize the character at char_position via its Unicode compatibility decomposition.

    Input: position of a decomposable character such as ﻼ or ½
    Output: la or 1/2 (None if the character has no usable decomposition)

    Only decompositions with a formatting tag other than <super>, <sub>, <noBreak>, <compat>
    are used, and only if every remaining component is a valid hexadecimal code point.
    For numeric characters, the fraction slash is replaced by '/', and a space is added on
    each side that borders another numeric character (23½ -> 23 1/2)."""
    full_string = self.s
    char = full_string[char_position]
    ud_decomp_s = ud.decomposition(char)
    if not ud_decomp_s:
        return None
    format_comps = []
    other_comps = []
    decomp_chars = []
    for ud_decomp_elem in ud_decomp_s.split():
        if ud_decomp_elem.startswith("<"):
            format_comps.append(ud_decomp_elem)
            continue
        try:
            decomp_chars.append(chr(int(ud_decomp_elem, 16)))
        except ValueError:
            other_comps.append(ud_decomp_elem)
    decomp_s = ''.join(decomp_chars)
    if (not format_comps
            or format_comps[0] in ('<super>', '<sub>', '<noBreak>', '<compat>')
            or other_comps
            or not decomp_s):
        return None
    rom = self.uroman.romanize_string(decomp_s, self.lcode)
    # ud.numeric() returns 0.0 for zero digits, so "is numeric" must be tested with
    # "is not None"; a plain truth test would treat the 0 in 20½ as non-numeric.
    if rom and ud.numeric(char, None) is not None:
        rom = rom.replace('⁄', '/')
        # make sure to add a space for 23½ -> 23 1/2
        if char_position >= 1 and ud.numeric(full_string[char_position - 1], None) is not None:
            rom = ' ' + rom
        if (char_position + 1 < len(full_string)) \
                and ud.numeric(full_string[char_position + 1], None) is not None:
            rom += ' '
    return rom
def add_romanization(self, **args):
    """Adds romanization edges to the romanization lattice.

    For every start position, spans are extended to the right for as long as the span text
    is flagged as 's-prefix' in self.uroman.dict_bool. Each span with a top romanization gets
    an edge, after upper-casing (single Braille characters marked 'is-upper'), stripping of a
    leading '+' for tail romanizations such as '+ng' (annotated 'rom tail'), default abugida
    vowel handling and expansion with special characters.
    In addition, each character gets a Hangul edge (U+AC00..U+D7A3) and/or a
    'rom decomp' edge from self.decomp_rom() where applicable.
    Keyword arguments are forwarded to self.expand_rom_with_special_chars()."""
    # 'recursive' always has to be passed on (default: False). Merging it into the forwarded
    # keyword arguments avoids "got multiple values for keyword argument 'recursive'"
    # when the caller supplies 'recursive' itself.
    expand_args = {**args, 'recursive': args.get('recursive', False)}
    for start in range(self.max_vertex):
        for end in range(start+1, self.max_vertex+1):
            if not self.uroman.dict_bool[('s-prefix', self.s[start:end])]:
                break
            if (rom := self.simple_top_romanization_candidate_for_span(start, end)) is None:
                continue
            if self.contains_script['Braille'] and (start+1 == end) \
                    and self.props.get(('is-upper', start)):
                rom = rom.upper()
            edge_annotation = 'rom'
            if regex.match(r'\+(m|ng|n|h|r)', rom):
                rom, edge_annotation = rom[1:], 'rom tail'
            rom = self.add_default_abugida_vowel(rom, start, end, annotation=edge_annotation)
            # the expansion may widen the span, hence the separate start2/end2
            rom, start2, end2, exp_edge_annotation \
                = self.expand_rom_with_special_chars(rom, start, end, annotation=edge_annotation,
                                                     **expand_args)
            edge_annotation = exp_edge_annotation or edge_annotation
            self.add_edge(Edge(start2, end2, rom, edge_annotation))
        if start < len(self.s):
            char = self.s[start]
            cp = ord(char)
            # Korean Hangul characters
            if 0xAC00 <= cp <= 0xD7A3:
                if rom := self.uroman.unicode_hangul_romanization(char):
                    self.add_edge(Edge(start, start+1, rom, 'rom'))
            # character decomposition
            if rom_decomp := self.decomp_rom(start):
                self.add_edge(Edge(start, start + 1, rom_decomp, 'rom decomp'))
@staticmethod
def update_edge_list(edges, new_edge, old_edges) -> List[NumEdge]:
    """Return a copy of edges in which the members of old_edges are replaced by new_edge.

    new_edge takes the place of the first replaced edge; further replaced edges are dropped.
    If none of the edges is in old_edges, new_edge is appended at the end.
    Side effect: every edge of edges that is in old_edges is set to active = False."""
    updated = []
    new_edge_placed = False
    for candidate in edges:
        if candidate not in old_edges:
            updated.append(candidate)
            continue
        candidate.active = False
        if not new_edge_placed:
            updated.append(new_edge)
            new_edge_placed = True
    if not new_edge_placed:
        updated.append(new_edge)
    return updated
@staticmethod
def edge_is_digit(edge: Edge | None) -> bool:
    """Check whether edge is a NumEdge of type 'digit' with an int value 0..9 spanning one character."""
    if not isinstance(edge, NumEdge):
        return False
    if (edge.value is None) or not isinstance(edge.value, int):
        return False
    if edge.type != 'digit':
        return False
    return (0 <= edge.value <= 9) and (edge.end - edge.start == 1)
@staticmethod
def is_gap_null_edge(edge: Edge) -> bool:
    """Check whether edge is a NumEdge whose original text is one of the zero characters 零 or 〇."""
    if not isinstance(edge, NumEdge):
        return False
    return edge.orig_txt in ('零', '〇')
@staticmethod
def braille_digit(char: str) -> str | None:
    """Map a single Braille digit character to its ASCII digit ('0'..'9').

    Returns None for anything else, including the empty string and multi-character strings."""
    braille_digits = '\u281A\u2801\u2803\u2809\u2819\u2811\u280B\u281B\u2813\u280A'  # Braille 0-9
    # str.find() reports position 0 for '' and also matches longer substrings,
    # so only single characters are looked up.
    if len(char) != 1:
        return None
    position = braille_digits.find(char)
    return str(position) if position >= 0 else None
def add_braille_number(self, start: int, end: int, txt: str, **_args) -> None:
    """Add a NumEdge of type 'number' with text txt for the span start-end to the lattice."""
    number_edge = NumEdge(start, end, txt, self.uroman)
    number_edge.type = 'number'
    self.add_edge(number_edge)
def add_braille_numbers(self, **_args):
    """Find Braille numbers in self.s and add one number edge for each.

    A number starts at a number mark (U+283C) and continues over Braille digits,
    periods (U+2832) and commas (U+2802). It ends before the first other character,
    or at the end of the string. The edge span includes the number mark.
    Does nothing unless self.contains_script['Braille'] is set."""
    if not self.contains_script['Braille']:
        return
    text = self.s
    number_txt, number_start = '', None
    for position, char in enumerate(text):
        if char == '\u283C':  # number mark
            if number_start is None:
                number_start = position
            continue
        if number_start is None:
            continue  # not inside a number
        if digit := self.braille_digit(char):
            number_txt += digit
        elif char == '\u2832':  # period
            number_txt += '.'
        elif char == '\u2802':  # comma
            number_txt += ','
        else:
            self.add_braille_number(number_start, position, number_txt)
            number_txt, number_start = '', None
    if number_start is not None:
        self.add_braille_number(number_start, len(text), number_txt)
def add_numbers(self, uroman, **args):
"""Adds numerical romanization edges (NumEdge) to the romanization lattice.
Works in successive passes over the number edges collected so far:
(start) one NumEdge per character for which uroman.num_props[char] is set
D1 joins a digit edge with the digit edges to its right, allowing one '.' followed by a digit (1234, 3.14)
G1 multiplies a base-1 integer with a following edge of higher base that is not a large power (2*100=200)
G2 adds up neighboring edges while value and base decrease (200+30+4=234)
G3 multiplies a block with a following large power (234*1000=234000)
G4 adds up such blocks (234000+567=234567)
F1 prefixes a separator to a number edge that directly follows text ending in a digit
Afterwards selected edges are deactivated, and characters whose best edge is not a NumEdge get a plain
'num' edge if ud_numeric() yields a single digit for a character with DIGIT in its name.
Keyword argument 'verbose': print number edges as they are built."""
verbose = bool(args.get('verbose'))
s = self.s
num_edges = []
# Initial pass: single-character number edges
for start in range(len(s)):
char = s[start]
if uroman.num_props[char]:
new_edge = NumEdge(start, start + 1, char, uroman)
num_edges.append(new_edge)
if verbose:
print('NumEdge', new_edge)
self.add_edge(new_edge)
# D1 sequence of digits 1234
# Note for D1 and all G passes: num_edges is rebound to a new list inside the loop, but the for-loop
# keeps iterating over the list it started with; merged edges are skipped later via their active flag.
for edge in num_edges:
if self.edge_is_digit(edge) and edge.active: # and (edge.value != 0):
n_decimal_points = 0
n_decimals = None
new_value_s = str(edge.value)
sub_edges = [edge]
prev_edge = edge
while True:
right_edge = self.best_right_neighbor_edge(prev_edge.end)
if self.edge_is_digit(right_edge):
sub_edges.append(right_edge)
new_value_s += str(right_edge.value)
if n_decimals is not None:
n_decimals += 1
prev_edge = right_edge
# a single period that is directly followed by a digit is read as decimal point
elif ((prev_edge.end < len(s)) and (s[prev_edge.end] == '.') and (n_decimal_points == 0)
and (right_edge2 := self.best_right_neighbor_edge(prev_edge.end + 1))
and self.edge_is_digit(right_edge2)):
if right_edge is None:
right_edge = Edge(prev_edge.end, prev_edge.end+1, s[prev_edge.end],
'decimal period')
self.add_edge(right_edge)
sub_edges.append(right_edge)
sub_edges.append(right_edge2)
new_value_s += '.' + str(right_edge2.value)
n_decimal_points += 1
n_decimals = 1
prev_edge = right_edge2
else:
break
if len(sub_edges) >= 2:
new_value = float(new_value_s) if '.' in new_value_s else int(new_value_s)
new_edge = NumEdge(sub_edges[0].start, sub_edges[-1].end, str(new_value), uroman, active=True)
new_edge.update(value=new_value, value_s=new_value_s, n_decimals=n_decimals, num_base=1,
e_type='D1', script=sub_edges[-1].script)
self.add_edge(new_edge)
num_edges = self.update_edge_list(num_edges, new_edge, sub_edges)
if verbose:
print(new_edge.type, new_edge)
# G1 combine (*) "single digits" 2*100=200, 3*10= 30
for edge in num_edges:
if (isinstance(edge, NumEdge) and edge.active and (edge.num_base == 1)
and isinstance(edge.value, int) and (edge.value >= 1)):
right_edge = self.best_right_neighbor_edge(edge.end, skip_num_edge=False)
if (right_edge
and isinstance(right_edge, NumEdge)
and right_edge.active
and isinstance(right_edge.value, int)
and (right_edge.num_base > 1)
and (not right_edge.is_large_power)):
new_value = edge.value * right_edge.value
new_edge = NumEdge(edge.start, right_edge.end, str(new_value), uroman, active=True)
new_edge.update(value=new_value, num_base=right_edge.num_base, e_type='G1',
orig_txt=edge.orig_txt + right_edge.orig_txt,
script=right_edge.script)
self.add_edge(new_edge)
num_edges = self.update_edge_list(num_edges, new_edge, [edge, right_edge])
if verbose:
print(new_edge.type, new_edge)
# G2 combine (+) G1 "single digits" 200+30+4=234 (within larger blocks of 1000, 1000000)
for edge in num_edges:
if isinstance(edge, NumEdge) and edge.active and isinstance(edge.value, int) and not edge.is_large_power:
sub_edges = [edge]
prev_edge = edge
prev_non_edge = edge # None if (edge.orig_txt in '零') else prev_edge
# prev_non_edge is the last collected edge that is not a gap null edge (零, 〇);
# the next edge must be smaller in both value and base, unless prev_non_edge itself is a gap null edge
while (prev_edge
and (right_edge := self.best_right_neighbor_edge(prev_edge.end, skip_num_edge=False))
and isinstance(right_edge, NumEdge)
and right_edge.active
and isinstance(right_edge.value, int)
and (not right_edge.is_large_power)
and (self.is_gap_null_edge(prev_non_edge)
or ((prev_non_edge.num_base > right_edge.value)
and (prev_non_edge.num_base > right_edge.num_base)))):
sub_edges.append(right_edge)
prev_edge = right_edge
if not self.is_gap_null_edge(right_edge):
prev_non_edge = right_edge
if len(sub_edges) >= 2:
new_value = sum([e.value for e in sub_edges])
new_edge = NumEdge(sub_edges[0].start, sub_edges[-1].end, str(new_value), uroman, active=True)
new_edge.update(value=new_value, num_base=sub_edges[-1].num_base, e_type='G2',
orig_txt=''.join([e.orig_txt for e in sub_edges]),
script=sub_edges[-1].script)
self.add_edge(new_edge)
num_edges = self.update_edge_list(num_edges, new_edge, sub_edges)
# NOTE(review): presumably redundant with e_type='G2' in update() above - TODO confirm
new_edge.type = 'G2'
if verbose:
print(new_edge.type, new_edge)
# G3 combine (*) G2 blocks with large powers, e.g. 234*1000 = 234000
for edge in num_edges:
if (isinstance(edge, NumEdge) and edge.active and (not edge.is_large_power)
and (isinstance(edge.value, int) or isinstance(edge.value, float))):
right_edge = self.best_right_neighbor_edge(edge.end, skip_num_edge=False)
if (right_edge
and isinstance(right_edge, NumEdge)
and right_edge.active
and isinstance(right_edge.value, int)
and (right_edge.num_base > 1)
and right_edge.is_large_power):
# rounding limits float noise of products such as 1.5 * 1000000; whole floats become ints
new_value = round(edge.value * right_edge.value, 5)
if isinstance(new_value, float) and new_value.is_integer():
new_value = int(new_value)
new_edge = NumEdge(edge.start, right_edge.end, str(new_value), uroman, active=True)
new_edge.update(value=new_value, num_base=right_edge.num_base, e_type='G3',
orig_txt=edge.orig_txt + right_edge.orig_txt,
script=right_edge.script)
self.add_edge(new_edge)
num_edges = self.update_edge_list(num_edges, new_edge, [edge, right_edge])
if verbose:
print(new_edge.type, new_edge)
# G4 combine (+) G3 blocks 234000+567=234567
for edge in num_edges:
if isinstance(edge, NumEdge) and edge.active and isinstance(edge.value, int):
sub_edges = [edge]
while ((prev_edge := sub_edges[-1])
and (right_edge := self.best_right_neighbor_edge(prev_edge.end, skip_num_edge=False))
and isinstance(right_edge, NumEdge)
and right_edge.active
and isinstance(right_edge.value, int)
and (prev_edge.num_base > right_edge.value)
and (prev_edge.num_base > right_edge.num_base)):
# CJK: a single digit 1-9 right after a power of ten >= 1000 is taken as a multiple
# of the next lower power of ten; right_edge is modified in place
if ((prev_edge.script == 'CJK')
and (prev_edge.num_base >= 1000)
and ('tag' not in prev_edge.type)
and regex.match('10+$', str(prev_edge.num_base))
and (1 <= right_edge.value <= 9)
and (right_edge.start + 1 == right_edge.end)):
new_num_base = prev_edge.num_base // 10
new_value = new_num_base * right_edge.value
# print('DIGIT TAG', prev_edge, right_edge, new_value)
right_edge.value = new_value
right_edge.num_base = new_num_base
right_edge.type = 'G4tag'
sub_edges.append(right_edge)
if len(sub_edges) >= 2:
new_value = sum([e.value for e in sub_edges])
new_edge = NumEdge(sub_edges[0].start, sub_edges[-1].end, str(new_value), uroman, active=True)
new_edge.update(value=new_value, num_base=sub_edges[-1].num_base, e_type='G4',
orig_txt=''.join([e.orig_txt for e in sub_edges]),
script=sub_edges[-1].script)
self.add_edge(new_edge)
num_edges = self.update_edge_list(num_edges, new_edge, sub_edges)
if verbose:
print(new_edge.type, new_edge)
# F1
for edge in num_edges:
# cushion fractions with spaces as needed: e.g. 23½ -> 23 1/2 or 十一五 -> 11 5
if isinstance(edge, NumEdge) and regex.match(r'\d', edge.txt):
left_edge = self.best_left_neighbor_edge(edge.start)
if left_edge and regex.search(r'\d$', left_edge.txt):
if edge.fraction:
sep = ' '
else:
sep = '·'
edge.txt = sep + edge.txt
# Deactivate single-character values above 1000 and a fixed list of original texts
# NOTE(review): "in '兩參参伍陆陸什'" is a substring test, so an empty orig_txt would match as well
for edge in num_edges:
if (isinstance(edge, NumEdge) and edge.active and (edge.value is not None)
and (((edge.value > 1000) and (edge.start + 1 == edge.end))
or (edge.orig_txt in '兩參参伍陆陸什')
or (edge.orig_txt in ('京兆', )))):
edge.active = False
if verbose: # or (num_edges and any([e.type in ['G1', 'G2', 'G3', 'G4'] for e in num_edges])):
if num_edges:
print('actives:')
for num_edge in num_edges:
print(num_edge)
# Fallback for characters whose best single-character edge is not a NumEdge:
# digits 0-9 get a plain 'num' edge; other numeric characters are only counted in uroman.stats
for start in range(len(s)):
start_char = s[start]
if (best_edge := self.best_edge_in_span(start, start+1)) and isinstance(best_edge, NumEdge):
continue
if (num := ud_numeric(start_char)) is not None:
name = self.uroman.chr_name(start_char)
if ("DIGIT" in name) and isinstance(num, int) and (0 <= num <= 9):
# if start_char not in '0123456789': print('DIGIT', s[start], num, name)
self.add_edge(Edge(start, start + 1, str(num), 'num'))
else:
uroman.stats[('*NUM', start_char, num)] += 1
def add_rom_fall_back_singles(self, **_args):
    """Add a fallback edge for every single character that has no edge in the lattice yet.

    The fallback is, in this order of precedence: an empty string for nonspacing marks ('Mn'),
    format characters ('Cf') and private-use characters ('Co'); the space itself ('orig');
    the top single-character romanization, without a leading '+' of tail romanizations
    ('rom single'); or else the original character ('orig')."""
    for start in range(self.max_vertex):
        end = start + 1
        orig_char = self.s[start]
        if self.lattice[(start, end)]:
            continue  # position is already covered by some edge
        if self.uroman.char_is_nonspacing_mark(orig_char):
            rom, edge_annotation = '', 'Mn'
        elif self.uroman.char_is_format_char(orig_char):  # e.g. zero-width non-joiner, zero-width joiner
            rom, edge_annotation = '', 'Cf'
        elif ud.category(orig_char) == 'Co':
            rom, edge_annotation = '', 'Co'
        elif orig_char == ' ':
            rom, edge_annotation = orig_char, 'orig'
        else:
            top_rom = self.simple_top_romanization_candidate_for_span(start, end)
            if top_rom is None:
                rom, edge_annotation = orig_char, 'orig'
            else:
                rom = top_rom[1:] if regex.match(r'\+(m|ng|n|h|r)', top_rom) else top_rom
                edge_annotation = 'rom single'
        self.add_edge(Edge(start, end, rom, edge_annotation))
@staticmethod
def add_new_edge(old_edges: List[Edge], start: int, end: int, new_rom: str, new_type: str, position: int | None,
                 old_edge_dict: dict) \
        -> None:
    """Insert a new edge into old_edges unless an edge with the same span and text is already known.

    old_edge_dict maps (start, end, txt) to edges and is updated along with old_edges.
    The new edge is placed right after index position, or at the end if position is None."""
    edge_key = (start, end, new_rom)
    if edge_key in old_edge_dict:
        return
    alt_edge = Edge(start, end, new_rom, new_type)
    insert_index = len(old_edges) if position is None else position + 1
    old_edges.insert(insert_index, alt_edge)
    old_edge_dict[edge_key] = alt_edge
def add_alternatives(self, old_edges: List[Edge]) -> None:
    """Extend old_edges in place with alternative romanizations of its edges.

    For each original edge and each valid romanization rule for the edge's source text:
      'rom-alt'   the rule's 't-alts', if the edge text is the rule's main romanization 't'
      'rom-alt2'  the rule's 't-at-end-of-syllable', if the edge text is 't'
      'rom-alt3'  the rule's 't', if the edge text is 't-at-end-of-syllable'
    Alternatives are inserted right after their original edge; edges whose type starts with
    'rom-alt' are alternatives themselves and are not expanded any further."""
    old_edge_dict = {(edge.start, edge.end, edge.txt): edge for edge in old_edges}
    # old_edges grows during the iteration; inserted edges are skipped by the type check
    for position, old_edge in enumerate(old_edges):
        if old_edge.type.startswith('rom-alt'):
            continue  # not old
        start, end = old_edge.start, old_edge.end
        old_rom = old_edge.txt
        for rom_rule in self.uroman.rom_rules[self.s[start:end]]:
            rom_t = rom_rule['t']
            if not self.cand_is_valid(rom_rule, start, end, rom_t):
                continue
            rom_alts = rom_rule['t-alts']
            rom_eosyl = rom_rule['t-at-end-of-syllable']
            if rom_t == old_rom:
                for rom_alt in rom_alts or ():
                    self.add_new_edge(old_edges, start, end, rom_alt, 'rom-alt', position, old_edge_dict)
                if rom_eosyl:
                    # The alternative to the main romanization is the end-of-syllable variant.
                    # (Offering rom_t here could never add anything: it equals old_rom,
                    # which is already registered in old_edge_dict.)
                    self.add_new_edge(old_edges, start, end, rom_eosyl, 'rom-alt2', position, old_edge_dict)
            if rom_eosyl == old_rom:
                self.add_new_edge(old_edges, start, end, rom_t, 'rom-alt3', position, old_edge_dict)
def all_edges(self, start: int, end: int) -> List[Edge]:
    """Return all lattice edges that lie completely inside the span start-end.

    Edges are listed by ascending start position and, per start position, by descending
    end position."""
    result = []
    for start2 in range(start, end):
        for end2 in sorted(self.lattice[(start2, 'right')], reverse=True):
            # End positions come in descending order, so an edge reaching beyond end must be
            # skipped, not used as stop signal: the shorter edges after it may still fit.
            if end2 > end:
                continue
            result.extend(self.lattice[(start2, end2)])
    return result
def best_edge_in_span(self, start: int, end: int, skip_num_edge: bool = False) -> Edge | None:
    """Return the preferred edge among the lattice edges that span exactly start-end.

    Order of preference:
      (A) the first active NumEdge (NumEdges are ignored altogether if skip_num_edge is set)
      (B) the first edge whose type starts with 'rom' or 'num', other than 'rom decomp'
      (C) the first edge whose type starts with 'rom decomp'
      (D) the first edge of any other type
    Returns None if no edge qualifies."""
    first_by_plan = {}
    for edge in self.lattice[(start, end)]:
        if isinstance(edge, NumEdge):
            if skip_num_edge:
                continue
            if edge.active:
                return edge  # plan A
            # an inactive NumEdge is classified by its type like any other edge
        if edge.type.startswith('rom decomp'):
            plan = 'C'
        elif regex.match(r'(?:rom|num)', edge.type):
            plan = 'B'
        else:
            plan = 'D'
        first_by_plan.setdefault(plan, edge)
    return first_by_plan.get('B') or first_by_plan.get('C') or first_by_plan.get('D')
def best_right_neighbor_edge(self, start: int, skip_num_edge: bool = False) -> Edge | None:
    """Return the best edge that begins at start, trying the longest spans first (None if there is none)."""
    ends_longest_first = sorted(self.lattice[(start, 'right')], reverse=True)
    # lazily evaluated: spans after the first hit are not looked at
    best_per_span = (self.best_edge_in_span(start, end, skip_num_edge=skip_num_edge)
                     for end in ends_longest_first)
    return next((edge for edge in best_per_span if edge), None)
def best_left_neighbor_edge(self, end: int, skip_num_edge: bool = False) -> Edge | None:
    """Return the best edge that stops at end, trying the longest spans first (None if there is none)."""
    starts_longest_first = sorted(self.lattice[(end, 'left')])
    # lazily evaluated: spans after the first hit are not looked at
    best_per_span = (self.best_edge_in_span(start, end, skip_num_edge=skip_num_edge)
                     for start in starts_longest_first)
    return next((edge for edge in best_per_span if edge), None)
def best_rom_edge_path(self, start: int, end: int, skip_num_edge: bool = False) -> List[Edge]:
    """Return the best edge path through the romanization lattice from start towards end.

    The path is built greedily from left to right with the best edge at each position,
    and includes non-romanized pieces such as ASCII and non-ASCII punctuation.
    Positions without any edge are stepped over."""
    path = []
    position = start
    while position < end:
        next_edge = self.best_right_neighbor_edge(position, skip_num_edge=skip_num_edge)
        if not next_edge:  # should not happen
            position += 1
            continue
        path.append(next_edge)
        position = next_edge.end
    return path
def find_rom_edge_path_backwards(self, start: int, end: int, min_char: int | None = None,
                                 return_str: bool = False, skip_num_edge: bool = False) -> List[Edge] | str:
    """Finds a partial best path on the left from a start position to provide left contexts for
    romanization rules. Can return a string or a list of edges. Is typically used for a short context,
    as specified by min_char.

    The path is built from right to left, beginning at end and not reaching beyond start.
    With min_char, the search stops as soon as the collected romanization has at least
    min_char characters. Positions without any edge are stepped over.
    Returns the romanization string if return_str is set, otherwise the edges from left to right."""
    edges_right_to_left = []
    rom = ''
    end2 = end
    while start < end2:
        old_end2 = end2
        if new_edge := self.best_left_neighbor_edge(end2, skip_num_edge=skip_num_edge):
            edges_right_to_left.append(new_edge)
            rom = new_edge.txt + rom
            end2 = new_edge.start
            if min_char and len(rom) >= min_char:
                break
        # Step one position to the left only if no edge moved us left. Stepping after regular
        # progress as well would skip the character directly left of every edge found.
        if end2 >= old_end2:
            end2 = old_end2 - 1
    if return_str:
        return rom
    edges_right_to_left.reverse()
    return edges_right_to_left
@staticmethod
def edge_path_to_surf(edges) -> str:
    """Return the surface string of an edge path, i.e. the concatenation of the edges' txt values."""
    return ''.join(edge.txt for edge in edges)
# @timer
def main():
"""This function provides a user interface, either using argparse for a command line interface,
or providing direct function calls.
First, a uroman object will have to be created, loading uroman data (directory must be provided,
listed as default). This only needs to be done once.
After that you can romanize from file to file, or just romanize a string."""
# Compute data_dir based on the location of this executable script.
src_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.dirname(src_dir)
data_dir = os.path.join(root_dir, "data")
# print(src_dir, root_dir, data)
parser = argparse.ArgumentParser()
parser.add_argument('direct_input', nargs='*', type=str)
parser.add_argument('--data_dir', type=Path, default=data_dir, help='uroman resource dir')
parser.add_argument('-i', '--input_filename', type=str, help='default: sys.stdin')
parser.add_argument('-o', '--output_filename', type=str, help='default: sys.stdout')
parser.add_argument('-l', '--lcode', type=str, default=None,
help='ISO 639-3 language code, e.g. eng')
# parser.add_argument('-f', '--rom_format', type=RomFormat, default=RomFormat.STR, help:'alt: RomFormat.EDGES')
parser.add_argument('-f', '--rom_format', type=RomFormat, default=RomFormat.STR,
choices=list(RomFormat), help="Output format of romanization. 'edges' provides offsets")
# The remaining arguments are mostly for development and test
parser.add_argument('--max_lines', type=int, default=None, help='limit uroman to first n lines')
parser.add_argument('--load_log', action='count', default=0, help='report load stats')
parser.add_argument('--test', action='count', default=0, help='perform/display a few tests')
parser.add_argument('-v', '--verbose', action='count', default=0)
parser.add_argument('--rebuild_ud_props', action='count', default=0,
help='rebuild UnicodeDataProps files (for development mode only)')
parser.add_argument('--rebuild_num_props', action='count', default=0,
help='rebuild NumProps file (for development mode only)')
parser.add_argument('--no_caching', action='count', default=0, help='for development mode: speed')
parser.add_argument('--silent', action='count', default=0, help='suppress ... progress')
parser.add_argument('-a', '--ablation', type=str, default='', help='for development mode: nocap')
parser.add_argument('--stats', action='count', default=0, help='for development mode: numbers')
parser.add_argument('--ignore_args', action='count', default=0, help='for usage illustration only')
# The profile output file is opened for writing by argparse; the value is available as args.profile.
parser.add_argument(PROFILE_FLAG, type=argparse.FileType('w', encoding='utf-8', errors='ignore'),
default=None, metavar='PROFILE-FILENAME', help='(optional output for performance analysis)')
args = parser.parse_args()
# copy selected (minor) args from argparse.Namespace to dict
args_dict = {'rom_format': args.rom_format, 'load_log': args.load_log, 'test': args.test, 'stats': args.stats,
'no_caching': args.no_caching, 'max_lines': args.max_lines, 'verbose': args.verbose,
'rebuild_ud_props': args.rebuild_ud_props, 'rebuild_num_props': args.rebuild_num_props,
'ablation': args.ablation, 'silent': args.silent}
pr = None
# cProfile is imported at the top of the file only if PROFILE_FLAG occurs in sys.argv.
if args.profile:
gc.enable()
gc.set_debug(gc.DEBUG_STATS)
# NOTE(review): gc.set_debug() sets the flags rather than adding to them, so this call replaces
# DEBUG_STATS from the line above; use gc.DEBUG_STATS | gc.DEBUG_LEAK if both are wanted.
gc.set_debug(gc.DEBUG_LEAK)
pr = cProfile.Profile()
pr.enable()
'''Sample calls:
uroman.py --help
uroman.py -i ../test/multi-script.txt -o ../test/multi-script-out2.txt
uroman.py < ../test/multi-script.txt > ../test/multi-script-out2.txt
uroman.py Игорь
uroman.py Игорь --lcode ukr
uroman.py ألاسكا 서울 Καλιφόρνια
uroman.py ちょっとまってください -f edges
uroman.py "महात्मा गांधी" -f lattice
uroman.py สวัสดี --load_log
uroman.py --test
uroman.py --ignore_args
uroman.py Բարեւ -o ../test/tmp-out.txt -f edges
# In double input cases such as in the line below,
# the input-file's romanization is sent to stdout, while the direct-input romanization is sent to stderr
uroman.py ⴰⵣⵓⵍ -i ../test/multi-script.txt > ../test/multi-script-out2.txt
'''
if args.ignore_args:
# minimal calls
uroman = Uroman(args.data_dir)
s, s2, s3, s4 = 'Игорь', 'ちょっとまってください', 'ka‍n‍ne', 'महात्मा गांधी'
print(s, uroman.romanize_string(s))
print(s, uroman.romanize_string(s, lcode='ukr'))
print(s2, Edge.json_str(uroman.romanize_string(s2, rom_format=RomFormat.EDGES)))
print(s3, Edge.json_str(uroman.romanize_string(s3, rom_format=RomFormat.EDGES)))
print(s4, Edge.json_str(uroman.romanize_string(s4, rom_format=RomFormat.LATTICE)))
# Note that ../test/multi-script.txt has several lines starting with ::lcode eng etc.
# This allows users to select specific language codes to specific lines, overwriting the overall --lcodes
uroman.romanize_file(input_filename='../test/multi-script.txt',
output_filename='../test/multi-script-out3.txt')
else:
# build a Uroman object (once for many applications and different scripts and languages)
uroman = Uroman(args.data_dir, load_log=args.load_log, rebuild_ud_props=args.rebuild_ud_props,
rebuild_num_props=args.rebuild_num_props)
# File mode (file/stdin to file/stdout) applies if a filename was given,
# or if none of the other tasks (direct input, test, rebuild) was requested.
romanize_file_p = (args.input_filename or args.output_filename
or not (args.direct_input or args.test or args.ignore_args
or args.rebuild_ud_props or args.rebuild_num_props))
# Romanize any positional arguments, interpreted as strings to be romanized.
for s in args.direct_input:
result = uroman.romanize_string(s.rstrip(), lcode=args.lcode, **args_dict)
result_json = Edge.json_str(result)
if romanize_file_p:
# input from both file/stdin (to file/stdout) and direct-input (to stderr)
if args.input_filename:
sys.stderr.write(result_json + '\n')
# input from direct-input (but not from file/stdin) to stdout
# else pass
# no file/stdin or file/stdout, so we write romanization of direct-input to stdout
else:
print(result_json)
# If provided, apply romanization to an entire file.
if romanize_file_p:
uroman.romanize_file(args.input_filename, args.output_filename, lcode=args.lcode,
direct_input=args.direct_input, **args_dict)
if args.test:
uroman.test_output_of_selected_scripts_and_rom_rules()
uroman.test_romanization()
# report (at most) the first 100 entries of uroman.stats
if uroman.stats and args.stats:
stats100 = {k: uroman.stats[k] for k in list(dict(uroman.stats))[:100]}
sys.stderr.write(f'Stats: {stats100} ...\n')
# Write the profile, sorted by time, to the file given with the profile option.
if args.profile:
if pr:
pr.disable()
ps = pstats.Stats(pr, stream=args.profile).sort_stats(pstats.SortKey.TIME)
ps.print_stats()
print(gc.get_stats())
# Run the command line interface only if this file is executed as a script (not when it is imported).
if __name__ == "__main__":
main()