# (removed garbled extraction artifacts that preceded the shebang)
#!/usr/bin/env python | |
""" | |
Written by Ulf Hermjakob, USC/ISI March-April 2024 | |
uroman is a universal romanizer. It converts text in any script to the Latin alphabet. | |
This script is a Python reimplementation of an earlier Perl script, with some improvements. | |
The tool has been tested on 250 languages, with 100 or more sentences each. | |
This script is still under development and large-scale testing. Feedback welcome. | |
This script provides token-size caching (for faster runtimes). | |
Output formats include | |
(1) best romanization string | |
(2) best romanization edges ("best path"; incl. start and end positions with respect to the original string) | |
(3) best romanization with alternatives (as applicable for ambiguous romanization) | |
(4) best romanization full lattice (all edges, including superseded sub-edges) | |
See below for 'sample calls' under main() | |
""" | |
from __future__ import annotations | |
import argparse
from collections import defaultdict
# from memory_profiler import profile
import datetime
from enum import Enum
from fractions import Fraction
import functools
import gc
import json
import math
import os
import pathlib
from pathlib import Path
import pstats
import regex
import sys
from typing import List, Tuple
import unicodedata as ud
PROFILE_FLAG = "--profile" # also used in argparse processing | |
if PROFILE_FLAG in sys.argv: | |
import cProfile | |
# UTILITIES | |
def timer(func):
    """Decorator that logs a function's call, start/end times, and wall-clock duration.

    Fix: uses functools.wraps so the decorated function keeps its
    __name__/__doc__ metadata (the original wrapper hid them)."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = datetime.datetime.now()
        print(f"Calling: {func.__name__}{args}")
        print(f"Start time: {start_time:%A, %B %d, %Y at %H:%M}")
        result = func(*args, **kwargs)
        end_time = datetime.datetime.now()
        time_diff = (end_time-start_time).total_seconds()
        print(f"End time: {end_time:%A, %B %d, %Y at %H:%M}")
        print(f"Duration: {time_diff} seconds")
        return result
    return wrapper
def slot_value_in_double_colon_del_list(line: str, slot: str, default: str | list | None = None) -> str | list | None:
    """For a given slot, e.g. 'cost', get its value from a line such as '::s1 of course ::s2 ::cost 0.3' -> 0.3
    The value can be an empty string, as for ::s2 in the example above."""
    match = regex.match(fr'(?:.*\s)?::{slot}(|\s+\S.*?)(?:\s+::\S.*|\s*)$', line)
    if match:
        return match.group(1).strip()
    return default
def has_value_in_double_colon_del_list(line: str, slot: str) -> bool:
    """True iff ::slot occurs in line (its value may be the empty string)."""
    value = slot_value_in_double_colon_del_list(line, slot)
    return isinstance(value, str)
def dequote_string(s: str) -> str:
    """Strip one pair of matching surrounding quotes ('', "" or “”), if present; otherwise return s unchanged."""
    if not isinstance(s, str):
        return s
    m = regex.match(r'''\s*(['"“])(.*)(['"”])\s*$''', s)
    if m and ((m.group(1) + m.group(3)) in ("''", '""', '“”')):
        return m.group(2)
    return s
def last_chr(s: str) -> str:
    """Return the last character of s, or '' for an empty string.

    Fix: the original's else-branch evaluated '' without returning it,
    so empty input yielded None instead of the documented str."""
    return s[-1] if s else ''
def ud_numeric(char: str) -> int | float | None:
    """Numeric value of a character per Unicode; whole numbers come back as int,
    other values as float, and non-numeric characters as None."""
    try:
        value = ud.numeric(char)
    except (ValueError, TypeError):
        return None
    return int(value) if value.is_integer() else value
def robust_str_to_num(num_s: str, filename: str = None, line_number: int | None = None, silent: bool = False) \
        -> int | float | None:
    """Convert a string to int (or float if it contains '.'); pass numbers through unchanged.

    On failure returns None and, unless silent, writes a diagnostic to stderr.
    Fix: the diagnostic now actually interpolates the filename (the original
    f-string contained no placeholder)."""
    if isinstance(num_s, str):
        try:
            return float(num_s) if "." in num_s else int(num_s)
        except ValueError:
            if not silent:
                sys.stderr.write(f'Cannot convert "{num_s}" to a number')
                if line_number:
                    sys.stderr.write(f' line: {line_number}')
                if filename:
                    sys.stderr.write(f' file: {filename}')
                sys.stderr.write('\n')
    elif isinstance(num_s, (float, int)):
        return num_s
    return None
def first_non_none(*args):
    """Return the first argument that is not None, or None if there is no such argument."""
    return next((arg for arg in args if arg is not None), None)
def any_not_none(*args) -> bool:
    """True iff at least one argument is not None."""
    return any(arg is not None for arg in args)
def add_non_none_to_dict(d: dict, key: str, value) -> None:
    """Store value under key in d, silently skipping None values."""
    if value is None:
        return
    d[key] = value
def fraction_char2fraction(fraction_char: str, fraction_value: float | None = None,
                           uroman: Uroman | None = None) -> Fraction | None:
    """Convert a vulgar-fraction character (e.g. '½') to a Fraction (e.g. Fraction(1, 2)).

    First tries the character's Unicode decomposition (e.g. '<fraction> 0031 2044 0032');
    if that yields nothing and a Uroman instance plus a numeric value are given,
    falls back to mapping the float value back to a numerator/denominator pair.
    Returns None if neither source yields a fraction."""
    s = ''
    fraction = None
    # Rebuild the decomposition as a string, turning hex codepoint tokens back into
    # characters; non-codepoint tokens such as '<fraction>' are appended verbatim.
    for ud_decomp_elem in ud.decomposition(fraction_char).split():
        try:
            s += chr(int(ud_decomp_elem, 16))
        except ValueError:
            s += ud_decomp_elem
    # NOTE: the slash in the pattern is U+2044 FRACTION SLASH, not the ASCII '/'.
    if m := regex.match(r'<fraction>(\d+)⁄(\d+)$', s):
        numerator_s, denominator_s = m.group(1, 2)
        try:
            fraction = Fraction(int(numerator_s), int(denominator_s))
        except ValueError:
            fraction = None
    # Fallback: derive a (numerator, denominator) pair from the numeric value.
    if (fraction is None) and uroman and fraction_value:
        if num_denom := uroman.unicode_float2fraction(fraction_value):
            try:
                fraction = Fraction(num_denom[0], num_denom[1])
            except ValueError:
                fraction = None
    return fraction
def chr_name(char: str) -> str:
    """robust version of ud.name; see related Uroman.char_name() that includes names not included in UnicodeData.txt"""
    try:
        # ud.name's second argument is returned for unnamed codepoints (instead of ValueError)
        return ud.name(char, '')
    except TypeError:
        # non-character input, e.g. a multi-char string or a non-str
        return ''
def args_get(key: str, args: argparse.Namespace | None = None):
    """Look up key in an argparse namespace; None if the key is absent or args is None."""
    if args and (key in args):
        return vars(args)[key]
    return None
class DictClass:
    """Lightweight attribute container used for uroman data records.

    Keyword arguments whose value is None, [] or False (or equal to them, e.g. 0)
    are dropped; underscores in keyword names are stored as hyphens.
    Supports dict-style item access (missing keys yield None) and is truthy
    iff it holds at least one attribute."""
    def __init__(self, **kw_args):
        for key, value in kw_args.items():
            if value not in (None, [], False):
                self.__dict__[key.replace('_', '-')] = value

    def __repr__(self):
        return str(self.__dict__)

    def __getitem__(self, key, default=None):
        return self.__dict__.get(key, default)

    def __bool__(self):
        return bool(self.__dict__)
class RomRule(DictClass):
    """A single romanization rule, stored in Uroman.rom_rules keyed by its source string.

    Typical attributes: s (source), t (target), prov (provenance), lcodes (language codes),
    t_alts (target alternatives), num (numeric value), and positional restrictions such as
    use_only_at_start_of_word, dont_use_at_start_of_word, use_only_at_end_of_word,
    dont_use_at_end_of_word, use_only_for_whole_word."""
    pass
class Script(DictClass):
    """Description of a writing system, stored in Uroman.scripts keyed by lower-cased script name.

    Typical attributes: script_name, direction, abugida_default_vowels,
    alt_script_names, languages."""
    pass
class RomFormat(Enum):
    """Output format of romanization"""
    STR = 'str'          # simple string
    EDGES = 'edges'      # list of edges (includes character offsets in original string)
    ALTS = 'alts'        # lattice including alternative edges
    LATTICE = 'lattice'  # lattice including alternative and superseded edges

    def __str__(self):
        # argparse/print friendly: show the raw value, e.g. 'edges' rather than 'RomFormat.EDGES'
        return self.value
class Uroman:
    """This class loads and maintains uroman data independent of any specific text corpus.
    Typically, only a single instance will be used. (In contrast to multiple lattice instances, one per text.)
    Methods include some testing. And finally methods to romanize a string (romanize_string()) or an entire file
    (romanize_file())."""
def __init__(self, data_dir: Path, **args):  # args: load_log, rebuild_ud_props, rebuild_num_props
    """Initialize all lookup tables and caches, then load the resource files from data_dir."""
    self.data_dir = data_dir
    self.rom_rules = defaultdict(list)    # key: source string -> list of RomRule
    self.scripts = defaultdict(Script)    # key: lower-cased script name -> Script
    self.dict_bool = defaultdict(bool)    # misc boolean properties, keyed by (property, string)
    self.dict_str = defaultdict(str)      # misc string properties, keyed by (property, string)
    self.dict_int = defaultdict(int)      # misc integer properties
    self.dict_num = defaultdict(lambda: None)  # values are int (most common), float, or str ("1/2")
    # num_props key: txt
    # values: {"txt": "\u137b", "rom": "100", "value": 100, "type": "base", "mult": 1, "script": "Ethiopic"}
    self.num_props = defaultdict(dict)
    self.dict_set = defaultdict(set)
    self.float2fraction = {}  # caching
    # GC toggled off during the bulk load — presumably a load-speed optimization; verify before changing.
    gc.disable()
    self.load_resource_files(data_dir, args.get('load_log', False),
                             args.get('rebuild_ud_props', False),
                             args.get('rebuild_num_props', False))
    gc.enable()
    self.hangul_rom = {}   # cache: Hangul char -> romanization
    self.rom_cache = {}    # key: (s, lcode) value: t
    self.stats = defaultdict(int)  # stats, e.g. for unprocessed numbers
    self.abugida_cache = {}  # key: (script, char_rom) value: (base_rom, base_rom_plus_abugida_vowel, modified rom)
def second_rom_filter(self, c: str, rom: str, name: str | None) -> Tuple[str | None, str]:
    """Much of this code will eventually move the old Perl code to generate cleaner primary data"""
    # Only multi-word romanizations need this cleanup pass.
    if not (rom and (' ' in rom)):
        return None, name
    if name is None:
        name = self.chr_name(c)
    if "MYANMAR VOWEL SIGN KAYAH" in name:
        if m := regex.search(r'kayah\s+(\S+)\s*$', rom):
            return m.group(1), name
    if "MENDE KIKAKUI SYLLABLE" in name:
        if m := regex.search(r'm\d+\s+(\S+)\s*$', rom):
            return m.group(1), name
    # Multi-word rom with no specific fix: fall back to the original character.
    if regex.search(r'\S\s+\S', rom):
        return c, name
    return None, name
def load_rom_file(self, filename: str, provenance: str, file_format: str = None, load_log: bool = True):
    """Reads in and processes the 3 main romanization data files: (1) romanization-auto-table.txt
    which was automatically generated from UnicodeData.txt (2) UnicodeDataOverwrite.txt that "corrects"
    some entries in romanization-auto-table.txt and (3) romanization-table.txt which was largely manually
    created and allows complex romanization rules, some for specific languages, some for specific contexts.

    Fixes: (1) the stderr messages now interpolate the filename (the originals
    contained no placeholder); (2) the eval() over local-variable names has been
    replaced with an explicit key/value mapping."""
    n_entries = 0
    try:
        f = open(filename)
    except FileNotFoundError:
        sys.stderr.write(f'Cannot open file {filename}\n')
        return
    with f:
        for line_number, line in enumerate(f, 1):
            if line.startswith('#'):  # comment line
                continue
            if regex.match(r'^\s*$', line):  # blank line
                continue
            line = regex.sub(r'\s{2,}#.*$', '', line)  # strip trailing comment
            if file_format == 'u2r':
                # UnicodeDataOverwrite format: ::u <hex codepoint> ::r <rom> plus misc char properties
                t_at_end_of_syllable = None
                u = dequote_string(slot_value_in_double_colon_del_list(line, 'u'))
                try:
                    cp = int(u, 16)
                    s = chr(cp)
                except ValueError:
                    continue
                t = dequote_string(slot_value_in_double_colon_del_list(line, 'r'))
                if name := slot_value_in_double_colon_del_list(line, 'name'):
                    self.dict_str[('name', s)] = name
                if pic := slot_value_in_double_colon_del_list(line, 'pic'):
                    self.dict_str[('pic', s)] = pic
                if tone_mark := slot_value_in_double_colon_del_list(line, 'tone-mark'):
                    self.dict_str[('tone-mark', s)] = tone_mark
                if syllable_info := slot_value_in_double_colon_del_list(line, 'syllable-info'):
                    self.dict_str[('syllable-info', s)] = syllable_info
            else:
                # standard romanization-table format: ::s <source> ::t <target> plus restrictions
                s = dequote_string(slot_value_in_double_colon_del_list(line, 's'))
                t = dequote_string(slot_value_in_double_colon_del_list(line, 't'))
                t_at_end_of_syllable = dequote_string(slot_value_in_double_colon_del_list(line,
                                                                                         't-end-of-syllable'))
            if (num_s := slot_value_in_double_colon_del_list(line, 'num')) is not None:
                num = robust_str_to_num(num_s)
                self.dict_num[s] = (num_s if (num is None) else num)
            is_minus_sign = has_value_in_double_colon_del_list(line, 'is-minus-sign')
            is_plus_sign = has_value_in_double_colon_del_list(line, 'is-plus-sign')
            is_decimal_point = has_value_in_double_colon_del_list(line, 'is-decimal-point')
            is_large_power = has_value_in_double_colon_del_list(line, 'is-large-power')
            fraction_connector = slot_value_in_double_colon_del_list(line, 'fraction-connector')
            percentage_marker = slot_value_in_double_colon_del_list(line, 'percentage-marker')
            int_frac_connector = slot_value_in_double_colon_del_list(line, 'int-frac-connector')
            lcode_s = slot_value_in_double_colon_del_list(line, 'lcode')
            lcodes = regex.split(r'[,;]\s*', lcode_s) if lcode_s else []
            use_only_at_start_of_word = has_value_in_double_colon_del_list(line, 'use-only-at-start-of-word')
            dont_use_at_start_of_word = has_value_in_double_colon_del_list(line, 'dont-use-at-start-of-word')
            use_only_at_end_of_word = has_value_in_double_colon_del_list(line, 'use-only-at-end-of-word')
            dont_use_at_end_of_word = has_value_in_double_colon_del_list(line, 'dont-use-at-end-of-word')
            use_only_for_whole_word = has_value_in_double_colon_del_list(line, 'use-only-for-whole-word')
            num_s = slot_value_in_double_colon_del_list(line, 'num')
            num = robust_str_to_num(num_s, filename, line_number, silent=False)
            t_alt_s = slot_value_in_double_colon_del_list(line, 't-alt')
            t_alts = regex.split(r'[,;]\s*', t_alt_s) if t_alt_s else []
            t_alts = list(map(dequote_string, t_alts))
            t_mod, name2 = self.second_rom_filter(s, t, None)
            if t_mod and (t_mod != t):
                if t != s:
                    pass  # sys.stderr.write(f'UPDATE: {s} {name2} {t} -> {t_mod}\n')
                t = t_mod
            if s is not None:
                # Explicit mapping replaces the original eval() over local-variable names.
                for bool_key, bool_value in (('is-large-power', is_large_power),
                                             ('is-minus-sign', is_minus_sign),
                                             ('is-plus-sign', is_plus_sign),
                                             ('is-decimal-point', is_decimal_point)):
                    if bool_value:
                        self.dict_bool[(bool_key, s)] = True
                if any_not_none(t, num, is_minus_sign, is_plus_sign, is_decimal_point, is_large_power,
                                fraction_connector, percentage_marker, int_frac_connector):
                    self.register_s_prefix(s)
                    n_entries += 1
                    # if regex.match(r'[\u2800-\u28FF]', s): print("Braille", s, t)
                    restrictions = [lcodes, use_only_at_start_of_word, dont_use_at_start_of_word,
                                    use_only_at_end_of_word, dont_use_at_end_of_word, use_only_for_whole_word]
                    n_restrictions = len([restr for restr in restrictions if restr])
                    provenance2 = provenance
                    # a rule with a numeric value but no target is a pure number rule
                    if (t is None) and (num is not None) and (provenance2 == "rom"):
                        provenance2 = "num"
                    new_rom_rule = RomRule(s=s, t=t, prov=provenance2, lcodes=lcodes, t_alts=t_alts, num=num,
                                           use_only_at_start_of_word=use_only_at_start_of_word,
                                           dont_use_at_start_of_word=dont_use_at_start_of_word,
                                           use_only_at_end_of_word=use_only_at_end_of_word,
                                           dont_use_at_end_of_word=dont_use_at_end_of_word,
                                           use_only_for_whole_word=use_only_for_whole_word,
                                           t_at_end_of_syllable=t_at_end_of_syllable,
                                           n_restr=n_restrictions,
                                           is_minus_sign=is_minus_sign,
                                           is_plus_sign=is_plus_sign,
                                           is_decimal_point=is_decimal_point,
                                           fraction_connector=fraction_connector,
                                           percentage_marker=percentage_marker,
                                           int_frac_connector=int_frac_connector,
                                           is_large_power=is_large_power)
                    # An unrestricted new rule replaces a lone automatic (ud/ow) rule; otherwise it is appended.
                    old_rom_rules = self.rom_rules[s]
                    if ((len(old_rom_rules) == 1) and (old_rom_rules[0]['prov'] in ('ud', 'ow'))
                            and not (lcodes or use_only_at_start_of_word or dont_use_at_start_of_word
                                     or use_only_at_end_of_word or dont_use_at_end_of_word
                                     or use_only_for_whole_word)):
                        self.rom_rules[s] = [new_rom_rule]  # overwrite
                    else:
                        self.rom_rules[s].append(new_rom_rule)
    # Thai
    thai_cancellation_mark = '\u0E4C'
    # cancellation applies to preceding letter incl. any vowel modifier letter (e.g. ศักดิ์สิทธิ์ -> saksit)
    for cp in range(0x0E01, 0x0E4C):  # Thai
        c = chr(cp)
        s = c + thai_cancellation_mark
        new_rom_rule = RomRule(s=s, t='', prov='auto cancel letter')
        if not self.rom_rules[s]:
            self.rom_rules[s] = [new_rom_rule]
            self.register_s_prefix(s)
    thai_consonants = list(map(chr, range(0x0E01, 0x0E2F)))
    thai_vowel_modifiers = ['\u0E31', '\u0E47'] + list(map(chr, range(0x0E33, 0x0E3B)))
    for c1 in thai_consonants:
        for v in thai_vowel_modifiers:
            s = c1 + v + thai_cancellation_mark
            new_rom_rule = RomRule(s=s, t='', prov='auto cancel syllable')
            if not self.rom_rules[s]:
                self.rom_rules[s] = [new_rom_rule]
                self.register_s_prefix(s)
    if load_log:
        sys.stderr.write(f'Loaded {n_entries} from {filename}\n')
def load_script_file(self, filename: str, load_log: bool = True):
    """Reads in (typically from Scripts.txt) information about various scripts such as Devanagari,
    incl. information such as the default abugida vowel letter (e.g. "a").

    Fix: the stderr messages now interpolate the filename (the originals
    contained no placeholder)."""
    n_entries, max_n_script_name_components = 0, 0
    try:
        f = open(filename)
    except FileNotFoundError:
        sys.stderr.write(f'Cannot open file {filename}\n')
        return
    with f:
        for line_number, line in enumerate(f, 1):
            if line.startswith('#'):
                continue
            if regex.match(r'^\s*$', line):  # blank line
                continue
            line = regex.sub(r'\s{2,}#.*$', '', line)
            if script_name := slot_value_in_double_colon_del_list(line, 'script-name'):
                lc_script_name = script_name.lower()
                if lc_script_name in self.scripts:
                    sys.stderr.write(f'** Ignoring duplicate script "{script_name}" '
                                     f'in line {line_number} of {filename}\n')
                else:
                    n_entries += 1
                    direction = slot_value_in_double_colon_del_list(line, 'direction')
                    abugida_default_vowel_s = slot_value_in_double_colon_del_list(line,
                                                                                 'abugida-default-vowel')
                    abugida_default_vowels = regex.split(r'[,;]\s*', abugida_default_vowel_s) \
                        if abugida_default_vowel_s else []
                    alt_script_name_s = slot_value_in_double_colon_del_list(line, 'alt-script-name')
                    alt_script_names = regex.split(r'[,;]\s*', alt_script_name_s) if alt_script_name_s else []
                    language_s = slot_value_in_double_colon_del_list(line, 'language')
                    languages = regex.split(r'[,;]\s*', language_s) if language_s else []
                    new_script = Script(script_name=script_name, alt_script_names=alt_script_names,
                                        languages=languages, direction=direction,
                                        abugida_default_vowels=abugida_default_vowels)
                    self.scripts[lc_script_name] = new_script
                    for language in languages:
                        self.dict_set[('scripts', language)].add(script_name)
                    # alternative script names point to the same Script object
                    for alt_script_name in alt_script_names:
                        lc_alt_script_name = alt_script_name.lower()
                        if lc_alt_script_name in self.scripts:
                            sys.stderr.write(f'** Ignoring duplicate alternative script name "{script_name}" '
                                             f'in line {line_number} of {filename}\n')
                        else:
                            self.scripts[lc_alt_script_name] = new_script
                    # track the longest script name (in words), used when parsing char names
                    n_script_name_components = len(script_name.split())
                    if n_script_name_components > max_n_script_name_components:
                        max_n_script_name_components = n_script_name_components
    if max_n_script_name_components:
        self.dict_int['max_n_script_name_components'] = max_n_script_name_components
    if load_log:
        sys.stderr.write(f'Loaded {n_entries} script descriptions from {filename}'
                         f' (max_n_scripts_name_components: {max_n_script_name_components})\n')
def extract_script_name(self, script_name_plus: str, full_char_name: str = None) -> str | None: | |
"""Using info from Scripts.txt, this script selects the script name from a Unicode, | |
e.g. given "OLD HUNGARIAN CAPITAL LETTER A", extract "Old Hungarian".""" | |
if full_char_name and script_name_plus == full_char_name: | |
return None | |
while script_name_plus: | |
if script_name_plus.lower() in self.scripts: | |
if script := self.scripts[script_name_plus.lower()]: | |
if script_name := script['script-name']: | |
return script_name | |
script_name_plus = regex.sub(r'\s*\S*\s*$', '', script_name_plus) | |
return None | |
def load_unicode_data_props(self, filename: str, load_log: bool = True):
    """Loads Unicode derived data from (1) UnicodeDataProps.txt, (2) UnicodeDataPropsHangul.txt
    and UnicodeDataPropsCJK.txt with a list of valid script-specific characters.

    Fix: the stderr messages now interpolate the filename (the originals
    contained no placeholder)."""
    n_script, n_script_char, n_script_vowel_sign, n_script_medial_consonant_sign, n_script_virama = 0, 0, 0, 0, 0
    try:
        f = open(filename)
    except FileNotFoundError:
        sys.stderr.write(f'Cannot open file {filename}\n')
        return
    with f:
        for line_number, line in enumerate(f, 1):
            if line.startswith('#'):
                continue
            if regex.match(r'^\s*$', line):  # blank line
                continue
            line = regex.sub(r'\s{2,}#.*$', '', line)
            if script_name := slot_value_in_double_colon_del_list(line, 'script-name'):
                n_script += 1
                # each slot value below is a string of characters; iterate char by char
                for char in slot_value_in_double_colon_del_list(line, 'char', []):
                    self.dict_str[('script', char)] = script_name
                    n_script_char += 1
                for char in slot_value_in_double_colon_del_list(line, 'numeral', []):
                    self.dict_str[('script', char)] = script_name
                    n_script_char += 1
                for char in slot_value_in_double_colon_del_list(line, 'vowel-sign', []):
                    self.dict_bool[('is-vowel-sign', char)] = True
                    n_script_vowel_sign += 1
                for char in slot_value_in_double_colon_del_list(line, 'medial-consonant-sign', []):
                    self.dict_bool[('is-medial-consonant-sign', char)] = True
                    n_script_medial_consonant_sign += 1
                for char in slot_value_in_double_colon_del_list(line, 'sign-virama', []):
                    self.dict_bool[('is-virama', char)] = True
                    n_script_virama += 1
    if load_log:
        sys.stderr.write(f'Loaded from {filename} mappings of {n_script_char:,d} characters '
                         f'to {n_script} script{"" if n_script == 1 else "s"}')
        if n_script_vowel_sign or n_script_virama or n_script_medial_consonant_sign:
            sys.stderr.write(f', with a total of {n_script_vowel_sign} vowel signs, '
                             f'{n_script_medial_consonant_sign} medial consonant signs '
                             f'and {n_script_virama} viramas')
        sys.stderr.write('.\n')
def load_num_props(self, filename: str, load_log: bool = True):
    """Loads numeric-character properties from NumProps.jsonl (one JSON object per line,
    keyed by its 'txt' field).

    Fixes: (1) the original docstring was a copy-paste of load_unicode_data_props's;
    (2) the stderr messages now interpolate the filename."""
    n_entries = 0
    try:
        f = open(filename)
    except FileNotFoundError:
        sys.stderr.write(f'Cannot open file {filename}\n')
        return
    with f:
        for line_number, line in enumerate(f, 1):
            if line.startswith('#'):
                continue
            if regex.match(r'^\s*$', line):  # blank line
                continue
            d = json.loads(line)
            if isinstance(d, dict):
                if txt := d.get('txt'):
                    self.num_props[txt] = d
                    n_entries += 1
                else:
                    sys.stderr.write(f'Missing txt in l.{line_number} in file {filename}: {line.strip()}\n')
                # propagate selected boolean flags to the general boolean dictionary
                for bool_key in ('is-large-power',):
                    if d.get(bool_key):
                        self.dict_bool[(bool_key, txt)] = True
            else:
                sys.stderr.write(f'json in l.{line_number} in file {filename} not a dict: {line.strip()}\n')
    if load_log:
        sys.stderr.write(f'Loaded {n_entries} entries from {filename}\n')
def de_accent_pinyin(s: str) -> str: | |
"""De-accents a string from "liú" to "liu" and "ü" to "u" (to help process file Chinese_to_Pinyin.txt).""" | |
result = '' | |
for char in s: | |
if decomp := ud.decomposition(char).split(): | |
try: | |
decomp_chars = [chr(int(x, 16)) for x in decomp] | |
letters = [x for x in decomp_chars if ud.category(x).startswith('L')] | |
except ValueError: | |
sys.stderr.write(f'Cannot decode {decomp}\n') | |
continue | |
if len(letters) == 1: | |
result += letters[0] | |
else: | |
sys.stderr.write(f'Cannot decode {decomp} (expected 1 letter)\n') | |
else: | |
result += char | |
result = result.replace('ü', 'u') | |
return result | |
def register_s_prefix(self, s: str): | |
for prefix_len in range(1, len(s) + 1): | |
self.dict_bool[('s-prefix', s[:prefix_len])] = True | |
def load_chinese_pinyin_file(self, filename: str, load_log: bool = True):
    """Loads file Chinese_to_Pinyin.txt which maps Chinese characters to their Latin form.

    Fixes: (1) the stderr messages now interpolate the filename; (2) the load-log
    message no longer describes the entries as "script descriptions" (copy-paste
    from load_script_file)."""
    n_entries = 0
    try:
        f = open(filename)
    except FileNotFoundError:
        sys.stderr.write(f'Cannot open file {filename}\n')
        return
    with f:
        for line_number, line in enumerate(f, 1):
            if line.startswith('#'):
                continue
            if regex.match(r'^\s*$', line):  # blank line
                continue
            try:
                chinese, pinyin = line.rstrip().split()
                rom = self.de_accent_pinyin(pinyin)
            except ValueError:
                sys.stderr.write(f'Cannot process line {line_number} in file {filename}: {line}')
            else:
                s = chinese
                new_rom_rule = RomRule(s=s, t=rom, prov='rom pinyin', lcodes=[])
                self.rom_rules[chinese].append(new_rom_rule)
                self.register_s_prefix(s)
                n_entries += 1
    if load_log:
        sys.stderr.write(f'Loaded {n_entries} Chinese-pinyin entries from {filename}\n')
def add_char_to_rebuild_unicode_data_dict(d: dict, script_name: str, prop_class: str, char: str): | |
d['script-names'].add(script_name) | |
key = (script_name, prop_class) | |
if key in d: | |
d[key].append(char) | |
else: | |
d[key] = [char] | |
def rebuild_unicode_data_props(self, out_filename: str, cjk: str = None, hangul: str = None):
    """This functions rebuilds UnicodeDataProps*.txt This might be useful when a new UnicodeData.txt
    version is released, or additional information is extracted from Unicode to UnicodeDataProps.txt
    Regular users normally never have to call this function."""
    # d maps (script_name, prop_class) -> list of chars; the special key
    # 'script-names' collects the set of all script names encountered.
    d = {'script-names': set()}
    n_script_refs = 0
    codepoint = -1
    prop_classes = {'char'}
    # Pass 1: scan all Unicode codepoints and bucket each named char by script and property class.
    while codepoint < 0xF0000:
        codepoint += 1
        c = chr(codepoint)
        if not (char_name := self.chr_name(c)):
            continue
        # Property-specific passes; each entry is one property-name component
        # (or a tuple of synonymous components, whose FIRST member names the class).
        for prop_name_comp2 in ('VOWEL SIGN',
                                ('MEDIAL CONSONANT SIGN', 'CONSONANT SIGN MEDIAL', 'CONSONANT SIGN SHAN MEDIAL',
                                 'CONSONANT SIGN MON MEDIAL'),
                                ('SIGN VIRAMA', 'SIGN ASAT', 'AL-LAKUNA', 'SIGN COENG', 'SIGN PAMAAEH',
                                 'CHARACTER PHINTHU'),
                                ('NUMERAL', 'NUMBER', 'DIGIT', 'FRACTION')):
            if prop_name_comp2 and isinstance(prop_name_comp2, tuple):
                prop_list = prop_name_comp2
            else:
                prop_list = (prop_name_comp2,)
            for prop_name_comp in prop_list:
                # e.g. 'VOWEL SIGN' -> prop class 'vowel-sign'
                prop_class = prop_list[0].lower().replace(' ', '-')
                if prop_class not in prop_classes:
                    prop_classes.add(prop_class)
                # Strip the property suffix from the char name to get a script-name candidate.
                script_name_cand = regex.sub(fr'\s+{prop_name_comp}\b.*$', '', char_name)
                if script_name := self.extract_script_name(script_name_cand, char_name):
                    self.add_char_to_rebuild_unicode_data_dict(d, script_name, prop_class, c)
        # Generic 'char' pass: strip common name suffixes to isolate the script name.
        script_name_cand = regex.sub(r'\s+(CONSONANT|LETTER|LIGATURE|SIGN|SYLLABLE|SYLLABICS|VOWEL|'
                                     r'IDEOGRAPH|HIEROGLYPH|POINT|ACCENT|CHARACTER|TIPPI|ADDAK|IRI|URA|'
                                     r'SYMBOL GENITIVE|SYMBOL COMPLETED|SYMBOL LOCATIVE|SYMBOL AFOREMENTIONED|'
                                     r'AU LENGTH MARK)\b.*$', '',
                                     char_name)
        if script_name := self.extract_script_name(script_name_cand, char_name):
            self.add_char_to_rebuild_unicode_data_dict(d, script_name, 'char', c)
            n_script_refs += 1
    # print(sorted(d['script-names']))
    prop_classes = sorted(prop_classes)
    # Pass 2: write out the collected data. CJK and Hangul entries go to their own
    # files when separate filenames are given (they fall back to out_filename otherwise).
    out_filenames = [x for x in [out_filename, cjk, hangul] if x]
    cjk2 = cjk if cjk else out_filename
    hangul2 = hangul if hangul else out_filename
    for out_file in out_filenames:
        try:
            f_out = open(out_file, 'w')
        except OSError:
            sys.stderr.write(f'Cannot write to file {out_file}\n')
            continue
        with f_out:
            for script_name in sorted(d['script-names']):
                if script_name == 'CJK':
                    if out_file != cjk2:
                        continue
                elif script_name == 'Hangul':
                    if out_file != hangul2:
                        continue
                else:
                    if out_file != out_filename:
                        continue
                # one ::script-name line per script, with one ::<prop-class> slot per property
                prop_components = [f"::script-name {script_name}"]
                for prop_class in prop_classes:
                    key = (script_name, prop_class)
                    if key in d:
                        if chars := ''.join(d[key]):
                            if prop_class in ('char',):
                                prop_components.append(f"::n-{prop_class} {len(chars)}")
                            prop_components.append(f"::{prop_class} {chars}")
                f_out.write(f"{' '.join(prop_components)}\n")
    sys.stderr.write(f"Rebuilt {out_filenames} with {n_script_refs} characters "
                     f"for {len(d['script-names'])} scripts.\n")
def rebuild_num_props(self, out_filename: str, err_filename: str):
    """Rebuild NumProps.jsonl (one JSON object per numeric character) from Unicode data
    plus uroman's own tables; characters that cannot be classified go to err_filename.
    Regular users normally never have to call this function."""
    n_out, n_err = 0, 0
    with open(out_filename, 'w') as f_out, open(err_filename, 'w') as f_err:
        codepoint = -1
        while codepoint < 0xF0000:
            codepoint += 1
            char = chr(codepoint)
            num = first_non_none(ud_numeric(char),  # robust ud.numeric
                                 self.num_value(char))  # uroman table includes extra num values, e.g. for Egyptian
            if num is None:
                continue
            result_dict = {}
            orig_txt = char
            value: int | float | None = None  # non-fraction-value(3 1/2) = 3
            fraction: Fraction | None = None  # fraction(3 1/2) = Fraction(1, 2)
            num_base = None  # num_base(500) = 100
            base_multiplier = None  # base_multiplier(500) = 5
            script = None
            is_large_power = self.dict_bool[('is-large-power', char)]
            # num_base is typically a power of 10: 1, 10, 100, 1000, 10000, 100000, 1000000, ...
            # exceptions might include 12 for the 'dozen' in popular English 'two dozen and one' (2*12+1=25)
            # exceptions might include 20 for the 'score' in archaic English 'four score and seven' (4*20+7=87)
            # exceptions might include 20 for the 'vingt' as in standard French 'quatre-vingt-treize' (4*20+13=93)
            if script_name := self.chr_script_name(char):
                script = script_name
            elif char in '0123456789':
                script = 'ascii-digit'
            name = self.chr_name(char)
            exclude_from_number_processing = False
            # Styled/compatibility variants (superscripts, circled digits, Roman numerals, ...)
            # are tagged with a '*'-prefixed pseudo-script and excluded from number processing.
            for scrypt_type in ('SUPERSCRIPT', 'SUBSCRIPT',
                                'CIRCLED', 'PARENTHESIZED', 'SEGMENTED', 'MATHEMATICAL', 'ROMAN NUMERAL',
                                'FULL STOP', 'COMMA'):
                if scrypt_type in name:
                    script = '*' + scrypt_type.lower().replace(' ', '-')
                    exclude_from_number_processing = True
                    break
            for scrypt_type in ('VULGAR FRACTION',):
                if scrypt_type in name:
                    script = scrypt_type.lower().replace(' ', '-')
                    break
            if exclude_from_number_processing:
                continue
            if isinstance(num, int):
                value = num
                if 0 <= num <= 9:
                    num_base = 1
                    base_multiplier = num
                    if "DIGIT" in name:
                        num_type = 'digit'
                    else:
                        # Chinese numbers 零 (0), 一 (1), ... 九 (9) have numeric values,
                        # but are NOT (full) digits
                        num_type = 'digit-like'
                elif m := regex.match(r'([0-9]+?)(0*)$', str(num)):
                    # split e.g. 500 into multiplier 5 and base 100
                    base_multiplier = int(m.group(1))  # non_base_value(500) = 5
                    num_base = int('1' + m.group(2))
                    num_type = 'base' if base_multiplier == 1 else 'multi'
                else:
                    num_type = 'other-int'  # Do such cases exist?
            elif ("FRACTION" in name) and (fraction := fraction_char2fraction(char, num, self)):
                fraction = fraction  # (redundant: the walrus above already bound fraction)
                num_type = 'fraction'
            else:
                num_type = 'other-num'  # Do such cases exist? Yes. Bengali currency numerators, ...
            # Build the JSON record; rom is e.g. '3 1/2', falling back to the char itself.
            value_s = '' if value is None else str(value)
            fraction_s = '' if fraction is None else f'{fraction.numerator}/{fraction.denominator}'
            fraction_list = None if fraction is None else [fraction.numerator, fraction.denominator]
            delimiter_s = ' ' if value_s and fraction_s else ''
            rom = (value_s + delimiter_s + fraction_s) or orig_txt
            add_non_none_to_dict(result_dict, 'txt', orig_txt)
            add_non_none_to_dict(result_dict, 'rom', rom)
            add_non_none_to_dict(result_dict, 'value', value)
            add_non_none_to_dict(result_dict, 'fraction', fraction_list)
            add_non_none_to_dict(result_dict, 'type', num_type)
            if is_large_power:
                result_dict['is-large-power'] = True
            add_non_none_to_dict(result_dict, 'base', num_base)
            add_non_none_to_dict(result_dict, 'mult', base_multiplier)
            add_non_none_to_dict(result_dict, 'script', script)
            if num_type.startswith('other'):
                # unclassified numeric characters go to the rejects file, with name for debugging
                add_non_none_to_dict(result_dict, 'name', name)
                f_err.write(json.dumps(result_dict) + '\n')
                n_err += 1
            else:
                if not script:
                    add_non_none_to_dict(result_dict, 'name', name)
                f_out.write(json.dumps(result_dict) + '\n')
                n_out += 1
    sys.stderr.write(f'Processed {codepoint} codepoints,\n wrote {n_out} lines to {out_filename}\n'
                     f' and {n_err} lines to {err_filename}\n')
def load_resource_files(self, data_dir: Path, load_log: bool = False,
                        rebuild_ud_props: bool = False, rebuild_num_props: bool = False):
    """Loads all resource files needed for romanization.

    Fix: removed the no-op self-assignment 'data_dir = data_dir'."""
    if not isinstance(data_dir, pathlib.Path):
        sys.stderr.write(f'Error: data_dir is of {type(data_dir)}, not a Path.\n'
                         f' Cannot load any resource files.\n')
        return
    # Load order matters: a later unrestricted rule (e.g. from the manual
    # romanization-table.txt) overwrites a lone automatic 'ud'/'ow' rule
    # (see the overwrite logic in load_rom_file).
    self.load_rom_file(os.path.join(data_dir, "romanization-auto-table.txt"),
                       'ud', file_format='rom', load_log=load_log)
    self.load_rom_file(os.path.join(data_dir, "UnicodeDataOverwrite.txt"),
                       'ow', file_format='u2r', load_log=load_log)
    self.load_rom_file(os.path.join(data_dir, "romanization-table.txt"),
                       'man', file_format='rom', load_log=load_log)
    self.load_chinese_pinyin_file(os.path.join(data_dir, "Chinese_to_Pinyin.txt"), load_log=load_log)
    self.load_script_file(os.path.join(data_dir, "Scripts.txt"), load_log=load_log)
    self.load_num_props(os.path.join(data_dir, "NumProps.jsonl"), load_log=load_log)
    for base_file in ("UnicodeDataProps.txt", "UnicodeDataPropsCJK.txt", "UnicodeDataPropsHangul.txt"):
        self.load_unicode_data_props(os.path.join(data_dir, base_file), load_log=load_log)
    if rebuild_ud_props:
        self.rebuild_unicode_data_props(os.path.join(data_dir, "UnicodeDataProps.txt"),
                                        cjk=os.path.join(data_dir, "UnicodeDataPropsCJK.txt"),
                                        hangul=os.path.join(data_dir, "UnicodeDataPropsHangul.txt"))
    if rebuild_num_props:
        self.rebuild_num_props(os.path.join(data_dir, "NumProps.jsonl"),
                               os.path.join(data_dir, "NumPropsRejects.jsonl"))
def unicode_hangul_romanization(self, s: str, pass_through_p: bool = False):
    """Special algorithmic solution to convert (Korean) Hangul characters to the Latin alphabet.

    Decomposes each precomposed Hangul syllable (U+AC00-U+D7A3) into its lead
    consonant, vowel and tail consonant and maps each part to a Latin string.
    Non-Hangul characters are dropped unless pass_through_p is True.
    Results are cached in self.hangul_rom (whole string and per syllable).
    """
    if cached_rom := self.hangul_rom.get(s, None):
        return cached_rom
    leads = "g gg n d dd r m b bb s ss - j jj c k t p h".split()
    vowels = "a ae ya yae eo e yeo ye o wa wai oe yo u weo we wi yu eu yi i".split()
    tails = "- g gg gs n nj nh d l lg lm lb ls lt lp lh m b bs s ss ng j c k t p h".split()
    result = ""
    for c in s:
        cp = ord(c)
        if 0xAC00 <= cp <= 0xD7A3:
            # fix: consult the per-syllable cache (it was written but never read
            # for multi-character input)
            if rom := self.hangul_rom.get(c, None):
                result += rom
                continue
            code = cp - 0xAC00
            # 21 vowels * 28 tails per lead consonant; integer arithmetic
            # instead of float division + int()
            lead_index, rest = divmod(code, 28 * 21)
            vowel_index, tail_index = divmod(rest, 28)
            rom = leads[lead_index] + vowels[vowel_index] + tails[tail_index]
            rom = rom.replace('-', '')  # '-' marks an empty lead/tail slot
            self.hangul_rom[c] = rom
            result += rom
        elif pass_through_p:
            result += c
    return result
def char_is_nonspacing_mark(s) -> bool: | |
""" Checks whether a character is a nonspacing mark, e.g. combining accents, points, vowel signs""" | |
return (len(s) == 1) and (ud.category(s) == 'Mn') | |
def char_is_format_char(s) -> bool: | |
""" Checks whether a character is a formatting character, e.g. a zero-with joiner/non-joiner""" | |
return (len(s) == 1) and (ud.category(s) == 'Cf') | |
def char_is_space_separator(s) -> bool: | |
""" Checks whether a character is a space, | |
e.g. ' ', non-breakable space, en space, ideographic (Chinese) space, Ogham space mark | |
but excluding \t, \r, \n""" | |
return (len(s) == 1) and (ud.category(s) == 'Zs') | |
def chr_name(self, char: str) -> str:
    """Return the Unicode name of a character, falling back to names loaded
    from uroman's own data files; '' if no name is known."""
    try:
        return ud.name(char)
    except (ValueError, TypeError):
        pass
    fallback_name = self.dict_str[('name', char)]
    return fallback_name if fallback_name else ''
def num_value(self, s: str) -> int | float | Fraction | None:
    """rom_rules include numeric values beyond UnicodeData.txt, e.g. for Egyptian numerals"""
    # First rule with a non-None 'num' field wins; None if there is none.
    return next((rom_rule['num'] for rom_rule in self.rom_rules[s]
                 if rom_rule['num'] is not None), None)
def rom_rule_value(self, s: str, key: str):
    """Return the first non-None value stored under 'key' among the
    romanization rules for s; None if no rule provides one."""
    for rule in self.rom_rules[s]:
        value = rule.get(key)
        if value is not None:
            return value
    return None
def unicode_float2fraction(self, num: float, precision: float = 0.000001) -> Tuple[int, int] | None:
    """only for common unicode fractions
    Maps a float such as 0.25 back to a (numerator, denominator) pair such as
    (1, 4), searching only the small sets of numerators and denominators that
    occur in Unicode fraction characters. Successful results are cached in
    self.float2fraction. Returns None if no pair matches within 'precision'."""
    # fix: cache variable was misspelled 'chached_value'
    if cached_value := self.float2fraction.get(num, None):
        return cached_value
    for numerator in (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11):
        for denominator in (2, 3, 4, 5, 6, 8, 12, 16, 20, 32, 40, 64, 80, 160, 320):
            if abs(numerator / denominator - num) < precision:
                result = numerator, denominator
                self.float2fraction[num] = result
                return result
    return None
def chr_script_name(self, char: str) -> str:
    """For letters, diacritics, numerals etc."""
    key = ('script', char)
    return self.dict_str[key]
def test_output_of_selected_scripts_and_rom_rules(self):
    """Low level test function that checks and displays romanization information.

    Prints (to stdout) script metadata, romanization rules, script names,
    per-character properties and numeric-edge renderings for a fixed sample
    of characters. For manual inspection only; nothing is returned.
    """
    output = ''
    # Script-level metadata for two sample scripts.
    for s in ("Oriya", "Chinese"):
        d = self.scripts[s.lower()]
        output += f'SCRIPT {s} {d}\n'
    # Romanization rules for a sample of single and multi-character strings.
    for s in ('ƿ', 'β', 'и', 'μπ', '⠹', '亿', 'ちょ', 'и', '𓍧', '正', '分之', 'ऽ', 'ศ', 'ด์'):
        d = self.rom_rules[s]
        output += f'DICT {s} {d}\n'
    for s in ('ƿ', 'β', 'न', 'ु'):
        output += f'SCRIPT-NAME {s} {self.chr_script_name(s)}\n'
    # Per-character properties: name, numeric value, pictogram, tone mark,
    # syllable info, large-power flag; only non-empty ones are printed.
    for s in ('万', '\uF8F7', '\U00013368', '\U0001308B', '\u0E48', '\u0E40'):
        name = self.chr_name(s)
        num = self.dict_num[s]
        pic = self.dict_str[('pic', s)]
        tone_mark = self.dict_str[('tone-mark', s)]
        syllable_info = self.dict_str[('syllable-info', s)]
        is_large_power = self.dict_bool[('is-large-power', s)]
        output += f'PROPS {s}'
        if name:
            output += f' name: {name}'
        if num:
            output += f' num: {num} ({type(num).__name__})'
        if pic:
            output += f' pic: {pic}'
        if tone_mark:
            output += f' tone-mark: {tone_mark}'
        if syllable_info:
            output += f' syllable-info: {syllable_info}'
        if is_large_power:
            output += f' is-large-power: {is_large_power}'
        output += '\n'
    # Numeral handling across scripts (incl. Mayan, Egyptian, Runic and a
    # private-use code point).
    mayan12 = '\U0001D2EC'
    egyptian600 = '𓍧'
    runic90 = '𐍁'
    klingon2 = '\uF8F2'
    for offset, c in enumerate(f'9九万萬百፲፱፻፸¾0²₂AⅫ⑫൵{runic90}{mayan12}{egyptian600}{klingon2}'):
        output += f'NUM-EDGE: {NumEdge(offset, offset+1, c, self)}\n'
    for s in ('\u00bc', '\u0968'):
        output += f'NUM-PROPS: {self.num_props[s]}\n'
    print(output)
def test_romanization(self, **args):
    """A few full cases of romanization testing.

    Romanizes a handful of sample sentences, then sweeps code points
    U+0000..U+F0000 and reports (to stderr) any single-character
    romanization that mixes whitespace and non-whitespace."""
    tests = [('ألاسكا', None), ('यह एक अच्छा अनुवाद है.', 'hin'), ('ちょっとまってください', 'kor'),
             ('Μπανγκαλόρ', 'ell'), ('Зеленський', 'ukr'), ('കേരളം', 'mal')]
    for test in tests:
        snt = test[0]
        test_lcode = test[1] if len(test) >= 2 else None
        rom = self.romanize_string(snt, lcode=test_lcode, **args)
        sys.stderr.write(f'ROM {snt} -> {rom}\n')
    n_alerts = 0
    for codepoint in range(0xF0000 + 1):
        c = chr(codepoint)
        rom = self.romanize_string(c)
        if regex.search(r'\s', rom) and regex.search(r'\S', rom):
            name = self.chr_name(c)
            sys.stderr.write(f'U+{codepoint:04X} {c} {name} {rom}\n')
            n_alerts += 1
    sys.stderr.write(f'{n_alerts} alerts for roms with spaces\n')
def romanize_file(self, input_filename: str | None = None, output_filename: str | None = None,
                  lcode: str | None = None, direct_input: List[str] = None, **args):
    """Script to apply romanization to an entire file. Input and output files needed.
    Language code (lcode) recommended.

    input_filename: input path; None reads from stdin (unless direct_input is given).
    output_filename: output path; None writes to stdout.
    lcode: default language code; a per-line '::lcode xyz' prefix overrides it.
    direct_input: list of input lines, used instead of a file when input_filename is None.
    Further keyword args (e.g. rom_format, max_lines, silent) are passed through
    to romanize_string.
    """
    f_in_to_be_closed, f_out_to_be_closed = False, False
    # Select the input source: direct lines, a named file, or stdin.
    if direct_input and (input_filename is None):
        f_in = direct_input  # list of lines
    elif isinstance(input_filename, str):
        try:
            f_in = open(input_filename)
            f_in_to_be_closed = True
        except OSError:
            sys.stderr.write(f'Error in romanize_file: Cannot open file {input_filename}\n')
            f_in = None
    elif input_filename is None:
        f_in = sys.stdin
    else:
        sys.stderr.write(f"Error in romanize_file: argument 'input_filename' {input_filename} "
                         f"is of wrong type: {type(input_filename)} (should be str)\n")
        f_in = None
    # Select the output target: a named file or stdout.
    if isinstance(output_filename, str):
        try:
            f_out = open(str(output_filename), 'w')
            f_out_to_be_closed = True
        except OSError:
            sys.stderr.write(f'Error in romanize_file: Cannot write to file {output_filename}\n')
            f_out = None
    elif output_filename is None:
        f_out = sys.stdout
    else:
        sys.stderr.write(f"Error in romanize_file: argument 'output_filename' {output_filename} "
                         f"is of wrong type: {type(output_filename)} (should be str)\n")
        f_out = None
    if f_in and f_out:
        max_lines = args.get('max_lines', None)
        progress_dots_output = False
        for line_number, line in enumerate(f_in, 1):
            # A '::lcode xyz' prefix overrides the default language code for this line.
            if m := regex.match(r'(::lcode\s+)([a-z]{3})(\s+)(.*?)\s*$', line):
                lcode_kw, lcode2, space, snt = m.group(1, 2, 3, 4)
                rom_result = self.romanize_string(snt, lcode2 or lcode, **args)
                if args.get('rom_format', None) == RomFormat.STR:
                    lcode_prefix = f"{lcode_kw}{lcode2}{space}"
                    f_out.write(lcode_prefix + rom_result + '\n')
                else:
                    lcode_prefix = f'[0, 0, "", "lcode: {lcode2}"]'  # meta edge with lcode info
                    prefixed_edges = [lcode_prefix] + self.romanize_string(snt, lcode2 or lcode, **args)
                    f_out.write(Edge.json_str(prefixed_edges) + '\n')
            else:
                f_out.write(Edge.json_str(self.romanize_string(line.rstrip(), lcode, **args)) + '\n')
            if not args.get('silent'):
                # Progress indicator: a dot per 100 lines, the line number per 1000.
                if line_number % 100 == 0:
                    if line_number % 1000 == 0:
                        sys.stderr.write(str(line_number))
                    else:
                        sys.stderr.write('.')
                    progress_dots_output = True
                    sys.stderr.flush()
                    gc.collect()
            if max_lines and line_number >= max_lines:
                break
        if progress_dots_output:
            sys.stderr.write('\n')
            sys.stderr.flush()
    if f_in_to_be_closed:
        f_in.close()
    if f_out_to_be_closed:
        f_out.close()
def apply_any_offset_to_cached_rom_result(cached_rom_result: str | List[Edge], offset: int = 0) \ | |
-> str | List[Edge]: | |
if isinstance(cached_rom_result, str): | |
return cached_rom_result | |
elif offset == 0: | |
return cached_rom_result | |
else: | |
return [Edge(edge.start + offset, edge.end + offset, edge.txt, edge.type) for edge in cached_rom_result] | |
def romanize_string_core(self, s: str, lcode: str | None, rom_format: RomFormat, cache_p: bool,
                         offset: int = 0, **args) -> str | List[Edge]:
    """Script to support token-by-token romanization with caching for higher speed.

    Builds a romanization lattice for s and returns, depending on rom_format:
    the best romanization string (STR), the best edge path (EDGES/ALTS), or
    all edges (LATTICE). When cache_p, results are cached under
    (s, lcode, rom_format); cached edge results are re-based by 'offset'.
    """
    if cache_p:
        cached_rom = self.rom_cache.get((s, lcode, rom_format), None)
        if cached_rom is not None:
            return self.apply_any_offset_to_cached_rom_result(cached_rom, offset)
    lat = Lattice(s, uroman=self, lcode=lcode)
    # Populate the lattice: script-specific preparation first, then general
    # romanization, number handling, and single-character fallbacks.
    lat.pick_tibetan_vowel_edge(**args)
    lat.prep_braille(**args)
    lat.add_romanization(**args)
    lat.add_numbers(self, **args)
    lat.add_braille_numbers(**args)
    lat.add_rom_fall_back_singles(**args)
    if rom_format == RomFormat.LATTICE:
        # Full lattice: every edge (incl. superseded sub-edges) plus alternatives.
        all_edges = lat.all_edges(0, len(s))
        lat.add_alternatives(all_edges)
        if cache_p:
            self.rom_cache[(s, lcode, rom_format)] = all_edges
        result = self.apply_any_offset_to_cached_rom_result(all_edges, offset)
    else:
        best_edges = lat.best_rom_edge_path(0, len(s))
        if rom_format in (RomFormat.EDGES, RomFormat.ALTS):
            if rom_format == RomFormat.ALTS:
                lat.add_alternatives(best_edges)
            if cache_p:
                self.rom_cache[(s, lcode, rom_format)] = best_edges
            result = self.apply_any_offset_to_cached_rom_result(best_edges, offset)
        else:
            # STR format: linearize the best path to a plain string.
            rom = lat.edge_path_to_surf(best_edges)
            del lat
            if cache_p:
                self.rom_cache[(s, lcode, rom_format)] = rom
            result = rom
    return result
def romanize_string(self, s: str, lcode: str | None = None, rom_format: RomFormat = RomFormat.STR, **args) \
        -> str | List[Edge]:
    """Main entry point for romanizing a string. Recommended argument: lcode (language code).
    recursive only used for development.
    Method returns a string or a list of edges (with start and end offsets)."""
    lcode = lcode or args.get('lcode', None)
    # print('rom::', s, 'lcode:', lcode, 'print-lattice:', print_lattice_p)
    # with caching (for string format output only for now)
    if cache_p := not args.get('no_caching', False):
        # Split the input at delimiters (a space or an ideographic full stop,
        # with adjacent punctuation) and romanize piece by piece so repeated
        # tokens hit the cache; 'offset' keeps edge positions aligned with
        # the full original string.
        rest, offset = s, 0
        result = '' if rom_format == RomFormat.STR else []
        while m3 := regex.match(r'(.*?)([.,; ]*[ 。][.,; ]*)(.*)$', rest):
            pre, delimiter, rest = m3.group(1, 2, 3)
            result += self.romanize_string_core(pre, lcode, rom_format, cache_p, offset, **args)
            offset += len(pre)
            result += self.romanize_string_core(delimiter, lcode, rom_format, cache_p, offset, **args)
            offset += len(delimiter)
        # Remaining tail after the last delimiter.
        result += self.romanize_string_core(rest, lcode, rom_format, cache_p, offset, **args)
        return result
    else:
        return self.romanize_string_core(s, lcode, rom_format, cache_p, 0, **args)
class Edge:
    """This class defines edges that span part of a sentence with a specific romanization.
    There might be multiple edges for a given span. The edges in turn are part of the
    romanization lattice."""
    def __init__(self, start: int, end: int, s: str, annotation: str | None = None):
        self.start = start       # start position in the original string (inclusive)
        self.end = end           # end position in the original string (exclusive)
        self.txt = s             # romanized text for the span
        self.type = annotation   # annotation such as 'rom', 'rom exp', 'num' (may be None)

    def __str__(self):
        return f'[{self.start}-{self.end}] {self.txt} ({self.type})'

    def __repr__(self):
        return str(self)

    def json(self) -> str:  # start - end - text - annotation
        return json.dumps([self.start, self.end, self.txt, self.type])

    @staticmethod
    def json_str(rom_result: List[Edge] | str) -> str:
        """Serialize a romanization result for output: strings pass through
        unchanged; a list is rendered as a bracketed concatenation of each
        edge's JSON form (pre-serialized string entries are included as-is)."""
        # fix: @staticmethod added — json_str takes no 'self' and is called as
        # Edge.json_str(...).
        if isinstance(rom_result, str):
            return rom_result
        else:
            result = '['
            for edge in rom_result:
                if isinstance(edge, Edge):
                    result += edge.json()
                else:
                    result += str(edge)
            result += ']'
            return result
class NumEdge(Edge):
    """Edge subclass for numbers/numerals. In addition to the span, it carries
    the numeric value and related properties (fraction, positional base,
    script); self.txt holds the rendered number."""
    def __init__(self, start: int, end: int, s: str, uroman: Uroman | None, active: bool = False):
        """For NumEdge, the s argument is in original language (not yet romanized)."""
        # For speed, much of this processing should at some point be cached in data files.
        Edge.__init__(self, start, end, s)
        self.orig_txt, self.txt = s, s
        # numeric value, fractional value, positional base (e.g. 10, 100), base multiplier
        self.value, self.fraction, self.num_base, self.base_multiplier = None, None, None, None
        self.type, self.script, self.is_large_power, self.active = None, None, False, active
        self.n_decimals = None
        self.value_s = None  # precision for 3.14159265358979323846264338327950288419716939937510582097494
        if start+1 == end:
            # Single-character span: initialize all numeric properties from
            # uroman's num_props table (built from NumProps.jsonl).
            # NOTE(review): assumes uroman is not None on this path — confirm callers.
            char = s[0]
            if d := uroman.num_props.get(char):
                self.active = True
                self.value = d.get('value')
                fraction_list = d.get('fraction')
                self.fraction = Fraction(fraction_list[0], fraction_list[1]) if fraction_list else None
                self.num_base = d.get('base')
                self.base_multiplier = d.get('mult')
                self.type = d.get('type')
                self.script = d.get('script')
                self.is_large_power = d.get('is-large-power')
                self.update()

    def update(self,
               value: int | float | None = None,
               value_s: str | None = None,
               fraction: Fraction | None = None,
               n_decimals: int | None = None,
               num_base: int | None = None,
               base_multiplier: int | float | None = None,
               script: str | None = None,
               e_type: str | None = None,
               orig_txt: str | None = None) -> str:
        """Overwrite any subset of the numeric properties (None arguments keep
        the current values), then re-render and return self.txt."""
        self.value = first_non_none(value, self.value)
        self.value_s = first_non_none(value_s, self.value_s)
        self.fraction = first_non_none(fraction, self.fraction)
        self.n_decimals = first_non_none(n_decimals, self.n_decimals)
        self.num_base = first_non_none(num_base, self.num_base)
        self.base_multiplier = first_non_none(base_multiplier, self.base_multiplier)
        self.script = first_non_none(script, self.script)
        self.type = first_non_none(e_type, self.type)
        self.orig_txt = first_non_none(orig_txt, self.orig_txt)
        # Render the value part: an explicit value_s wins, then a fixed-decimal
        # float rendering, then plain str() of the value.
        if self.value_s is not None:
            value_s = self.value_s
        elif self.value is None:
            value_s = ''
        elif isinstance(self.value, float) and (self.n_decimals is not None):
            value_s = first_non_none(self.value_s, f'{self.value:0.{self.n_decimals}f}')
        else:
            value_s = str(self.value)
        fraction_s = '' if self.fraction is None else f'{self.fraction.numerator}/{self.fraction.denominator}'
        delimiter_s = ' ' if value_s and fraction_s else ''
        # Fall back to the original text if nothing numeric could be rendered.
        self.txt = (value_s + delimiter_s + fraction_s) or self.orig_txt
        return self.txt

    def __str__(self):
        # Compact debug rendering; inactive edges are prefixed with ' *'.
        if self.num_base is not None:
            if self.base_multiplier is not None:
                b_clause = f'{self.base_multiplier}*{self.num_base}'
            else:
                b_clause = str(self.num_base)
        else:
            b_clause = None
        return (('' if self.active else ' *')
                + f'[{self.start}-{self.end}] {self.orig_txt} R:{self.txt} T:{self.type}'
                + (' LP' if self.is_large_power else '')
                + (f' B:{b_clause}' if (b_clause is not None) else '')
                + (f' V:{self.value}' if ((self.value is not None) and (str(self.value) != self.txt)) else '')
                + (f' VS:{self.value_s}' if ((self.value_s is not None) and (self.value_s != self.txt)) else '')
                + (f' F:.{self.n_decimals}f' if self.n_decimals else f'')
                + (f' S:{self.script}' if self.script else ''))
class Lattice: | |
"""Lattice for a specific romanization instance. Has edges.""" | |
def __init__(self, s: str, uroman: Uroman, lcode: str = None):
    """Build an (initially empty) romanization lattice over string s."""
    self.s = s              # the original string to be romanized
    self.lcode = lcode      # language code, e.g. 'hin' (may be None)
    # (start, end) -> set of Edges; (pos, 'right')/(pos, 'left') -> neighbor vertices
    self.lattice = defaultdict(set)
    self.max_vertex = len(s)
    self.uroman = uroman    # back-pointer to the Uroman object holding all resources
    self.props = {}         # per-position properties, e.g. ('is-upper', i), ('edge-vowel', i)
    self.simple_top_rom_cache = {}
    self.contains_script = defaultdict(bool)   # script name -> occurs in s?
    self.check_for_scripts()
def check_for_scripts(self):
    """Record, for each script name, whether the input string contains it
    (in self.contains_script); Braille is detected by code-point range."""
    for c in self.s:
        script_name = self.uroman.chr_script_name(c)
        self.contains_script[script_name] = True
    # perf: direct code-point range test (U+2800-U+28FF) instead of an extra
    # regex scan over the string; same result.
    if any('\u2800' <= c <= '\u28FF' for c in self.s):
        self.contains_script['Braille'] = True
def add_edge(self, edge: Edge): | |
self.lattice[(edge.start, edge.end)].add(edge) | |
self.lattice[(edge.start, 'right')].add(edge.end) | |
self.lattice[(edge.end, 'left')].add(edge.start) | |
def __str__(self):
    """Linearize all edges, left to right, as one space-separated string
    (edge order within a span follows set iteration order)."""
    pieces = []
    for start in range(self.max_vertex):
        for end in self.lattice[(start, 'right')]:
            pieces.extend(f'[{start}-{end}] {edge.txt} ({edge.type})'
                          for edge in self.lattice[(start, end)])
    return ' '.join(pieces)
def char_is_braille(c: str) -> bool: | |
return 0x2800 <= ord(c[0]) <= 0x28FF | |
# Help Tibet
def char_is_subjoined_letter(self, c: str) -> bool:
    """True for subjoined (stacked) letters, per the Unicode character name."""
    char_name = self.uroman.chr_name(c)
    return "SUBJOINED LETTER" in char_name
def char_is_regular_letter(self, c: str) -> bool:
    """True for letters that are not subjoined, per the Unicode character name."""
    char_name = self.uroman.chr_name(c)
    return "LETTER" in char_name and "SUBJOINED" not in char_name
def char_is_letter(self, c: str) -> bool:
    """True iff the Unicode character name contains 'LETTER' (incl. subjoined letters)."""
    char_name = self.uroman.chr_name(c)
    return "LETTER" in char_name
def char_is_vowel_sign(self, c: str) -> bool:
    """True iff uroman's data files mark c as a (dependent) vowel sign."""
    key = ('is-vowel-sign', c)
    return self.uroman.dict_bool[key]
def char_is_letter_or_vowel_sign(self, c: str) -> bool:
    """True iff c is a letter or a (dependent) vowel sign."""
    if self.char_is_letter(c):
        return True
    return self.char_is_vowel_sign(c)
def is_at_start_of_word(self, position: int) -> bool:
    """True iff no edge ending at 'position' ends in an alphabetic character,
    i.e. nothing word-like immediately precedes this position.
    The result is cached in self.props under ('preceded_by_alpha', position)."""
    # return not regex.match(r'(?:\pL|\pM)', self.s[position-1:position])
    first_char = self.s[position]
    first_char_is_braille = self.char_is_braille(first_char)
    end = position
    # Cached result? (None means not yet computed.)
    if (preceded_by_alpha := self.props.get(('preceded_by_alpha', end), None)) in (True, False):
        return not preceded_by_alpha
    # Inspect every edge that ends exactly at this position.
    for start in self.lattice[(end, 'left')]:
        for edge in self.lattice[(start, end)]:
            prev_letter = None if edge.txt == '' else edge.txt[-1]
            # For Braille input, a preceding apostrophe also counts as word-internal.
            if len(edge.txt) and (prev_letter.isalpha() or (first_char_is_braille and (prev_letter in ["'"]))):
                self.props[('preceded_by_alpha', position)] = True
                return False
    self.props[('preceded_by_alpha', position)] = False
    return True
def is_at_end_of_word(self, position: int) -> bool:
    """True iff no romanization rule starting at 'position' would contribute
    further letters to the current word.
    The result is cached in self.props under ('followed_by_alpha', position)."""
    if (cached_followed_by_alpha := self.props.get(('followed_by_alpha', position), None)) in (True, False):
        return not cached_followed_by_alpha
    start = position
    # Skip over nukta diacritics (nonspacing marks) before testing rules.
    while (start+1 < self.max_vertex) \
            and self.uroman.char_is_nonspacing_mark(self.s[start]) \
            and ('NUKTA' in self.uroman.chr_name(self.s[start])):
        start += 1
    # Grow candidate strings to the right as long as they remain a prefix of
    # some rule source ('s-prefix'); any applicable rule with a letter in its
    # romanization means the word continues.
    for end in range(start + 1, self.max_vertex + 1):
        s = self.s[start:end]
        if not self.uroman.dict_bool[('s-prefix', s)]:
            break
        for rom_rule in self.uroman.rom_rules[s]:
            rom = rom_rule['t']
            if (not rom_rule['use-only-at-start-of-word']) and regex.search(r'\pL', rom):
                self.props[('followed_by_alpha', position)] = True
                return False
    self.props[('followed_by_alpha', position)] = False
    return True
def is_at_end_of_syllable(self, position: int) -> Tuple[bool, str]:
    """At least initially for Thai
    Decides whether 'position' is a syllable boundary. Returns a tuple
    (decision, reason); the reason string is for diagnostics only."""
    prev_char = self.s[position-2] if position >= 2 else None
    # char = self.s[position-1] if position >= 1 else None
    next_char = self.s[position] if position < self.max_vertex else None
    # Skip over a tone mark when inspecting the following character.
    if self.uroman.dict_str[('tone-mark', next_char)]:
        adj_position = position + 1
        next_char = self.s[adj_position] if adj_position < self.max_vertex else None
        # print('TONE-MARK', position, next_char)
    else:
        adj_position = position
    next_char2 = self.s[adj_position + 1] if adj_position + 1 < self.max_vertex else None
    if prev_char is None:
        return False, 'start-of-string'
    if not regex.search(r'(?:\pL|\pM)$', prev_char):  # start of token
        return False, 'start-of-token'
    # Thai has vowels that are written before, but spoken after, a consonant;
    # such a vowel on the left means the syllable is still open.
    if self.uroman.dict_str[('syllable-info', prev_char)] == 'written-pre-consonant-spoken-post-consonant':
        return False, 'pre-post-vowel-on-left'
    if self.uroman.dict_str[('syllable-info', next_char)] == 'written-pre-consonant-spoken-post-consonant':
        return True, 'pre-post-vowel-on-right'
    if adj_position >= self.max_vertex:  # end of string
        return True, 'end-of-string'
    # if not self.char_is_letter_or_vowel_sign(next_char):  # end of token
    if not regex.match(r'(?:\pL|\pM)', next_char):  # end of token
        return True, 'end-of-token'
    if position > 0:
        left_edge = self.best_left_neighbor_edge(position-1)
        if left_edge and regex.search(r'[bcdfghjklmnpqrstvxz]$', left_edge.txt):
            return False, 'consonant-to-the-left'
    # Tentatively romanize the next one or two characters to see whether a
    # vowel follows; a following consonant closes the syllable.
    next_char_rom = first_non_none(self.simple_top_romanization_candidate_for_span(adj_position,
                                                                                  adj_position + 2,
                                                                                  simple_search=True),
                                   self.simple_top_romanization_candidate_for_span(adj_position,
                                                                                   adj_position + 1,
                                                                                   simple_search=True),
                                   "?")
    if not regex.match(r"[aeiou]", next_char_rom.lower()):  # followed by consonant
        return True, f'not-followed-by-vowel {next_char_rom}'
    if (next_char == '\u0E2D') and (next_char2 is not None):  # THAI CHARACTER O ANG
        next_char2_rom = first_non_none(self.simple_top_romanization_candidate_for_span(adj_position+1,
                                                                                        adj_position+2,
                                                                                        simple_search=True),
                                        "?")
        if regex.match(r"[aeiou]", next_char2_rom.lower()):
            return True, 'o-ang-followed-by-vowel'  # In that context Thai char. "o ang" is considered a consonant
    return False, 'not-at-syllable-end-by-default'
def romanization_by_first_rule(self, s) -> str | None: | |
try: | |
return self.uroman.rom_rules[s][0]['t'] | |
except IndexError: | |
return None | |
def expand_rom_with_special_chars(self, rom: str, start: int, end: int, **args) \
        -> Tuple[str, int, int, str | None]:
    """This method contains a number of special romanization heuristics that typically modify
    an existing or preliminary edge based on context.

    Returns (rom, start, end, annotation): the possibly modified romanization,
    its possibly widened span, and an annotation such as 'rom exp' (expanded)
    or 'rom del' (deleted), or None if nothing was changed.
    """
    orig_start = start
    uroman = self.uroman
    full_string = self.s
    annot = None
    if rom == '':
        return rom, start, end, None
    prev_char = (full_string[start-1] if start >= 1 else '')
    first_char = full_string[start]
    last_char = full_string[end-1]
    next_char = (full_string[end] if end < len(full_string) else '')
    # \u2820 is the Braille character indicating that the next letter is upper case
    if (prev_char == '\u2820') and regex.match(r'[a-z]', rom):
        return rom[0].upper() + rom[1:], start-1, end, 'rom exp'
    # Normalize multi-upper case THessalonike -> Thessalonike, but don't change THESSALONIKE
    if start+1 == end and rom.isupper() and next_char.islower():
        ablation = args.get('ablation', '')  # VERBOSE
        if not ('nocap' in ablation):
            rom = rom.capitalize()
    # Japanese small tsu (and Gurmukhi addak) used as consonant doubler:
    # NOTE(review): the script-name condition below compares prev_char's script
    # with itself and is therefore always True; possibly first_char's script
    # was intended — confirm before changing.
    if (prev_char and prev_char in 'っッ\u0A71') \
            and (uroman.chr_script_name(prev_char) == uroman.chr_script_name(prev_char)) \
            and (m_double_consonant := regex.match(r'(ch|[bcdfghjklmnpqrstwz])', rom)):
        # return m_double_consonant.group(1).replace('ch', 't') + rom, start-1, end, 'rom exp'
        # expansion might additional apply to the right
        if prev_char in 'っッ':  # for Japanese, per Hepburn, use tch
            rom = m_double_consonant.group(1).replace('ch', 't') + rom
        else:
            rom = m_double_consonant.group(1).replace('ch', 'c') + rom
        start = start-1
        first_char = full_string[start]
        prev_char = (full_string[start-1] if start >= 1 else '')
    # Thai
    if uroman.chr_script_name(first_char) == 'Thai':
        # A bare consonant may combine with a written-before/spoken-after
        # vowel on its left plus trailing vowel characters on its right.
        if (start+1 == end) and regex.match(r'[bcdfghjklmnpqrstvwxyz]+$', rom):
            if uroman.dict_str[('syllable-info', prev_char)] == 'written-pre-consonant-spoken-post-consonant':
                for vowel_prefix_len in [1]:
                    if vowel_prefix_len <= start:
                        for vowel_suffix_len in [3, 2, 1]:
                            if end + vowel_suffix_len <= len(full_string):
                                # Pattern with a dash placeholder for the consonant,
                                # looked up in the rule table.
                                pattern = (full_string[start-vowel_prefix_len: start]
                                           + '–'
                                           + full_string[end:end+vowel_suffix_len])
                                if uroman.rom_rules[pattern]:
                                    vowel_rom_rule = uroman.rom_rules[pattern][0]
                                    vowel_rom = vowel_rom_rule['t']
                                    # print(f"  PATTERN {pattern} ({full_string[start:end]}/{rom}) {rom}{vowel_rom}")
                                    return rom + vowel_rom, start-vowel_prefix_len, end+vowel_suffix_len, 'rom exp'
        # Simple case: pre-written vowel directly to the left of a consonant.
        if (uroman.chr_script_name(prev_char) == 'Thai') \
                and (uroman.dict_str[('syllable-info', prev_char)]
                     == 'written-pre-consonant-spoken-post-consonant') \
                and regex.match(r'[bcdfghjklmnpqrstvwxyz]', rom) \
                and (vowel_rom := self.romanization_by_first_rule(prev_char)):
            return rom + vowel_rom, start-1, end, 'rom exp'
        # THAI CHARACTER O ANG
        if (first_char == '\u0E2D') and (end - start == 1):
            prev_script = uroman.chr_script_name(prev_char)
            next_script = uroman.chr_script_name(next_char)
            prev_rom = self.find_rom_edge_path_backwards(0, start, 1, return_str=True)
            next_rom = self.romanization_by_first_rule(next_char)
            # if not recursive:
            #     lc = uroman.romanize_string(full_string[:start], lcode=self.lcode, recursive=True)
            #     rc = uroman.romanize_string(full_string[end:], lcode=self.lcode, recursive=True)
            #     print('PP', start, end, prev_script, next_script, prev_rom, next_rom, ' LC:', lc[-40:],
            #           ' RC:', rc[:40])
            # delete THAI CHARACTER O ANG unless it is surrounded on both sides by a Thai consonant
            if not ((prev_script == 'Thai') and (next_script == 'Thai')
                    and regex.match(r'[bcdfghjklmnpqrstvwxz]+$', prev_rom)
                    and regex.match(r'[bcdfghjklmnpqrstvwxz]+$', next_rom)):
                # if not recursive:
                #     print(f'* DELETE O ANG {first_char} {start}-{end} LC: {lc[-40:]} RC: {rc[:40]}')
                return '', start, end, 'rom del'
    # Coptic: consonant + grace-accent = e + consonant
    if next_char and (next_char == "\u0300") and (uroman.chr_script_name(last_char) == "Coptic")\
            and (not self.simple_top_romanization_candidate_for_span(orig_start, end+1)):
        rom = 'e' + rom
        end = end+1
        last_char = full_string[end - 1]
        next_char = (full_string[end] if end < len(full_string) else '')
        annot = 'rom exp'
    # Japanese small y: ki + small ya = kya etc.
    if (next_char and next_char in 'ゃゅょャュョ') \
            and (uroman.chr_script_name(last_char) == uroman.chr_script_name(next_char)) \
            and regex.search(r'([bcdfghjklmnpqrstvwxyz]i$)', rom) \
            and (y_rom := self.romanization_by_first_rule(next_char)) \
            and (not self.simple_top_romanization_candidate_for_span(orig_start, end+1)) \
            and (not self.simple_top_romanization_candidate_for_span(start, end+1)):
        rom = rom[:-1] + y_rom
        end = end+1
        last_char = full_string[end - 1]
        next_char = (full_string[end] if end < len(full_string) else '')
        annot = 'rom exp'
    # Japanese vowel lengthener (U+30FC)
    last_rom_char = last_chr(rom)
    if (next_char == 'ー') \
            and (uroman.chr_script_name(last_char) in ('Hiragana', 'Katakana')) \
            and (last_rom_char in 'aeiou'):
        return rom + last_rom_char, start, end+1, 'rom exp'
    # Virama (in Indian languages)
    if self.uroman.dict_bool[('is-virama', next_char)]:
        return rom, start, end + 1, "rom exp"
    # Trim a leading/trailing space that would be redundant in context.
    if rom.startswith(' ') and ((start == 0) or (prev_char == ' ')):
        rom = rom[1:]
    # NOTE(review): 'end == len(full_string)+1' looks off by one (an edge end
    # should not exceed len(full_string)) — confirm intended condition.
    if rom.endswith(' ') and ((end == len(full_string)+1) or (next_char == ' ')):
        rom = rom[:-1]
    return rom, start, end, annot
def prep_braille(self, **_args) -> None:
    """Mark positions made upper case by Braille capitalization indicators.
    Two consecutive U+2820 cells switch all-caps mode on for the following
    word; a Braille space (U+2800) switches it off. Marked positions are
    recorded in self.props under ('is-upper', i)."""
    if not self.contains_script['Braille']:
        return
    dots6 = '\u2820'  # characters in following word are upper case
    all_caps = False
    for i, c in enumerate(self.s):
        if i >= 1 and self.s[i-1] == dots6 and c == dots6:
            all_caps = True
        elif all_caps:
            if c in '\u2800':  # Braille space
                all_caps = False
            else:
                self.props[('is-upper', i)] = True
def pick_tibetan_vowel_edge(self, **args) -> None:
    """For each Tibetan syllable, decide which letter position carries the
    (possibly implicit) vowel, recording decisions in self.props under
    ('edge-vowel', i) (True/False) and ('edge-delete', i)."""
    if not self.contains_script['Tibetan']:
        return None
    verbose = bool(args.get('verbose'))
    s = self.s
    uroman = self.uroman
    tibetan_syllable = []
    tibetan_letter_positions = []
    # Collect maximal runs of Tibetan letters/vowel signs; each run is
    # treated as one syllable.
    for start in range(self.max_vertex):
        c = s[start]
        if (uroman.chr_script_name(c) == 'Tibetan') and self.char_is_letter_or_vowel_sign(c):
            tibetan_letter_positions.append(start)
        else:
            if tibetan_letter_positions:
                tibetan_syllable.append(tibetan_letter_positions)
                tibetan_letter_positions = []
    if tibetan_letter_positions:
        tibetan_syllable.append(tibetan_letter_positions)
    for tibetan_letter_positions in tibetan_syllable:
        vowel_pos = None
        orig_txt = ''
        roms = []
        subjoined_letter_positions = []
        first_letter_position = tibetan_letter_positions[0]
        # Pass 1: classify each letter; an explicit vowel (sign or vowel
        # romanization) fixes the vowel position directly.
        for i in tibetan_letter_positions:
            c = s[i]
            orig_txt += c
            rom = first_non_none(self.simple_top_romanization_candidate_for_span(i, i+1), "?")
            self.props[('edge-vowel', i)] = None
            if self.char_is_vowel_sign(c) or (rom and regex.match(r"[aeiou]+$", rom)):
                vowel_pos = i
                self.props[('edge-vowel', i)] = True
                # delete any syllable initial ' before vowel
                if roms == ["'"]:
                    self.props[('edge-delete', i-1)] = True
            elif self.char_is_subjoined_letter(c):
                subjoined_letter_positions.append(i)
                if i > first_letter_position:
                    if c == "\u0FB0":
                        vowel_pos = i-1
                        self.props[('edge-vowel', i-1)] = True
                    else:
                        self.props[('edge-vowel', i-1)] = False
                # Strip the inherent 'a' from the consonant's romanization.
                rom = regex.sub(r'([bcdfghjklmnpqrstvwxyz].*)a$', r'\1', rom)
            elif c == "\u0F60":  # Tibetan letter -a (')
                self.props[('edge-vowel', i)] = False
                if i > first_letter_position:
                    vowel_pos = i-1
                    self.props[('edge-vowel', i-1)] = True
                    if i == tibetan_letter_positions[-1]:
                        self.props[('edge-delete', i)] = True
                if roms and not (roms[-1] in "aeiou"):
                    rom = "a'"
                else:
                    rom = "'"
            else:
                rom = regex.sub(r'([bcdfghjklmnpqrstvwxyz].*)a$', r'\1', rom)
            roms.append(rom)
        if vowel_pos is not None:
            # An explicit vowel was found: all undecided positions are non-vowels.
            for i in tibetan_letter_positions:
                if self.props.get(('edge-vowel', i)) is None:
                    self.props[('edge-vowel', i)] = False
        else:
            # No explicit vowel: score every candidate position for an
            # implicit 'a' and pick the cheapest.
            best_cost, best_vowel_pos, best_pre, best_post = math.inf, None, None, None
            n_letters = len(tibetan_letter_positions)
            for i in tibetan_letter_positions:
                rel_pos = i - first_letter_position
                pre, post = ''.join(roms[:rel_pos+1]), ''.join(roms[rel_pos+1:])
                if self.props.get(('edge-vowel', i)) is False:
                    cost = 20
                    if cost < best_cost:
                        best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post
                elif n_letters == 1:
                    cost = 0
                    if cost < best_cost:
                        best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post
                elif n_letters == 2:
                    # NOTE(review): compares the absolute position i with 0;
                    # rel_pos may have been intended — confirm.
                    cost = 0 if i == 0 else 0.1
                    if cost < best_cost:
                        best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post
                else:
                    # Whitelists of plausible syllable-final and syllable-initial
                    # consonant clusters (romanized).
                    # NOTE(review): 'el' in the prefix alternation below looks
                    # like a typo for 'rl' — confirm against Tibetan cluster data.
                    good_suffix = regex.match(r"(?:|[bcdfghjklmnpqrstvwxz]|bh|bs|ch|cs|dd|ddh|"
                                              r"dh|dz|dzh|gh|gr|gs|kh|khs|kss|n|nn|nt|ms|ng|ngs|ns|ph|"
                                              r"rm|sh|ss|th|ts|tsh|tt|tth|zh|zhs)'?$", post)
                    good_prefix = regex.match(r"'?(?:.|bd|br|brg|brgy|bs|bsh|bst|bt|bts|by|bz|bzh|"
                                              r"ch|db|dby|dk|dm|dp|dpy|dr|"
                                              r"gl|gn|gr|gs|gt|gy|gzh|kh|khr|khy|kr|ky|ld|lh|lt|mkh|mny|mth|mtsh|"
                                              r"ny|ph|phr|phy|rgy|rk|el|rn|rny|rt|rts|"
                                              r"sk|skr|sky|sl|sm|sn|sny|sp|spy|sr|st|th|ts|tsh)$", pre)
                    subjoined_suffix = all([x in subjoined_letter_positions
                                            for x in tibetan_letter_positions[rel_pos+2:]])
                    # print('GOOD', good_suffix, good_prefix, subjoined_suffix, f'{pre}a{post}',
                    #       subjoined_letter_positions, tibetan_letter_positions[rel_pos+2:])
                    if good_suffix and good_prefix:
                        cost = len(pre) * 0.1
                    elif good_suffix:
                        cost = len(pre)
                    elif subjoined_suffix and good_prefix:
                        cost = len(pre) * 0.3
                    elif subjoined_suffix:
                        cost = len(pre) * 0.5
                    else:
                        cost = math.inf
                    if cost < best_cost:
                        best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post
            if best_vowel_pos is not None:
                for i in tibetan_letter_positions:
                    if self.props.get(('edge-vowel', i)) is None:
                        value = (i == best_vowel_pos)
                        self.props[('edge-vowel', i)] = value
            if verbose:
                best_cost = best_cost if isinstance(best_cost, int) else round(best_cost, 2)
                sys.stderr.write(f'Tib. best cost: "{best_pre}a{best_post}" o:{orig_txt} c:{round(best_cost, 2)}'
                                 f' p:{best_vowel_pos} {tibetan_letter_positions}\n')
def add_default_abugida_vowel(self, rom: str, start: int, end: int, annotation: str = '') -> str:
    """Adds an abugida vowel (e.g. "a") where needed. Important for many languages in South Asia.

    Given a consonant(-cluster) romanization rom for the span [start, end) of self.s,
    decide whether the script's inherent (default) vowel must be appended, based on the
    surrounding characters (vowel signs, viramas, subjoined letters, word boundaries).
    Returns the possibly modified romanization; on any unexpected error the input rom
    is returned unchanged (deliberate best-effort behavior).
    """
    uroman = self.uroman
    s = self.s
    try:
        first_s_char = s[start]
        last_s_char = s[end-1]
        script_name = uroman.chr_script_name(first_s_char)
        script = self.uroman.scripts[script_name.lower()]
        # Only abugida scripts declare default vowels; others need no change.
        if not (abugida_default_vowels := script['abugida-default-vowels']):
            return rom
        # Cache maps (script, rom) -> (base consonant rom, base+vowel rom, normalized rom).
        key = (script, rom)
        if key in uroman.abugida_cache:
            base_rom, base_rom_plus_vowel, mod_rom = uroman.abugida_cache[key]
            rom = mod_rom
        else:
            vowels_regex1 = '|'.join(abugida_default_vowels)  # e.g. 'a' or 'a|o'
            vowels_regex2 = '|'.join(map(lambda x: x + '+', abugida_default_vowels))  # e.g. 'a+' or 'a+|o+'
            # Case 1: rom already ends in (repeated) default vowel after a y-cluster.
            if m := regex.match(fr'([cfghkmnqrstxy]?y)({vowels_regex2})-?$', rom):
                base_rom = m.group(1)
                base_rom_plus_vowel = base_rom + m.group(2)
            # Case 2: rom is consonant(s) + a single default vowel (optionally trailing '-').
            elif m := regex.match(fr'([bcdfghjklmnpqrstvwxyz]+)({vowels_regex1})-?$', rom):
                base_rom = m.group(1)
                base_rom_plus_vowel = base_rom + m.group(2)
                if rom.endswith('-') and (start+1 == end) and rom[0].isalpha():
                    rom = rom[:-1]
            # Case 3: rom is taken as the bare base; synthesize base+default-vowel.
            else:
                base_rom = rom
                base_rom_plus_vowel = base_rom + abugida_default_vowels[0]
                # Only pure consonant strings (or Tibetan apostrophe) qualify as a base.
                if (not regex.match(r"[bcdfghjklmnpqrstvwxyz]+$", base_rom)
                        and (not ((script_name == 'Tibetan') and (base_rom == "'")))):
                    base_rom, base_rom_plus_vowel = None, None
            uroman.abugida_cache[key] = (base_rom, base_rom_plus_vowel, rom)
        if base_rom is None:
            return rom
        # Syllable-tail romanizations (annotation 'rom tail') never take the vowel.
        if 'tail' in annotation:
            return rom
        prev_s_char = s[start-1] if start >= 1 else ''
        next_s_char = s[end] if len(s) > end else ''
        next2_s_char = s[end+1] if len(s) > end+1 else ''
        # Tibetan vowel placement was precomputed into self.props ('edge-vowel'/'edge-delete').
        if script_name == 'Tibetan':
            if self.props.get(('edge-delete', start)):
                return ''
            elif self.props.get(('edge-vowel', start)):
                return base_rom_plus_vowel
            else:
                return base_rom
        if (next_s_char and ((base_rom in "bcdfghklmnpqrstvwz") or (base_rom in ["ng"]))
                and (next_s_char in "យ")):  # Khmer yo
            return base_rom
        # An explicit vowel sign, medial-consonant sign, or subjoined letter
        # on the next character suppresses the inherent vowel.
        if self.uroman.dict_bool[('is-vowel-sign', next_s_char)]:
            return base_rom
        if self.uroman.dict_bool[('is-medial-consonant-sign', next_s_char)]:
            return base_rom
        if self.char_is_subjoined_letter(next_s_char):
            return base_rom
        if self.uroman.char_is_nonspacing_mark(next_s_char) \
                and self.uroman.dict_bool[('is-vowel-sign', next2_s_char)]:
            return base_rom
        # A virama (vowel killer) after the consonant suppresses the vowel ...
        if self.uroman.dict_bool[('is-virama', next_s_char)]:
            return base_rom
        if self.uroman.char_is_nonspacing_mark(next_s_char) \
                and self.uroman.dict_bool[('is-virama', next2_s_char)]:
            return base_rom
        # ... while a virama *before* it means this consonant carries the vowel.
        if self.uroman.dict_bool[('is-virama', prev_s_char)]:
            return base_rom_plus_vowel
        if self.is_at_start_of_word(start) and not regex.search('r[aeiou]', rom):
            return base_rom_plus_vowel
        # delete many final schwas from most Devanagari languages (except: Sanskrit)
        if self.is_at_end_of_word(end):
            if (script_name in ("Devanagari",)) and (self.lcode not in ('san',)):  # Sanskrit
                return rom
            else:
                return base_rom_plus_vowel
        if uroman.chr_script_name(prev_s_char) != script_name:
            return base_rom_plus_vowel
        if 'VOCALIC' in self.uroman.chr_name(last_s_char):
            return base_rom
        if uroman.chr_script_name(next_s_char) == script_name:
            return base_rom_plus_vowel
    except Exception:
        # Best-effort: index/key errors on unusual input leave rom unchanged.
        return rom
    else:
        pass
    # print('ABUGIDA', rom, start, script_name, script, abugida_default_vowels, prev_s_char, next_s_char)
    return rom
def cand_is_valid(self, rom_rule: RomRule, start: int, end: int, rom: str) -> bool:
    """Decide whether rom_rule may be applied to the span [start, end) with
    romanization rom.

    A rule is rejected when rom is missing, when one of its positional
    restrictions (start/end/whole word) is violated, or when the rule is
    limited to languages that do not include the current one.
    """
    if rom is None:
        return False
    # Positional restrictions; each word-boundary test only runs when
    # the rule actually declares the corresponding restriction.
    if rom_rule['dont-use-at-start-of-word']:
        if self.is_at_start_of_word(start):
            return False
    if rom_rule['use-only-at-start-of-word']:
        if not self.is_at_start_of_word(start):
            return False
    if rom_rule['dont-use-at-end-of-word']:
        if self.is_at_end_of_word(end):
            return False
    if rom_rule['use-only-at-end-of-word']:
        if not self.is_at_end_of_word(end):
            return False
    if rom_rule['use-only-for-whole-word']:
        if not (self.is_at_start_of_word(start) and self.is_at_end_of_word(end)):
            return False
    # Language restriction, if any.
    lcodes = rom_rule['lcodes']
    if lcodes and self.lcode not in lcodes:
        return False
    return True
# @profile | |
def simple_sorted_romanization_candidates_for_span(self, start, end) -> List[str]:
    """Return all valid romanizations for the span [start, end),
    most restricted (i.e. most specific) rules first."""
    span_txt = self.s[start:end]
    # Fast pre-check: the span must be a known romanization-rule prefix.
    if not self.uroman.dict_bool[('s-prefix', span_txt)]:
        return []
    scored = [(rule['n-restr'] or 0, rule['t'])
              for rule in self.uroman.rom_rules[span_txt]
              if self.cand_is_valid(rule, start, end, rule['t'])]
    return [txt for _, txt in sorted(scored, reverse=True)]
def simple_top_romanization_candidate_for_span(self, start, end, simple_search: bool = False) -> str | None:
    """Return the single best romanization for the span [start, end), or None.

    Among all valid rules the one with the most restrictions wins (first such
    rule on a tie). Unless simple_search is set, the result may be overridden
    by the rule's end-of-syllable variant and is cached per span.
    """
    if (start < 0) or (end > self.max_vertex):
        return None
    key = (start, end)
    cached = self.simple_top_rom_cache.get(key)
    if cached is not None:
        return cached
    top_rule, top_n_restr = None, None
    for rule in self.uroman.rom_rules[self.s[start:end]]:
        if not self.cand_is_valid(rule, start, end, rule['t']):
            continue
        n_restr = rule['n-restr'] or 0
        # Strict '>' keeps the first rule on ties.
        if (top_n_restr is None) or (n_restr > top_n_restr):
            top_rule, top_n_restr = rule, n_restr
    top_cand = top_rule['t'] if top_rule else None
    if simple_search:
        # Plain lookup: no end-of-syllable adjustment, no caching.
        return top_cand
    if top_rule:
        eos_variant = top_rule['t-at-end-of-syllable']
        if eos_variant is not None:
            at_eos, _rationale = self.is_at_end_of_syllable(end)
            if at_eos:
                top_cand = eos_variant
    self.simple_top_rom_cache[key] = top_cand
    return top_cand
def decomp_rom(self, char_position: int) -> str | None:
    """Romanize a character through its Unicode decomposition, if suitable.
    Input: decomposable character such as ﻼ or ½
    Output: la or 1/2"""
    s = self.s
    c = s[char_position]
    decomposition = ud.decomposition(c)
    if not decomposition:
        return None
    format_elems, unparsed_elems, mapped = [], [], ''
    for elem in decomposition.split():
        if elem.startswith("<"):
            format_elems.append(elem)  # formatting tag such as <fraction>
            continue
        try:
            mapped += chr(int(elem, 16))
        except ValueError:
            unparsed_elems.append(elem)
    rom = None
    # Only decompositions with a usable formatting tag and fully parsed
    # code points are romanized (e.g. <fraction>, but not <super>/<sub>).
    if (mapped and (not unparsed_elems) and format_elems
            and (format_elems[0] not in ('<super>', '<sub>', '<noBreak>', '<compat>'))):
        rom = self.uroman.romanize_string(mapped, self.lcode)
    # make sure to add a space for 23½ -> 23 1/2
    if rom and ud.numeric(c, None):
        rom = rom.replace('⁄', '/')
        if char_position >= 1 and ud.numeric(s[char_position-1], None):
            rom = ' ' + rom
        if (char_position+1 < len(s)) and ud.numeric(s[char_position+1], None):
            rom += ' '
    return rom
def add_romanization(self, **args):
    """Adds a romanization edge to the romanization lattice.

    For every span [start, end) whose surface string is a known rule prefix,
    the best simple romanization candidate is added (after abugida-vowel and
    special-character post-processing). Single Hangul characters and Unicode
    decompositions get additional edges.
    """
    # Bug fix: the original passed recursive=args.get('recursive', False)
    # *and* **args to expand_rom_with_special_chars, which raises
    # "TypeError: got multiple values for keyword argument 'recursive'"
    # whenever the caller actually supplies 'recursive'. Pop it once instead.
    recursive = args.pop('recursive', False)
    for start in range(self.max_vertex):
        for end in range(start+1, self.max_vertex+1):
            # Stop extending the span once it can no longer be a rule prefix.
            if not self.uroman.dict_bool[('s-prefix', self.s[start:end])]:
                break
            if (rom := self.simple_top_romanization_candidate_for_span(start, end)) is not None:
                # Braille letters carry no case; restore recorded upper case.
                if self.contains_script['Braille'] and (start+1 == end):
                    if self.props.get(('is-upper', start)):
                        rom = rom.upper()
                edge_annotation = 'rom'
                # A leading '+' marks a syllable-tail romanization (e.g. '+ng').
                if regex.match(r'\+(m|ng|n|h|r)', rom):
                    rom, edge_annotation = rom[1:], 'rom tail'
                rom = self.add_default_abugida_vowel(rom, start, end, annotation=edge_annotation)
                rom, start2, end2, exp_edge_annotation \
                    = self.expand_rom_with_special_chars(rom, start, end, annotation=edge_annotation,
                                                         recursive=recursive, **args)
                edge_annotation = exp_edge_annotation or edge_annotation
                self.add_edge(Edge(start2, end2, rom, edge_annotation))
        if start < len(self.s):
            char = self.s[start]
            cp = ord(char)
            # Korean Hangul characters
            if 0xAC00 <= cp <= 0xD7A3:
                if rom := self.uroman.unicode_hangul_romanization(char):
                    self.add_edge(Edge(start, start+1, rom, 'rom'))
            # character decomposition
            if rom_decomp := self.decomp_rom(start):
                self.add_edge(Edge(start, start + 1, rom_decomp, 'rom decomp'))
def update_edge_list(edges, new_edge, old_edges) -> List[NumEdge]: | |
new_edge_not_yet_added = True | |
result = [] | |
for edge in edges: | |
if edge in old_edges: | |
edge.active = False | |
if new_edge_not_yet_added: | |
result.append(new_edge) | |
new_edge_not_yet_added = False | |
else: | |
result.append(edge) | |
if new_edge_not_yet_added: | |
result.append(new_edge) | |
return result | |
@staticmethod
def edge_is_digit(edge: Edge | None) -> bool:
    """True iff edge is a single-character NumEdge whose value is a single
    decimal digit (0-9).

    Bug fix: decorated as @staticmethod because callers invoke it as
    self.edge_is_digit(edge); without the decorator the bound call would
    pass self as an extra argument and raise a TypeError.
    """
    return (isinstance(edge, NumEdge)
            and (edge.value is not None)
            and isinstance(edge.value, int)
            and (edge.type == 'digit')
            and (0 <= edge.value <= 9)
            and (edge.end - edge.start == 1))
@staticmethod
def is_gap_null_edge(edge: Edge) -> bool:
    """True iff edge is a NumEdge for one of the zero characters '零'/'〇'
    (used as a gap marker when combining number blocks).

    Bug fix: decorated as @staticmethod because callers invoke it as
    self.is_gap_null_edge(edge); without the decorator the bound call would
    pass self as an extra argument and raise a TypeError.
    """
    return isinstance(edge, NumEdge) and (edge.orig_txt in ('零', '〇'))
def braille_digit(char: str) -> str | None: | |
position = '\u281A\u2801\u2803\u2809\u2819\u2811\u280B\u281B\u2813\u280A'.find(char) # Braille 0-9 | |
return str(position) if position >= 0 else None | |
def add_braille_number(self, start: int, end: int, txt: str, **_args) -> None:
    """Add a NumEdge of type 'number' for a Braille number spanning [start, end)."""
    number_edge = NumEdge(start, end, txt, self.uroman)
    number_edge.type = 'number'
    self.add_edge(number_edge)
def add_braille_numbers(self, **_args):
    """Scan a Braille string for number sequences (introduced by the Braille
    number mark U+283C) and add one number edge per sequence found."""
    if not self.contains_script['Braille']:
        return
    text = self.s
    digits, num_start = '', None
    for pos, char in enumerate(text):
        if char == '\u283C':  # number mark
            if num_start is None:
                num_start = pos
        elif num_start is None:
            continue  # not inside a number
        elif digit := self.braille_digit(char):
            digits += digit
        elif char == '\u2832':  # period
            digits += '.'
        elif char == '\u2802':  # comma
            digits += ','
        else:
            # Non-numeric cell ends the current number.
            self.add_braille_number(num_start, pos, digits)
            digits, num_start = '', None
    if num_start is not None:
        self.add_braille_number(num_start, len(text), digits)
def add_numbers(self, uroman, **args):
    """Adds a numerical romanization edge to the romanization lattice, currently just for digits.
    To be significantly expanded to cover complex Chinese, Egyptian, Amharic numbers.

    Stages (each may merge previously built NumEdges into larger ones):
      seed: one NumEdge per character with known numeric properties
      D1:   digit sequences, incl. a decimal point (1234, 12.5)
      G1:   single digit * power unit (2*100=200, 3*10=30)
      G2:   sums of such blocks within a power block (200+30+4=234)
      G3:   block * large power (234*1000=234000)
      G4:   sums of large-power blocks (234000+567=234567)
      F1:   spacing/separator cushioning; then deactivation of implausible
            single-character readings; finally plain digit fallback edges.
    """
    verbose = bool(args.get('verbose'))
    s = self.s
    num_edges = []
    # Seed: one NumEdge per character that has numeric properties.
    for start in range(len(s)):
        char = s[start]
        if uroman.num_props[char]:
            new_edge = NumEdge(start, start + 1, char, uroman)
            num_edges.append(new_edge)
            if verbose:
                print('NumEdge', new_edge)
            self.add_edge(new_edge)
    # D1 sequence of digits 1234
    for edge in num_edges:
        if self.edge_is_digit(edge) and edge.active:  # and (edge.value != 0):
            n_decimal_points = 0
            n_decimals = None  # number of digits seen after the decimal point
            new_value_s = str(edge.value)
            sub_edges = [edge]
            prev_edge = edge
            while True:
                right_edge = self.best_right_neighbor_edge(prev_edge.end)
                if self.edge_is_digit(right_edge):
                    # Extend the digit sequence by one digit.
                    sub_edges.append(right_edge)
                    new_value_s += str(right_edge.value)
                    if n_decimals is not None:
                        n_decimals += 1
                    prev_edge = right_edge
                elif ((prev_edge.end < len(s)) and (s[prev_edge.end] == '.') and (n_decimal_points == 0)
                        and (right_edge2 := self.best_right_neighbor_edge(prev_edge.end + 1))
                        and self.edge_is_digit(right_edge2)):
                    # A single '.' followed by a digit: treat as decimal point.
                    if right_edge is None:
                        right_edge = Edge(prev_edge.end, prev_edge.end+1, s[prev_edge.end],
                                          'decimal period')
                        self.add_edge(right_edge)
                    sub_edges.append(right_edge)
                    sub_edges.append(right_edge2)
                    new_value_s += '.' + str(right_edge2.value)
                    n_decimal_points += 1
                    n_decimals = 1
                    prev_edge = right_edge2
                else:
                    break
            if len(sub_edges) >= 2:
                new_value = float(new_value_s) if '.' in new_value_s else int(new_value_s)
                new_edge = NumEdge(sub_edges[0].start, sub_edges[-1].end, str(new_value), uroman, active=True)
                new_edge.update(value=new_value, value_s=new_value_s, n_decimals=n_decimals, num_base=1,
                                e_type='D1', script=sub_edges[-1].script)
                self.add_edge(new_edge)
                num_edges = self.update_edge_list(num_edges, new_edge, sub_edges)
                if verbose:
                    print(new_edge.type, new_edge)
    # G1 combine (*) "single digits" 2*100=200, 3*10= 30
    for edge in num_edges:
        if (isinstance(edge, NumEdge) and edge.active and (edge.num_base == 1)
                and isinstance(edge.value, int) and (edge.value >= 1)):
            right_edge = self.best_right_neighbor_edge(edge.end, skip_num_edge=False)
            # Right neighbor must be a small power unit (10, 100, ...), not a large power.
            if (right_edge
                    and isinstance(right_edge, NumEdge)
                    and right_edge.active
                    and isinstance(right_edge.value, int)
                    and (right_edge.num_base > 1)
                    and (not right_edge.is_large_power)):
                new_value = edge.value * right_edge.value
                new_edge = NumEdge(edge.start, right_edge.end, str(new_value), uroman, active=True)
                new_edge.update(value=new_value, num_base=right_edge.num_base, e_type='G1',
                                orig_txt=edge.orig_txt + right_edge.orig_txt,
                                script=right_edge.script)
                self.add_edge(new_edge)
                num_edges = self.update_edge_list(num_edges, new_edge, [edge, right_edge])
                if verbose:
                    print(new_edge.type, new_edge)
    # G2 combine (+) G1 "single digits" 200+30+4=234 (within larger blocks of 1000, 1000000)
    for edge in num_edges:
        if isinstance(edge, NumEdge) and edge.active and isinstance(edge.value, int) and not edge.is_large_power:
            sub_edges = [edge]
            prev_edge = edge
            prev_non_edge = edge  # None if (edge.orig_txt in '零') else prev_edge
            # Extend rightward while values strictly decrease in magnitude
            # (or a zero gap marker bridges the sequence).
            while (prev_edge
                    and (right_edge := self.best_right_neighbor_edge(prev_edge.end, skip_num_edge=False))
                    and isinstance(right_edge, NumEdge)
                    and right_edge.active
                    and isinstance(right_edge.value, int)
                    and (not right_edge.is_large_power)
                    and (self.is_gap_null_edge(prev_non_edge)
                         or ((prev_non_edge.num_base > right_edge.value)
                             and (prev_non_edge.num_base > right_edge.num_base)))):
                sub_edges.append(right_edge)
                prev_edge = right_edge
                if not self.is_gap_null_edge(right_edge):
                    prev_non_edge = right_edge  # last non-gap edge drives the magnitude check
            if len(sub_edges) >= 2:
                new_value = sum([e.value for e in sub_edges])
                new_edge = NumEdge(sub_edges[0].start, sub_edges[-1].end, str(new_value), uroman, active=True)
                new_edge.update(value=new_value, num_base=sub_edges[-1].num_base, e_type='G2',
                                orig_txt=''.join([e.orig_txt for e in sub_edges]),
                                script=sub_edges[-1].script)
                self.add_edge(new_edge)
                num_edges = self.update_edge_list(num_edges, new_edge, sub_edges)
                new_edge.type = 'G2'
                if verbose:
                    print(new_edge.type, new_edge)
    # G3 combine (*) G2 blocks with large powers, e.g. 234*1000 = 234000
    for edge in num_edges:
        if (isinstance(edge, NumEdge) and edge.active and (not edge.is_large_power)
                and (isinstance(edge.value, int) or isinstance(edge.value, float))):
            right_edge = self.best_right_neighbor_edge(edge.end, skip_num_edge=False)
            if (right_edge
                    and isinstance(right_edge, NumEdge)
                    and right_edge.active
                    and isinstance(right_edge.value, int)
                    and (right_edge.num_base > 1)
                    and right_edge.is_large_power):
                # round(...) guards against float artifacts (e.g. 2.5 * 1000).
                new_value = round(edge.value * right_edge.value, 5)
                if isinstance(new_value, float) and new_value.is_integer():
                    new_value = int(new_value)
                new_edge = NumEdge(edge.start, right_edge.end, str(new_value), uroman, active=True)
                new_edge.update(value=new_value, num_base=right_edge.num_base, e_type='G3',
                                orig_txt=edge.orig_txt + right_edge.orig_txt,
                                script=right_edge.script)
                self.add_edge(new_edge)
                num_edges = self.update_edge_list(num_edges, new_edge, [edge, right_edge])
                if verbose:
                    print(new_edge.type, new_edge)
    # G4 combine (+) G3 blocks 234000+567=234567
    for edge in num_edges:
        if isinstance(edge, NumEdge) and edge.active and isinstance(edge.value, int):
            sub_edges = [edge]
            while ((prev_edge := sub_edges[-1])
                    and (right_edge := self.best_right_neighbor_edge(prev_edge.end, skip_num_edge=False))
                    and isinstance(right_edge, NumEdge)
                    and right_edge.active
                    and isinstance(right_edge.value, int)
                    and (prev_edge.num_base > right_edge.value)
                    and (prev_edge.num_base > right_edge.num_base)):
                # CJK "digit tag" pattern: a bare trailing digit after a large
                # power block implies the next lower power (e.g. 1000 then 5
                # reads as 5*100); rewrite the right edge in place.
                if ((prev_edge.script == 'CJK')
                        and (prev_edge.num_base >= 1000)
                        and ('tag' not in prev_edge.type)
                        and regex.match('10+$', str(prev_edge.num_base))
                        and (1 <= right_edge.value <= 9)
                        and (right_edge.start + 1 == right_edge.end)):
                    new_num_base = prev_edge.num_base // 10
                    new_value = new_num_base * right_edge.value
                    # print('DIGIT TAG', prev_edge, right_edge, new_value)
                    right_edge.value = new_value
                    right_edge.num_base = new_num_base
                    right_edge.type = 'G4tag'
                sub_edges.append(right_edge)
            if len(sub_edges) >= 2:
                new_value = sum([e.value for e in sub_edges])
                new_edge = NumEdge(sub_edges[0].start, sub_edges[-1].end, str(new_value), uroman, active=True)
                new_edge.update(value=new_value, num_base=sub_edges[-1].num_base, e_type='G4',
                                orig_txt=''.join([e.orig_txt for e in sub_edges]),
                                script=sub_edges[-1].script)
                self.add_edge(new_edge)
                num_edges = self.update_edge_list(num_edges, new_edge, sub_edges)
                if verbose:
                    print(new_edge.type, new_edge)
    # F1
    for edge in num_edges:
        # cushion fractions with spaces as needed: e.g. 23½ -> 23 1/2 or 十一五 -> 11 5
        if isinstance(edge, NumEdge) and regex.match(r'\d', edge.txt):
            left_edge = self.best_left_neighbor_edge(edge.start)
            if left_edge and regex.search(r'\d$', left_edge.txt):
                if edge.fraction:
                    sep = ' '
                else:
                    sep = '·'  # middle dot separates adjacent non-fraction numbers
                edge.txt = sep + edge.txt
    # Deactivate implausible readings: a single character with a value > 1000,
    # and certain CJK numeral variants (financial/alternate forms).
    for edge in num_edges:
        if (isinstance(edge, NumEdge) and edge.active and (edge.value is not None)
                and (((edge.value > 1000) and (edge.start + 1 == edge.end))
                     or (edge.orig_txt in '兩參参伍陆陸什')
                     or (edge.orig_txt in ('京兆', )))):
            edge.active = False
    if verbose:  # or (num_edges and any([e.type in ['G1', 'G2', 'G3', 'G4'] for e in num_edges])):
        if num_edges:
            print('actives:')
            for num_edge in num_edges:
                print(num_edge)
    # Fallback: plain 'num' edges for digit characters not already covered
    # by a NumEdge; other numeric characters are only counted in the stats.
    for start in range(len(s)):
        start_char = s[start]
        if (best_edge := self.best_edge_in_span(start, start+1)) and isinstance(best_edge, NumEdge):
            continue
        if (num := ud_numeric(start_char)) is not None:
            name = self.uroman.chr_name(start_char)
            if ("DIGIT" in name) and isinstance(num, int) and (0 <= num <= 9):
                # if start_char not in '0123456789': print('DIGIT', s[start], num, name)
                self.add_edge(Edge(start, start + 1, str(num), 'num'))
            else:
                uroman.stats[('*NUM', start_char, num)] += 1
def add_rom_fall_back_singles(self, **_args):
    """For characters in the original string not covered by romanizations and numbers,
    add a fallback edge based on type, romanization of single char, or original char."""
    for pos in range(self.max_vertex):
        if self.lattice[(pos, pos+1)]:
            continue  # span already has an edge
        char = self.s[pos]
        rom, annotation = char, 'orig'
        if self.uroman.char_is_nonspacing_mark(char):
            rom, annotation = '', 'Mn'
        elif self.uroman.char_is_format_char(char):  # e.g. zero-width non-joiner, zero-width joiner
            rom, annotation = '', 'Cf'
        elif ud.category(char) == 'Co':  # private-use character
            rom, annotation = '', 'Co'
        elif char == ' ':
            annotation = 'orig'
        # elif self.uroman.char_is_space_separator(rom):
        #     rom, edge_annotation = ' ', 'Zs'
        elif (single_rom := self.simple_top_romanization_candidate_for_span(pos, pos+1)) is not None:
            rom = single_rom
            # Strip the syllable-tail marker '+' if present.
            if regex.match(r'\+(m|ng|n|h|r)', rom):
                rom = rom[1:]
            annotation = 'rom single'
        # else the original values still hold: rom, annotation = char, 'orig'
        self.add_edge(Edge(pos, pos+1, rom, annotation))
@staticmethod
def add_new_edge(old_edges: List[Edge], start: int, end: int, new_rom: str, new_type: str, position: int | None,
                 old_edge_dict: dict) \
        -> None:
    """Add a new (alternative) edge to old_edges unless an equivalent edge
    (same span and romanization) is already present.

    The edge is inserted right after index position (appended when position
    is None); old_edge_dict is updated so duplicates are not added later.

    Bug fix: decorated as @staticmethod because callers invoke it as
    self.add_new_edge(old_edges, start, end, rom, type, position, dict)
    with exactly these seven arguments; without the decorator the bound
    call would pass self as an eighth argument and raise a TypeError.
    """
    if (start, end, new_rom) not in old_edge_dict:
        new_edge = Edge(start, end, new_rom, new_type)
        if position is None:
            old_edges.append(new_edge)
        else:
            old_edges.insert(position + 1, new_edge)
        old_edge_dict[(start, end, new_rom)] = new_edge
        # print(f'    ALT  {start}-{end} {new_rom}')
def add_alternatives(self, old_edges: List[Edge]) -> None:
    """For each non-alternative edge in old_edges, add alternative romanization
    edges ('rom-alt*') licensed by the matching rules: explicit alternatives
    (t-alts), the end-of-syllable variant, and the base form for an edge that
    already carries the end-of-syllable variant. New edges are inserted right
    after the edge they are alternatives of."""
    old_edge_dict = {}
    for old_edge in old_edges:
        old_edge_dict[(old_edge.start, old_edge.end, old_edge.txt)] = old_edge
    # Note: old_edges may grow while we iterate; enumerate continues into the
    # inserted elements, which are skipped by the 'rom-alt' test below.
    for position, old_edge in enumerate(old_edges):
        if old_edge.type.startswith('rom-alt'):
            continue  # an alternative added in this very loop
        start, end = old_edge.start, old_edge.end
        orig_s = self.s[start:end]
        old_rom = old_edge.txt
        for rom_rule in self.uroman.rom_rules[orig_s]:
            rom_t = rom_rule['t']
            if self.cand_is_valid(rom_rule, start, end, rom_t):
                rom_alts = rom_rule['t-alts']
                rom_eosyl = rom_rule['t-at-end-of-syllable']
                if (rom_t == old_rom) and rom_alts:
                    for rom_alt in rom_alts:
                        self.add_new_edge(old_edges, start, end, rom_alt, 'rom-alt', position,
                                          old_edge_dict)
                if (rom_t == old_rom) and rom_eosyl:
                    # Bug fix: this branch previously added rom_t, which is always
                    # a duplicate of the existing edge (old_edge_dict was seeded
                    # with (start, end, old_rom)), making the branch a no-op.
                    # The intended alternative is the end-of-syllable variant.
                    self.add_new_edge(old_edges, start, end, rom_eosyl, 'rom-alt2', position, old_edge_dict)
                if rom_eosyl == old_rom:
                    self.add_new_edge(old_edges, start, end, rom_t, 'rom-alt3', position, old_edge_dict)
def all_edges(self, start: int, end: int) -> List[Edge]:
    """Return all lattice edges fully contained in the span [start, end).

    Bug fix: the loop previously executed 'break' on the first right-neighbor
    vertex beyond end; since the vertices are iterated in *descending* order,
    that discarded every in-span edge whenever any longer out-of-span edge
    existed at the same start vertex. Out-of-span vertices are now skipped.
    """
    result = []
    for start2 in range(start, end):
        for end2 in sorted(list(self.lattice[(start2, 'right')]), reverse=True):
            if end2 <= end:
                result.extend(self.lattice[(start2, end2)])
    return result
def best_edge_in_span(self, start: int, end: int, skip_num_edge: bool = False) -> Edge | None:
    """Pick the preferred edge for the span [start, end): an active NumEdge wins
    outright (unless skip_num_edge); otherwise a rom/num edge is preferred over
    a decomposition edge, which is preferred over any other edge."""
    plan_b = None  # first rom/num edge
    plan_c = None  # first 'rom decomp' edge
    plan_d = None  # first edge of any other type
    for candidate in self.lattice[(start, end)]:
        if isinstance(candidate, NumEdge):
            if skip_num_edge:
                continue
            if candidate.active:
                return candidate  # plan A: active number edge wins outright
            # an inactive NumEdge still gets classified below
        if candidate.type.startswith('rom decomp'):
            if plan_c is None:
                plan_c = candidate
        elif regex.match(r'(?:rom|num)', candidate.type):
            if plan_b is None:
                plan_b = candidate
        elif plan_d is None:
            plan_d = candidate
    return plan_b or plan_c or plan_d
def best_right_neighbor_edge(self, start: int, skip_num_edge: bool = False) -> Edge | None:
    """Return the best edge leaving vertex start, preferring the longest span."""
    for end in sorted(list(self.lattice[(start, 'right')]), reverse=True):
        edge = self.best_edge_in_span(start, end, skip_num_edge=skip_num_edge)
        if edge:
            return edge
    return None
def best_left_neighbor_edge(self, end: int, skip_num_edge: bool = False) -> Edge | None:
    """Return the best edge arriving at vertex end, preferring the longest span
    (smallest start vertex first)."""
    for start in sorted(list(self.lattice[(end, 'left')])):
        edge = self.best_edge_in_span(start, end, skip_num_edge=skip_num_edge)
        if edge:
            return edge
    return None
def best_rom_edge_path(self, start: int, end: int, skip_num_edge: bool = False) -> List[Edge]:
    """Finds the best romanization edge path through the romanization lattice, including
    non-romanized pieces such as ASCII and non-ASCII punctuation."""
    path = []
    pos = start
    while pos < end:
        edge = self.best_right_neighbor_edge(pos, skip_num_edge=skip_num_edge)
        if edge:
            path.append(edge)
            pos = edge.end
        else:
            pos += 1  # should not happen: no edge leaves this vertex
    return path
def find_rom_edge_path_backwards(self, start: int, end: int, min_char: int | None = None,
                                 return_str: bool = False, skip_num_edge: bool = False) -> List[Edge] | str:
    """Finds a partial best path on the left from a start position to provide left contexts for
    romanization rules. Can return a string or a list of edges. Is typically used for a short context,
    as specified by min_char."""
    result_edges = []  # edges collected right-to-left
    rom = ''           # accumulated romanization of the collected edges
    end2 = end         # current right boundary; moves leftward
    while start < end2:
        old_end2 = end2
        if new_edge := self.best_left_neighbor_edge(end2, skip_num_edge=skip_num_edge):
            result_edges = [new_edge] + result_edges
            rom = new_edge.txt + rom
            end2 = new_edge.start
            if min_char and len(rom) >= min_char:
                break  # enough left context collected
        # NOTE(review): this guard looks intended to force progress when no edge
        # was found (end2 unchanged), but 'old_end2 >= end2' is also true after a
        # successful step (end2 only ever decreases), so end2 is decremented
        # unconditionally, skipping the vertex a consumed edge ends at --
        # verify whether 'end2 >= old_end2' was intended.
        if old_end2 >= end2:
            end2 -= 1
    if return_str:
        return rom
    else:
        return result_edges
def edge_path_to_surf(edges) -> str:
    """Concatenate the romanized text of the edges along a path.

    Idiom/performance: uses str.join instead of repeated '+=' (which is
    quadratic in the worst case); behavior is unchanged.
    """
    return ''.join(edge.txt for edge in edges)
# @timer | |
def main() -> None:
    """This function provides a user interface, either using argparse for a command line interface,
    or providing direct function calls.
    First, a uroman object will have to created, loading uroman data (directory must be provided,
    listed as default). This only needs to be done once.
    After that you can romanize from file to file, or just romanize a string."""
    # Compute data_dir based on the location of this executable script.
    # Layout assumed: <root>/src/uroman.py with resources in <root>/data.
    src_dir = os.path.dirname(os.path.realpath(__file__))
    root_dir = os.path.dirname(src_dir)
    data_dir = os.path.join(root_dir, "data")
    # print(src_dir, root_dir, data)
    parser = argparse.ArgumentParser()
    # Positional args: strings to romanize directly from the command line.
    parser.add_argument('direct_input', nargs='*', type=str)
    parser.add_argument('--data_dir', type=Path, default=data_dir, help='uroman resource dir')
    parser.add_argument('-i', '--input_filename', type=str, help='default: sys.stdin')
    parser.add_argument('-o', '--output_filename', type=str, help='default: sys.stdout')
    parser.add_argument('-l', '--lcode', type=str, default=None,
                        help='ISO 639-3 language code, e.g. eng')
    # parser.add_argument('-f', '--rom_format', type=RomFormat, default=RomFormat.STR, help:'alt: RomFormat.EDGES')
    parser.add_argument('-f', '--rom_format', type=RomFormat, default=RomFormat.STR,
                        choices=list(RomFormat), help="Output format of romanization. 'edges' provides offsets")
    # The remaining arguments are mostly for development and test
    parser.add_argument('--max_lines', type=int, default=None, help='limit uroman to first n lines')
    parser.add_argument('--load_log', action='count', default=0, help='report load stats')
    parser.add_argument('--test', action='count', default=0, help='perform/display a few tests')
    parser.add_argument('-v', '--verbose', action='count', default=0)
    parser.add_argument('--rebuild_ud_props', action='count', default=0,
                        help='rebuild UnicodeDataProps files (for development mode only)')
    parser.add_argument('--rebuild_num_props', action='count', default=0,
                        help='rebuild NumProps file (for development mode only)')
    parser.add_argument('--no_caching', action='count', default=0, help='for development mode: speed')
    parser.add_argument('--silent', action='count', default=0, help='suppress ... progress')
    parser.add_argument('-a', '--ablation', type=str, default='', help='for development mode: nocap')
    parser.add_argument('--stats', action='count', default=0, help='for development mode: numbers')
    parser.add_argument('--ignore_args', action='count', default=0, help='for usage illustration only')
    # PROFILE_FLAG ("--profile") also gates the conditional cProfile import at file top.
    parser.add_argument(PROFILE_FLAG, type=argparse.FileType('w', encoding='utf-8', errors='ignore'),
                        default=None, metavar='PROFILE-FILENAME', help='(optional output for performance analysis)')
    args = parser.parse_args()
    # copy selected (minor) args from argparse.Namespace to dict
    # (passed as **kwargs to romanize_string/romanize_file below)
    args_dict = {'rom_format': args.rom_format, 'load_log': args.load_log, 'test': args.test, 'stats': args.stats,
                 'no_caching': args.no_caching, 'max_lines': args.max_lines, 'verbose': args.verbose,
                 'rebuild_ud_props': args.rebuild_ud_props, 'rebuild_num_props': args.rebuild_num_props,
                 'ablation': args.ablation, 'silent': args.silent}
    pr = None
    if args.profile:
        # Development-only: enable gc debugging and start a cProfile session;
        # results are written to the --profile file at the end of main().
        gc.enable()
        gc.set_debug(gc.DEBUG_STATS)
        gc.set_debug(gc.DEBUG_LEAK)
        pr = cProfile.Profile()
        pr.enable()
    '''Sample calls:
uroman.py --help
uroman.py -i ../test/multi-script.txt -o ../test/multi-script-out2.txt
uroman.py < ../test/multi-script.txt  > ../test/multi-script-out2.txt
uroman.py Игорь
uroman.py Игорь --lcode ukr
uroman.py ألاسكا 서울 Καλιφόρνια
uroman.py ちょっとまってください -f edges
uroman.py "महात्मा गांधी" -f lattice
uroman.py สวัสดี --load_log
uroman.py --test
uroman.py --ignore_args
uroman.py Բարեւ -o ../test/tmp-out.txt -f edges
# In double input cases such as in the line below,
# the input-file's romanization is sent to stdout, while the direct-input romanization is sent to stderr
uroman.py ⴰⵣⵓⵍ -i ../test/multi-script.txt > ../test/multi-script-out2.txt
'''
    if args.ignore_args:
        # minimal calls (usage illustration; ignores most other flags)
        uroman = Uroman(args.data_dir)
        s, s2, s3, s4 = 'Игорь', 'ちょっとまってください', 'kanne', 'महात्मा गांधी'
        print(s, uroman.romanize_string(s))
        print(s, uroman.romanize_string(s, lcode='ukr'))
        print(s2, Edge.json_str(uroman.romanize_string(s2, rom_format=RomFormat.EDGES)))
        print(s3, Edge.json_str(uroman.romanize_string(s3, rom_format=RomFormat.EDGES)))
        print(s4, Edge.json_str(uroman.romanize_string(s4, rom_format=RomFormat.LATTICE)))
        # Note that ../test/multi-script.txt has several lines starting with ::lcode eng etc.
        # This allows users to select specific language codes to specific lines, overwriting the overall --lcodes
        uroman.romanize_file(input_filename='../test/multi-script.txt',
                             output_filename='../test/multi-script-out3.txt')
    else:
        # build a Uroman object (once for many applications and different scripts and languages)
        uroman = Uroman(args.data_dir, load_log=args.load_log, rebuild_ud_props=args.rebuild_ud_props,
                        rebuild_num_props=args.rebuild_num_props)
        # File romanization runs when a file is named explicitly, or when there is
        # no direct input / test / rebuild work at all (then stdin -> stdout).
        romanize_file_p = (args.input_filename or args.output_filename
                           or not (args.direct_input or args.test or args.ignore_args
                                   or args.rebuild_ud_props or args.rebuild_num_props))
        # Romanize any positional arguments, interpreted as strings to be romanized.
        for s in args.direct_input:
            result = uroman.romanize_string(s.rstrip(), lcode=args.lcode, **args_dict)
            result_json = Edge.json_str(result)
            if romanize_file_p:
                # input from both file/stdin (to file/stdout) and direct-input (to stderr)
                if args.input_filename:
                    sys.stderr.write(result_json + '\n')
                # input from direct-input (but not from file/stdin) to stdout
                # else pass
            # no file/stdin or file/stdout, so we write romanization of direct-input to stdout
            else:
                print(result_json)
        # If provided, apply romanization to an entire file.
        if romanize_file_p:
            uroman.romanize_file(args.input_filename, args.output_filename, lcode=args.lcode,
                                 direct_input=args.direct_input, **args_dict)
        if args.test:
            uroman.test_output_of_selected_scripts_and_rom_rules()
            uroman.test_romanization()
        if uroman.stats and args.stats:
            # Report only the first 100 entries to keep stderr output bounded.
            stats100 = {k: uroman.stats[k] for k in list(dict(uroman.stats))[:100]}
            sys.stderr.write(f'Stats: {stats100} ...\n')
    if args.profile:
        # Stop profiling, dump stats (sorted by time) to the --profile file,
        # and print gc statistics collected under DEBUG_STATS/DEBUG_LEAK.
        if pr:
            pr.disable()
            ps = pstats.Stats(pr, stream=args.profile).sort_stats(pstats.SortKey.TIME)
            ps.print_stats()
        print(gc.get_stats())
# Standard script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()