ALEPH_WEO-WEBETA

Configuration error

File size: 122,352 Bytes

607e564

#!/usr/bin/env python

"""

Written by Ulf Hermjakob, USC/ISI  March-April 2024

uroman is a universal romanizer. It converts text in any script to the Latin alphabet.

This script is a Python reimplementation of an earlier Perl script, with some improvements.

The tool has been tested on 250 languages, with 100 or more sentences each.

This script is still under development and large-scale testing. Feedback welcome.

This script provides token-size caching (for faster runtimes).

Output formats include

  (1) best romanization string

  (2) best romanization edges ("best path"; incl. start and end positions with respect to the original string)

  (3) best romanization with alternatives (as applicable for ambiguous romanization)

  (4) best romanization full lattice (all edges, including superseded sub-edges)

See below for 'sample calls' under main()

"""


from __future__ import annotations
import argparse
from collections import defaultdict
# from memory_profiler import profile
import datetime
from enum import Enum
from fractions import Fraction
import gc
import json
import math
import os
import pathlib
from pathlib import Path
import pstats
import regex
import sys
from typing import List, Tuple
import unicodedata as ud
# Command-line flag for profiling; cProfile is imported only if the flag occurs on the command line.
PROFILE_FLAG = "--profile"  # also used in argparse processing
if PROFILE_FLAG in sys.argv:
    import cProfile

# UTILITIES


def timer(func):
    """Decorator that prints the call, its start time, end time and duration (in seconds) to stdout.

    The result of the decorated function is returned unchanged; exceptions raised by it propagate
    to the caller (in which case no end time or duration is printed).
    """
    from functools import wraps  # local import keeps this utility self-contained

    @wraps(func)  # keep func.__name__, func.__doc__ etc. visible on the wrapper
    def wrapper(*args, **kwargs):
        start_time = datetime.datetime.now()
        print(f"Calling: {func.__name__}{args}")
        print(f"Start time: {start_time:%A, %B %d, %Y at %H:%M}")
        result = func(*args, **kwargs)
        end_time = datetime.datetime.now()
        time_diff = (end_time - start_time).total_seconds()
        print(f"End time: {end_time:%A, %B %d, %Y at %H:%M}")
        print(f"Duration: {time_diff} seconds")
        return result
    return wrapper


def slot_value_in_double_colon_del_list(line: str, slot: str, default: str | list | None = None) -> str | list | None:
    """Return the whitespace-stripped value of '::<slot>' in a double-colon delimited line.

    Example: for the line '::s1 of course ::s2 ::cost 0.3', slot 'cost' yields '0.3', slot 's1' yields
    'of course', and slot 's2' yields '' (the slot is present, but has no value).
    If the line does not contain the slot, default is returned instead.
    """
    # The slot marker must start the line or follow whitespace; the value runs up to the next
    # whitespace-preceded '::' marker or to the end of the line.
    slot_match = regex.match(fr'(?:.*\s)?::{slot}(|\s+\S.*?)(?:\s+::\S.*|\s*)$', line)
    if slot_match is None:
        return default
    return slot_match.group(1).strip()


def has_value_in_double_colon_del_list(line: str, slot: str) -> bool:
    """Return True if slot_value_in_double_colon_del_list() finds a string value (possibly '') for slot in line."""
    slot_value = slot_value_in_double_colon_del_list(line, slot)
    return isinstance(slot_value, str)


def dequote_string(s: str) -> str:
    """Remove one pair of matching quotes ('...', "..." or “...”) and the whitespace around them.

    Any other argument (not a string, not quoted, or with mismatched quotes) is returned unchanged.
    """
    if not isinstance(s, str):
        return s
    quoted = regex.match(r'''\s*(['"“])(.*)(['"”])\s*$''', s)
    if quoted is None:
        return s
    opening, content, closing = quoted.groups()
    if (opening + closing) in ("''", '""', '“”'):
        return content
    return s


def last_chr(s: str) -> str:
    """Return the last character of s, or the empty string if s is empty."""
    # Empty input yields '' (not None), in line with the declared return type.
    return s[-1] if s else ''


def ud_numeric(char: str) -> int | float | None:
    """Robust version of ud.numeric: the numeric value of char as int (if integral) or float, else None.

    None is returned when ud.numeric rejects the argument (ValueError or TypeError).
    """
    try:
        numeric_value = ud.numeric(char)
        if numeric_value.is_integer():
            return int(numeric_value)
        return numeric_value
    except (ValueError, TypeError):
        return None


def robust_str_to_num(num_s: str, filename: str = None, line_number: int | None = None, silent: bool = False) \
        -> int | float | None:
    """Convert num_s to a number without raising an exception.

    A string is converted to float if it contains a '.', otherwise to int. An int or float argument is
    returned as is. Anything else yields None. If a string cannot be converted and silent is false,
    a message (with line_number and filename, where provided) is written to stderr.
    """
    if isinstance(num_s, (float, int)):
        return num_s
    if not isinstance(num_s, str):
        return None
    converter = float if "." in num_s else int
    try:
        return converter(num_s)
    except ValueError:
        pass
    if not silent:
        sys.stderr.write(f'Cannot convert "{num_s}" to a number')
        if line_number:
            sys.stderr.write(f' line: {line_number}')
        if filename:
            sys.stderr.write(f' file: {filename}')
        sys.stderr.write('\n')
    return None


def first_non_none(*args):
    """Return the first argument that is not None; None if there is no such argument."""
    return next((candidate for candidate in args if candidate is not None), None)


def any_not_none(*args) -> bool:
    """Return True if at least one argument is not None (falsy values such as 0 or False count)."""
    return any(candidate is not None for candidate in args)


def add_non_none_to_dict(d: dict, key: str, value) -> None:
    """Store value under key in d, unless value is None (in which case d is left untouched)."""
    if value is None:
        return
    d[key] = value


def fraction_char2fraction(fraction_char: str, fraction_value: float | None = None,
                           uroman: Uroman | None = None) -> Fraction | None:
    """Map a fraction character such as '½' to a Fraction.

    The fraction is read from the character's Unicode decomposition ('<fraction>' numerator,
    fraction slash, denominator). If that yields nothing and both uroman and a non-zero fraction_value
    are given, uroman.unicode_float2fraction(fraction_value) is consulted for a
    (numerator, denominator) pair. Returns None if neither approach produces a fraction.
    """
    # Decode the hexadecimal code points of the decomposition; tags such as '<fraction>' are kept as is.
    decoded_parts = []
    for decomp_token in ud.decomposition(fraction_char).split():
        try:
            decoded_parts.append(chr(int(decomp_token, 16)))
        except ValueError:
            decoded_parts.append(decomp_token)
    decomposition = ''.join(decoded_parts)
    result = None
    fraction_match = regex.match(r'<fraction>(\d+)⁄(\d+)$', decomposition)
    if fraction_match:
        try:
            result = Fraction(int(fraction_match.group(1)), int(fraction_match.group(2)))
        except ValueError:
            result = None
    if (result is not None) or not (uroman and fraction_value):
        return result
    # Fallback: derive the fraction from the numeric value.
    num_denom = uroman.unicode_float2fraction(fraction_value)
    if num_denom:
        try:
            result = Fraction(num_denom[0], num_denom[1])
        except ValueError:
            result = None
    return result


def chr_name(char: str) -> str:
    """Robust version of ud.name: returns '' instead of raising ValueError or TypeError.

    See the related Uroman.char_name() that includes names not included in UnicodeData.txt"""
    try:
        name = ud.name(char)
    except (ValueError, TypeError):
        name = ''
    return name


def args_get(key: str, args: argparse.Namespace | None = None):
    """Return the value of key in the argparse namespace args; None if args is missing or lacks key."""
    if args and (key in args):
        return vars(args)[key]
    return None


class DictClass:
    """Light-weight record whose keyword arguments become entries of the instance __dict__.

    Underscores in keyword names are replaced by hyphens (use_only_at_start_of_word is stored as
    'use-only-at-start-of-word'). Values are read with obj['key'], which yields None for absent keys.
    """

    def __init__(self, **kw_args):
        for name, value in kw_args.items():
            # "Empty" values are not stored. The membership test compares with ==, so 0 and 0.0
            # (which equal False) are left out as well.
            if value in (None, [], False):
                continue
            self.__dict__[name.replace('_', '-')] = value

    def __repr__(self):
        return repr(self.__dict__)

    def __getitem__(self, key, default=None):
        return self.__dict__.get(key, default)

    def __bool__(self):
        # An instance is truthy iff at least one value has been stored.
        return bool(self.__dict__)


class RomRule(DictClass):
    # A single romanization rule; attributes are stored by DictClass under hyphenated keys (e.g. 't-alts').
    # key: source string
    # typical attributes: s (source), t (target), prov (provenance), lcodes (language codes),
    # t_alts (target alternatives), use_only_at_start_of_word, dont_use_at_start_of_word,
    # use_only_at_end_of_word, dont_use_at_end_of_word, use_only_for_whole_word
    pass


class Script(DictClass):
    # Description of a script (writing system); attributes are stored by DictClass under hyphenated keys
    # (e.g. 'script-name').
    # key: lower case script_name
    # typical attributes: script_name, direction, abugida_default_vowels, alt_script_names, languages
    pass


class RomFormat(Enum):
    """Output format of romanization; str() of a member is its value (e.g. 'edges')."""
    # simple string
    STR = 'str'
    # list of edges (includes character offsets in original string)
    EDGES = 'edges'
    # lattice including alternative edges
    ALTS = 'alts'
    # lattice including alternative and superseded edges
    LATTICE = 'lattice'

    def __str__(self):
        return str(self.value)


class Uroman:
    """This class loads and maintains uroman data independent of any specific text corpus.

    Typically, only a single instance will be used. (In contrast to multiple lattice instances, one per text.)

    Methods include some testing. And finally methods to romanize a string (romanize_string()) or an entire file

    (romanize_file())."""
    def __init__(self, data_dir: Path, **args):  # args: load_log, rebuild_ud_props
        """Create the (initially empty) lookup tables and caches, then load the resource files.

        data_dir is stored and passed to load_resource_files(), together with the optional keyword
        arguments load_log, rebuild_ud_props and rebuild_num_props (each False if not given).
        """
        self.data_dir = data_dir
        self.rom_rules = defaultdict(list)
        self.scripts = defaultdict(Script)
        self.dict_bool = defaultdict(bool)
        self.dict_str = defaultdict(str)
        self.dict_int = defaultdict(int)
        self.dict_num = defaultdict(lambda: None)   # values are int (most common), float, or str ("1/2")
        # num_props key: txt
        # values:  {"txt": "\u137b", "rom": "100", "value": 100, "type": "base", "mult": 1, "script": "Ethiopic"}
        self.num_props = defaultdict(dict)
        self.dict_set = defaultdict(set)
        self.float2fraction = {}  # caching
        # Garbage collection is suspended while the resource files are loaded (presumably to speed up
        # the bulk loading) and is switched back on even if loading fails with an exception.
        gc.disable()
        try:
            self.load_resource_files(data_dir, args.get('load_log', False),
                                     args.get('rebuild_ud_props', False),
                                     args.get('rebuild_num_props', False))
        finally:
            gc.enable()
        self.hangul_rom = {}
        self.rom_cache = {}   # key: (s, lcode) value: t
        self.stats = defaultdict(int)  # stats, e.g. for unprocessed numbers
        self.abugida_cache = {}  # key: (script, char_rom) value: (base_rom, base_rom_plus_abugida_vowel, modified rom)

    def second_rom_filter(self, c: str, rom: str, name: str | None) -> Tuple[str | None, str]:
        """Much of this code will eventually move the old Perl code to generate cleaner primary data"""
        if rom and (' ' in rom):
            if name is None:
                name = self.chr_name(c)
            if "MYANMAR VOWEL SIGN KAYAH" in name:
                if m := regex.search(r'kayah\s+(\S+)\s*$', rom):
                    return m.group(1), name
            if "MENDE KIKAKUI SYLLABLE" in name:
                if m := regex.search(r'm\d+\s+(\S+)\s*$', rom):
                    return m.group(1), name
            if regex.search(r'\S\s+\S', rom):
                return c, name
        return None, name

    def load_rom_file(self, filename: str, provenance: str, file_format: str = None, load_log: bool = True):
        """Reads in and processes the 3 main romanization data files: (1) romanization-auto-table.txt
        which was automatically generated from UnicodeData.txt (2) UnicodeDataOverwrite.txt that "corrects"
        some entries in romanization-auto-table.txt and (3) romanization-table.txt which was largely manually
        created and allows complex romanization rules, some for specific languages, some for specific contexts.

        Args:
            filename: file with one entry of '::slot value' pairs per line; lines starting with '#'
                and blank lines are skipped.
            provenance: stored as 'prov' of each new rule ("rom" becomes "num" for rules that have a
                number, but no target).
            file_format: 'u2r' for entries keyed by a hexadecimal code point (::u) with romanization ::r;
                otherwise entries use ::s (source) and ::t (target).
            load_log: if true, the number of loaded entries is reported on stderr.

        Results are stored in self.rom_rules, self.dict_str, self.dict_num and self.dict_bool; source
        prefixes are registered with register_s_prefix(). A file that cannot be opened is reported on
        stderr and nothing else happens. After a file has been read, rules for the Thai cancellation
        mark are added for those sources that do not have a rule yet.
        """
        n_entries = 0
        try:
            f = open(filename)
        except FileNotFoundError:
            sys.stderr.write(f'Cannot open file {filename}\n')
            return
        with f:
            for line_number, line in enumerate(f, 1):
                if line.startswith('#'):
                    continue
                if regex.match(r'^\s*$', line):  # blank line
                    continue
                line = regex.sub(r'\s{2,}#.*$', '', line)  # remove trailing comment
                if file_format == 'u2r':
                    t_at_end_of_syllable = None
                    u = dequote_string(slot_value_in_double_colon_del_list(line, 'u'))
                    try:
                        s = chr(int(u, 16))
                    except (ValueError, TypeError, OverflowError):
                        # ::u is missing, not hexadecimal, or not a valid code point: skip the line
                        continue
                    t = dequote_string(slot_value_in_double_colon_del_list(line, 'r'))
                    for str_key in ('name', 'pic', 'tone-mark', 'syllable-info'):
                        if str_value := slot_value_in_double_colon_del_list(line, str_key):
                            self.dict_str[(str_key, s)] = str_value
                else:
                    s = dequote_string(slot_value_in_double_colon_del_list(line, 's'))
                    t = dequote_string(slot_value_in_double_colon_del_list(line, 't'))
                    t_at_end_of_syllable = dequote_string(slot_value_in_double_colon_del_list(line,
                                                                                              't-end-of-syllable'))
                # ::num is converted once; a failed conversion is reported with file name and line number
                num_s = slot_value_in_double_colon_del_list(line, 'num')
                num = robust_str_to_num(num_s, filename, line_number, silent=False)
                if num_s is not None:
                    # keep the original string (e.g. "1/2") if it is not a plain int or float
                    self.dict_num[s] = (num_s if (num is None) else num)
                is_minus_sign = has_value_in_double_colon_del_list(line, 'is-minus-sign')
                is_plus_sign = has_value_in_double_colon_del_list(line, 'is-plus-sign')
                is_decimal_point = has_value_in_double_colon_del_list(line, 'is-decimal-point')
                is_large_power = has_value_in_double_colon_del_list(line, 'is-large-power')
                fraction_connector = slot_value_in_double_colon_del_list(line, 'fraction-connector')
                percentage_marker = slot_value_in_double_colon_del_list(line, 'percentage-marker')
                int_frac_connector = slot_value_in_double_colon_del_list(line, 'int-frac-connector')
                lcode_s = slot_value_in_double_colon_del_list(line, 'lcode')
                lcodes = regex.split(r'[,;]\s*', lcode_s) if lcode_s else []
                use_only_at_start_of_word = has_value_in_double_colon_del_list(line, 'use-only-at-start-of-word')
                dont_use_at_start_of_word = has_value_in_double_colon_del_list(line, 'dont-use-at-start-of-word')
                use_only_at_end_of_word = has_value_in_double_colon_del_list(line, 'use-only-at-end-of-word')
                dont_use_at_end_of_word = has_value_in_double_colon_del_list(line, 'dont-use-at-end-of-word')
                use_only_for_whole_word = has_value_in_double_colon_del_list(line, 'use-only-for-whole-word')
                t_alt_s = slot_value_in_double_colon_del_list(line, 't-alt')
                t_alts = regex.split(r'[,;]\s*', t_alt_s) if t_alt_s else []
                t_alts = list(map(dequote_string, t_alts))
                t_mod, _ = self.second_rom_filter(s, t, None)
                if t_mod and (t_mod != t):
                    t = t_mod
                if s is None:
                    continue
                for bool_key, bool_value in (('is-large-power', is_large_power),
                                             ('is-minus-sign', is_minus_sign),
                                             ('is-plus-sign', is_plus_sign),
                                             ('is-decimal-point', is_decimal_point)):
                    if bool_value:
                        self.dict_bool[(bool_key, s)] = True
                # NOTE(review): the is_* arguments are always bool (never None), so this condition
                # currently holds for every line that has a source s.
                if any_not_none(t, num, is_minus_sign, is_plus_sign, is_decimal_point, is_large_power,
                                fraction_connector, percentage_marker, int_frac_connector):
                    self.register_s_prefix(s)
                    n_entries += 1
                    restrictions = [lcodes, use_only_at_start_of_word, dont_use_at_start_of_word,
                                    use_only_at_end_of_word, dont_use_at_end_of_word, use_only_for_whole_word]
                    n_restrictions = len([restr for restr in restrictions if restr])
                    provenance2 = provenance
                    if (t is None) and (num is not None) and (provenance2 == "rom"):
                        provenance2 = "num"
                    new_rom_rule = RomRule(s=s, t=t, prov=provenance2, lcodes=lcodes, t_alts=t_alts, num=num,
                                           use_only_at_start_of_word=use_only_at_start_of_word,
                                           dont_use_at_start_of_word=dont_use_at_start_of_word,
                                           use_only_at_end_of_word=use_only_at_end_of_word,
                                           dont_use_at_end_of_word=dont_use_at_end_of_word,
                                           use_only_for_whole_word=use_only_for_whole_word,
                                           t_at_end_of_syllable=t_at_end_of_syllable,
                                           n_restr=n_restrictions,
                                           is_minus_sign=is_minus_sign,
                                           is_plus_sign=is_plus_sign,
                                           is_decimal_point=is_decimal_point,
                                           fraction_connector=fraction_connector,
                                           percentage_marker=percentage_marker,
                                           int_frac_connector=int_frac_connector,
                                           is_large_power=is_large_power)
                    old_rom_rules = self.rom_rules[s]
                    # An unrestricted rule replaces a single existing rule of provenance 'ud' or 'ow';
                    # in all other cases the new rule is added to the existing ones.
                    if ((len(old_rom_rules) == 1) and (old_rom_rules[0]['prov'] in ('ud', 'ow'))
                            and not n_restrictions):
                        self.rom_rules[s] = [new_rom_rule]  # overwrite
                    else:
                        self.rom_rules[s].append(new_rom_rule)
        # Thai
        thai_cancellation_mark = '\u0E4C'
        # cancellation applies to preceding letter incl. any vowel modifier letter (e.g. ศักดิ์สิทธิ์ -> saksit)
        for cp in range(0x0E01, 0x0E4C):   # Thai
            c = chr(cp)
            s = c + thai_cancellation_mark
            new_rom_rule = RomRule(s=s, t='', prov='auto cancel letter')
            if not self.rom_rules[s]:
                self.rom_rules[s] = [new_rom_rule]
                self.register_s_prefix(s)
        thai_consonants = list(map(chr, range(0x0E01, 0x0E2F)))
        thai_vowel_modifiers = ['\u0E31', '\u0E47'] + list(map(chr, range(0x0E33, 0x0E3B)))
        for c1 in thai_consonants:
            for v in thai_vowel_modifiers:
                s = c1 + v + thai_cancellation_mark
                new_rom_rule = RomRule(s=s, t='', prov='auto cancel syllable')
                if not self.rom_rules[s]:
                    self.rom_rules[s] = [new_rom_rule]
                    self.register_s_prefix(s)
        if load_log:
            sys.stderr.write(f'Loaded {n_entries} from {filename}\n')

    def load_script_file(self, filename: str, load_log: bool = True):
        """Reads in (typically from Scripts.txt) information about various scripts such as Devanagari,

        incl. information such as the default abugida vowel letter (e.g. "a")."""
        n_entries, max_n_script_name_components = 0, 0
        try:
            f = open(filename)
        except FileNotFoundError:
            sys.stderr.write(f'Cannot open file {filename}\n')
            return
        with f:
            for line_number, line in enumerate(f, 1):
                if line.startswith('#'):
                    continue
                if regex.match(r'^\s*$', line):  # blank line
                    continue
                line = regex.sub(r'\s{2,}#.*$', '', line)
                if script_name := slot_value_in_double_colon_del_list(line, 'script-name'):
                    lc_script_name = script_name.lower()
                    if lc_script_name in self.scripts:
                        sys.stderr.write(f'** Ignoring duplicate script "{script_name}" '
                                         f'in line {line_number} of {filename}\n')
                    else:
                        n_entries += 1
                        direction = slot_value_in_double_colon_del_list(line, 'direction')
                        abugida_default_vowel_s = slot_value_in_double_colon_del_list(line,
                                                                                      'abugida-default-vowel')
                        abugida_default_vowels = regex.split(r'[,;]\s*', abugida_default_vowel_s) \
                            if abugida_default_vowel_s else []
                        alt_script_name_s = slot_value_in_double_colon_del_list(line, 'alt-script-name')
                        alt_script_names = regex.split(r'[,;]\s*', alt_script_name_s) if alt_script_name_s else []
                        language_s = slot_value_in_double_colon_del_list(line, 'language')
                        languages = regex.split(r'[,;]\s*', language_s) if language_s else []
                        new_script = Script(script_name=script_name, alt_script_names=alt_script_names,
                                            languages=languages, direction=direction,
                                            abugida_default_vowels=abugida_default_vowels)
                        self.scripts[lc_script_name] = new_script
                        for language in languages:
                            self.dict_set[('scripts', language)].add(script_name)
                        for alt_script_name in alt_script_names:
                            lc_alt_script_name = alt_script_name.lower()
                            if lc_alt_script_name in self.scripts:
                                sys.stderr.write(f'** Ignoring duplicate alternative script name "{script_name}" '
                                                 f'in line {line_number} of {filename}\n')
                            else:
                                self.scripts[lc_alt_script_name] = new_script
                    n_script_name_components = len(script_name.split())
                    if n_script_name_components > max_n_script_name_components:
                        max_n_script_name_components = n_script_name_components
        if max_n_script_name_components:
            self.dict_int['max_n_script_name_components'] = max_n_script_name_components
        if load_log:
            sys.stderr.write(f'Loaded {n_entries} script descriptions from {filename}'
                             f' (max_n_scripts_name_components: {max_n_script_name_components})\n')

    def extract_script_name(self, script_name_plus: str, full_char_name: str = None) -> str | None:
        """Using info from Scripts.txt, this script selects the script name from a Unicode,

        e.g. given "OLD HUNGARIAN CAPITAL LETTER A", extract "Old Hungarian"."""
        if full_char_name and script_name_plus == full_char_name:
            return None
        while script_name_plus:
            if script_name_plus.lower() in self.scripts:
                if script := self.scripts[script_name_plus.lower()]:
                    if script_name := script['script-name']:
                        return script_name
            script_name_plus = regex.sub(r'\s*\S*\s*$', '', script_name_plus)
        return None

    def load_unicode_data_props(self, filename: str, load_log: bool = True):
        """Loads Unicode derived data from (1) UnicodeDataProps.txt, (2) UnicodeDataPropsHangul.txt

        and UnicodeDataPropsCJK.txt with a list of valid script-specific characters."""
        n_script, n_script_char, n_script_vowel_sign, n_script_medial_consonant_sign, n_script_virama = 0, 0, 0, 0, 0
        try:
            f = open(filename)
        except FileNotFoundError:
            sys.stderr.write(f'Cannot open file {filename}\n')
            return
        with f:
            for line_number, line in enumerate(f, 1):
                if line.startswith('#'):
                    continue
                if regex.match(r'^\s*$', line):  # blank line
                    continue
                line = regex.sub(r'\s{2,}#.*$', '', line)
                if script_name := slot_value_in_double_colon_del_list(line, 'script-name'):
                    n_script += 1
                    for char in slot_value_in_double_colon_del_list(line, 'char', []):
                        self.dict_str[('script', char)] = script_name
                        n_script_char += 1
                    for char in slot_value_in_double_colon_del_list(line, 'numeral', []):
                        self.dict_str[('script', char)] = script_name
                        n_script_char += 1
                    for char in slot_value_in_double_colon_del_list(line, 'vowel-sign', []):
                        self.dict_bool[('is-vowel-sign', char)] = True
                        n_script_vowel_sign += 1
                    for char in slot_value_in_double_colon_del_list(line, 'medial-consonant-sign', []):
                        self.dict_bool[('is-medial-consonant-sign', char)] = True
                        n_script_medial_consonant_sign += 1
                    for char in slot_value_in_double_colon_del_list(line, 'sign-virama', []):
                        self.dict_bool[('is-virama', char)] = True
                        n_script_virama += 1
        if load_log:
            sys.stderr.write(f'Loaded from {filename} mappings of {n_script_char:,d} characters '
                             f'to {n_script} script{"" if n_script == 1 else "s"}')
            if n_script_vowel_sign or n_script_virama or n_script_medial_consonant_sign:
                sys.stderr.write(f', with a total of {n_script_vowel_sign} vowel signs, '
                                 f'{n_script_medial_consonant_sign} medial consonant signs '
                                 f'and {n_script_virama} viramas')
            sys.stderr.write('.\n')

    def load_num_props(self, filename: str, load_log: bool = True):
        """Loads Unicode derived data from (1) UnicodeDataProps.txt, (2) UnicodeDataPropsHangul.txt

        and UnicodeDataPropsCJK.txt with a list of valid script-specific characters."""
        n_entries = 0
        try:
            f = open(filename)
        except FileNotFoundError:
            sys.stderr.write(f'Cannot open file {filename}\n')
            return
        with f:
            for line_number, line in enumerate(f, 1):
                if line.startswith('#'):
                    continue
                if regex.match(r'^\s*$', line):  # blank line
                    continue
                d = json.loads(line)
                if isinstance(d, dict):
                    if txt := d.get('txt'):
                        self.num_props[txt] = d
                        n_entries += 1
                    else:
                        sys.stderr.write(f'Missing txt in l.{line_number} in file {filename}: {line.strip()}\n')
                    for bool_key in ('is-large-power',):
                        if d.get(bool_key):
                            self.dict_bool[(bool_key, txt)] = True
                else:
                    sys.stderr.write(f'json in l.{line_number} in file {filename} not a dict: {line.strip()}\n')
        if load_log:
            sys.stderr.write(f'Loaded {n_entries} entries from {filename}\n')

    @staticmethod
    def de_accent_pinyin(s: str) -> str:
        """De-accents a string from "liú" to "liu" and "ü" to "u" (to help process file Chinese_to_Pinyin.txt)."""
        result = ''
        for char in s:
            if decomp := ud.decomposition(char).split():
                try:
                    decomp_chars = [chr(int(x, 16)) for x in decomp]
                    letters = [x for x in decomp_chars if ud.category(x).startswith('L')]
                except ValueError:
                    sys.stderr.write(f'Cannot decode {decomp}\n')
                    continue
                if len(letters) == 1:
                    result += letters[0]
                else:
                    sys.stderr.write(f'Cannot decode {decomp} (expected 1 letter)\n')
            else:
                result += char
        result = result.replace('ü', 'u')
        return result

    def register_s_prefix(self, s: str):
        for prefix_len in range(1, len(s) + 1):
            self.dict_bool[('s-prefix', s[:prefix_len])] = True

    def load_chinese_pinyin_file(self, filename: str, load_log: bool = True):
        """Loads file Chinese_to_Pinyin.txt which maps Chinese characters to their Latin form."""
        n_entries = 0
        try:
            f = open(filename)
        except FileNotFoundError:
            sys.stderr.write(f'Cannot open file {filename}\n')
            return
        with f:
            for line_number, line in enumerate(f, 1):
                if line.startswith('#'):
                    continue
                if regex.match(r'^\s*$', line):  # blank line
                    continue
                try:
                    chinese, pinyin = line.rstrip().split()
                    rom = self.de_accent_pinyin(pinyin)
                except ValueError:
                    sys.stderr.write(f'Cannot process line {line_number} in file {filename}: {line}')
                else:
                    s = chinese
                    new_rom_rule = RomRule(s=s, t=rom, prov='rom pinyin', lcodes=[])
                    self.rom_rules[chinese].append(new_rom_rule)
                    self.register_s_prefix(s)
                    n_entries += 1
        if load_log:
            sys.stderr.write(f'Loaded {n_entries} script descriptions from {filename}\n')

    @staticmethod
    def add_char_to_rebuild_unicode_data_dict(d: dict, script_name: str, prop_class: str, char: str):
        d['script-names'].add(script_name)
        key = (script_name, prop_class)
        if key in d:
            d[key].append(char)
        else:
            d[key] = [char]

    def rebuild_unicode_data_props(self, out_filename: str, cjk: str = None, hangul: str = None):
        """This function rebuilds UnicodeDataProps*.txt. This might be useful when a new UnicodeData.txt
        version is released, or additional information is extracted from Unicode to UnicodeDataProps.txt
        Regular users normally never have to call this function.

        All code points from 0 to 0xF0000 that have a character name are assigned, based on that name,
        to a script and to property classes ('char', 'vowel-sign', 'medial-consonant-sign',
        'sign-virama', 'numeral'). One line per script is written to out_filename; the lines for the
        scripts 'CJK' and 'Hangul' are written to the files cjk and hangul instead, if provided.
        A summary is written to stderr.
        """
        # Each group lists character name components of one property class; the class is named after
        # the first component of the group (lower-cased, with hyphens instead of spaces).
        prop_groups = (('VOWEL SIGN',),
                       ('MEDIAL CONSONANT SIGN', 'CONSONANT SIGN MEDIAL', 'CONSONANT SIGN SHAN MEDIAL',
                        'CONSONANT SIGN MON MEDIAL'),
                       ('SIGN VIRAMA', 'SIGN ASAT', 'AL-LAKUNA', 'SIGN COENG', 'SIGN PAMAAEH',
                        'CHARACTER PHINTHU'),
                       ('NUMERAL', 'NUMBER', 'DIGIT', 'FRACTION'))
        # A script name candidate ends before the first of these components of a character name.
        char_type_pattern = (r'\s+(CONSONANT|LETTER|LIGATURE|SIGN|SYLLABLE|SYLLABICS|VOWEL|'
                             r'IDEOGRAPH|HIEROGLYPH|POINT|ACCENT|CHARACTER|TIPPI|ADDAK|IRI|URA|'
                             r'SYMBOL GENITIVE|SYMBOL COMPLETED|SYMBOL LOCATIVE|SYMBOL AFOREMENTIONED|'
                             r'AU LENGTH MARK)\b.*$')
        d = {'script-names': set()}
        n_script_refs = 0
        prop_classes = {'char'}
        for codepoint in range(0xF0000 + 1):
            c = chr(codepoint)
            char_name = self.chr_name(c)
            if not char_name:
                continue
            for prop_group in prop_groups:
                prop_class = prop_group[0].lower().replace(' ', '-')
                prop_classes.add(prop_class)
                for name_component in prop_group:
                    candidate = regex.sub(fr'\s+{name_component}\b.*$', '', char_name)
                    script_name = self.extract_script_name(candidate, char_name)
                    if script_name:
                        self.add_char_to_rebuild_unicode_data_dict(d, script_name, prop_class, c)
            candidate = regex.sub(char_type_pattern, '', char_name)
            script_name = self.extract_script_name(candidate, char_name)
            if script_name:
                self.add_char_to_rebuild_unicode_data_dict(d, script_name, 'char', c)
                n_script_refs += 1
        prop_classes = sorted(prop_classes)
        out_filenames = [name for name in (out_filename, cjk, hangul) if name]
        # CJK and Hangul go to their own files, if such files were specified
        cjk2 = cjk or out_filename
        hangul2 = hangul or out_filename
        for out_file in out_filenames:
            try:
                f_out = open(out_file, 'w')
            except OSError:
                sys.stderr.write(f'Cannot write to file {out_file}\n')
                continue
            with f_out:
                for script_name in sorted(d['script-names']):
                    if script_name == 'CJK':
                        target_file = cjk2
                    elif script_name == 'Hangul':
                        target_file = hangul2
                    else:
                        target_file = out_filename
                    if out_file != target_file:
                        continue
                    prop_components = [f"::script-name {script_name}"]
                    for prop_class in prop_classes:
                        chars = ''.join(d.get((script_name, prop_class), []))
                        if not chars:
                            continue
                        if prop_class == 'char':
                            prop_components.append(f"::n-{prop_class} {len(chars)}")
                        prop_components.append(f"::{prop_class} {chars}")
                    f_out.write(f"{' '.join(prop_components)}\n")
        sys.stderr.write(f"Rebuilt {out_filenames} with {n_script_refs} characters "
                         f"for {len(d['script-names'])} scripts.\n")

    def rebuild_num_props(self, out_filename: str, err_filename: str):
        """Regenerate the numeric-property table, one JSON object per line.

        Walks over all codepoints from U+0000 through U+F0000. Each character with a numeric value
        (from ud_numeric, or else from the uroman romanization rules) is described by a small dict.
        Entries whose type starts with 'other' go to err_filename, all others to out_filename.
        Characters whose name marks them as superscript, circled, Roman numeral etc. are skipped.
        A short summary is written to STDERR."""
        # Name components of characters that are excluded from number processing
        excluded_name_parts = ('SUPERSCRIPT', 'SUBSCRIPT',
                               'CIRCLED', 'PARENTHESIZED', 'SEGMENTED', 'MATHEMATICAL', 'ROMAN NUMERAL',
                               'FULL STOP', 'COMMA')
        n_out, n_err = 0, 0
        with open(out_filename, 'w') as f_out, open(err_filename, 'w') as f_err:
            for codepoint in range(0xF0000 + 1):
                char = chr(codepoint)
                # robust ud.numeric first; the uroman table includes extra num values, e.g. for Egyptian
                num = first_non_none(ud_numeric(char), self.num_value(char))
                if num is None:
                    continue
                value = None  # non-fraction-value(3 1/2) = 3
                fraction = None  # fraction(3 1/2) = Fraction(1, 2)
                # num_base is typically a power of 10: 1, 10, 100, 1000, ... e.g. num_base(500) = 100
                # Conceivable exceptions: 12 ('dozen'), 20 ('score', French 'vingt')
                num_base = None
                base_multiplier = None  # base_multiplier(500) = 5
                is_large_power = self.dict_bool[('is-large-power', char)]
                script = self.chr_script_name(char) or ('ascii-digit' if char in '0123456789' else None)
                name = self.chr_name(char)
                if any(name_part in name for name_part in excluded_name_parts):
                    continue
                if 'VULGAR FRACTION' in name:
                    script = 'vulgar-fraction'
                if isinstance(num, int):
                    value = num
                    if 0 <= num <= 9:
                        num_base, base_multiplier = 1, num
                        # Chinese numbers 零 (0), 一 (1), ... 九 (9) have numeric values,
                        # but are NOT (full) digits
                        num_type = 'digit' if "DIGIT" in name else 'digit-like'
                    elif m := regex.match(r'([0-9]+?)(0*)$', str(num)):
                        base_multiplier = int(m.group(1))  # leading part without trailing zeros: 500 -> 5
                        num_base = int('1' + m.group(2))  # 1 followed by the trailing zeros: 500 -> 100
                        num_type = 'base' if base_multiplier == 1 else 'multi'
                    else:
                        num_type = 'other-int'  # Do such cases exist?
                elif ("FRACTION" in name) and (fraction := fraction_char2fraction(char, num, self)):
                    num_type = 'fraction'
                else:
                    num_type = 'other-num'  # Do such cases exist? Yes. Bengali currency numerators, ...
                value_s = '' if value is None else str(value)
                if fraction is None:
                    fraction_s, fraction_list = '', None
                else:
                    fraction_s = f'{fraction.numerator}/{fraction.denominator}'
                    fraction_list = [fraction.numerator, fraction.denominator]
                # value and fraction are separated by a space only if both are present
                rom = ' '.join(part for part in (value_s, fraction_s) if part) or char
                entry = {}
                for key, entry_value in (('txt', char), ('rom', rom), ('value', value),
                                         ('fraction', fraction_list), ('type', num_type)):
                    add_non_none_to_dict(entry, key, entry_value)
                if is_large_power:
                    entry['is-large-power'] = True
                for key, entry_value in (('base', num_base), ('mult', base_multiplier), ('script', script)):
                    add_non_none_to_dict(entry, key, entry_value)
                is_reject = num_type.startswith('other')
                # The character name is only recorded for rejects and for characters without a script
                if is_reject or not script:
                    add_non_none_to_dict(entry, 'name', name)
                json_line = json.dumps(entry) + '\n'
                if is_reject:
                    f_err.write(json_line)
                    n_err += 1
                else:
                    f_out.write(json_line)
                    n_out += 1
        sys.stderr.write(f'Processed {codepoint} codepoints,\n  wrote {n_out} lines to {out_filename}\n'
                         f'    and {n_err} lines to {err_filename}\n')

    def load_resource_files(self, data_dir: Path, load_log: bool = False,
                            rebuild_ud_props: bool = False, rebuild_num_props: bool = False):
        """Load every resource file that romanization relies on from data_dir.

        data_dir must be a pathlib.Path; otherwise an error is reported on STDERR and nothing is loaded.
        With rebuild_ud_props / rebuild_num_props, the corresponding derived data files are
        regenerated after loading."""
        if not isinstance(data_dir, pathlib.Path):
            sys.stderr.write(f'Error: data_dir is of {type(data_dir)}, not a Path.\n'
                             f'       Cannot load any resource files.\n')
            return

        def resource(filename: str) -> str:
            return os.path.join(data_dir, filename)

        # Romanization tables: (filename, provenance tag, file format)
        for rom_filename, provenance, file_format in (("romanization-auto-table.txt", 'ud', 'rom'),
                                                      ("UnicodeDataOverwrite.txt", 'ow', 'u2r'),
                                                      ("romanization-table.txt", 'man', 'rom')):
            self.load_rom_file(resource(rom_filename), provenance, file_format=file_format, load_log=load_log)
        self.load_chinese_pinyin_file(resource("Chinese_to_Pinyin.txt"), load_log=load_log)
        self.load_script_file(resource("Scripts.txt"), load_log=load_log)
        self.load_num_props(resource("NumProps.jsonl"), load_log=load_log)
        for props_filename in ("UnicodeDataProps.txt", "UnicodeDataPropsCJK.txt", "UnicodeDataPropsHangul.txt"):
            self.load_unicode_data_props(resource(props_filename), load_log=load_log)
        if rebuild_ud_props:
            self.rebuild_unicode_data_props(resource("UnicodeDataProps.txt"),
                                            cjk=resource("UnicodeDataPropsCJK.txt"),
                                            hangul=resource("UnicodeDataPropsHangul.txt"))
        if rebuild_num_props:
            self.rebuild_num_props(resource("NumProps.jsonl"), resource("NumPropsRejects.jsonl"))

    def unicode_hangul_romanization(self, s: str, pass_through_p: bool = False):
        """Algorithmic conversion of precomposed (Korean) Hangul syllables to the Latin alphabet.

        Each syllable in U+AC00..U+D7A3 is split arithmetically into lead consonant, vowel and tail
        consonant. Other characters are copied to the result only if pass_through_p is set;
        otherwise they are dropped. A non-empty cache entry for the whole string s is returned as is;
        the romanization of every converted syllable is stored in the cache."""
        if cached_rom := self.hangul_rom.get(s, None):
            return cached_rom
        leads = "g gg n d dd r m b bb s ss - j jj c k t p h".split()
        vowels = "a ae ya yae eo e yeo ye o wa wai oe yo u weo we wi yu eu yi i".split()
        tails = "- g gg gs n nj nh d l lg lm lb ls lt lp lh m b bs s ss ng j c k t p h".split()
        first_syllable, last_syllable = 0xAC00, 0xD7A3
        pieces = []
        for char in s:
            codepoint = ord(char)
            if first_syllable <= codepoint <= last_syllable:
                # syllable index = (lead * 21 + vowel) * 28 + tail
                lead_vowel_index, tail_index = divmod(codepoint - first_syllable, 28)
                lead_index, vowel_index = divmod(lead_vowel_index, 21)
                # '-' marks the silent lead and the empty tail
                rom = (leads[lead_index] + vowels[vowel_index] + tails[tail_index]).replace('-', '')
                self.hangul_rom[char] = rom
                pieces.append(rom)
            elif pass_through_p:
                pieces.append(char)
        return ''.join(pieces)

    @staticmethod
    def char_is_nonspacing_mark(s) -> bool:
        """ True iff s is a single character of Unicode category Mn (nonspacing mark),
            e.g. combining accents, points, vowel signs"""
        if len(s) != 1:
            return False
        return ud.category(s) == 'Mn'

    @staticmethod
    def char_is_format_char(s) -> bool:
        """ True iff s is a single character of Unicode category Cf (format character),
            e.g. a zero-width joiner/non-joiner"""
        if len(s) != 1:
            return False
        return ud.category(s) == 'Cf'

    @staticmethod
    def char_is_space_separator(s) -> bool:
        """ True iff s is a single character of Unicode category Zs (space separator),
            e.g. ' ', non-breakable space, en space, ideographic (Chinese) space, Ogham space mark
            but excluding tab, carriage return and newline"""
        if len(s) != 1:
            return False
        return ud.category(s) == 'Zs'

    def chr_name(self, char: str) -> str:
        """Return the Unicode name of char; if unicodedata has none, fall back to the name
        stored in self.dict_str under ('name', char); if that is empty too, return ''."""
        try:
            unicode_name = ud.name(char)
        except (ValueError, TypeError):
            # unicodedata has no name for this argument: consult uroman's own table
            fallback_name = self.dict_str[('name', char)]
            return fallback_name if fallback_name else ''
        return unicode_name

    def num_value(self, s: str) -> int | float | Fraction | None:
        """Return the first non-None 'num' value among the romanization rules for s, or None.

        rom_rules include numeric values beyond UnicodeData.txt, e.g. for Egyptian numerals.
        Rules without a 'num' entry are skipped (rather than raising a KeyError), in line with
        rom_rule_value."""
        for rom_rule in self.rom_rules[s]:
            if (num := rom_rule.get('num')) is not None:
                return num
        return None

    def rom_rule_value(self, s: str, key: str):
        """Return the first non-None value stored under key among the romanization rules for s,
        or None if no rule provides one."""
        rule_values = (rom_rule.get(key) for rom_rule in self.rom_rules[s])
        return next((rule_value for rule_value in rule_values if rule_value is not None), None)

    def unicode_float2fraction(self, num: float, precision: float = 0.000001) -> Tuple[int, int] | None:
        """Map a float to a (numerator, denominator) pair; only for common unicode fractions.

        Candidates are tried by ascending numerator, then by denominator in the listed order; the first
        one within precision of num wins and is cached. Returns None (uncached) if nothing matches."""
        if cached_pair := self.float2fraction.get(num, None):
            return cached_pair
        numerators = range(1, 12)
        denominators = (2, 3, 4, 5, 6, 8, 12, 16, 20, 32, 40, 64, 80, 160, 320)
        matching_pair = next(((numerator, denominator)
                              for numerator in numerators
                              for denominator in denominators
                              if abs(numerator / denominator - num) < precision),
                             None)
        if matching_pair is not None:
            self.float2fraction[num] = matching_pair
        return matching_pair

    def chr_script_name(self, char: str) -> str:
        """Look up the script name stored for char in self.dict_str (letters, diacritics, numerals etc.)."""
        script_key = ('script', char)
        return self.dict_str[script_key]

    def test_output_of_selected_scripts_and_rom_rules(self):
        """Low level test function: prints script entries, romanization rules, character properties
        and number edges for a fixed selection of scripts, strings and characters."""
        lines = []
        for script in ("Oriya", "Chinese"):
            lines.append(f'SCRIPT {script} {self.scripts[script.lower()]}')
        for s in ('ƿ', 'β', 'и', 'μπ', '⠹', '亿', 'ちょ', 'и', '𓍧', '正', '分之', 'ऽ', 'ศ', 'ด์'):
            lines.append(f'DICT {s} {self.rom_rules[s]}')
        for s in ('ƿ', 'β', 'न', 'ु'):
            lines.append(f'SCRIPT-NAME {s} {self.chr_script_name(s)}')
        for s in ('万', '\uF8F7', '\U00013368', '\U0001308B', '\u0E48', '\u0E40'):
            labelled_props = (('name', self.chr_name(s)),
                              ('num', self.dict_num[s]),
                              ('pic', self.dict_str[('pic', s)]),
                              ('tone-mark', self.dict_str[('tone-mark', s)]),
                              ('syllable-info', self.dict_str[('syllable-info', s)]),
                              ('is-large-power', self.dict_bool[('is-large-power', s)]))
            line = f'PROPS {s}'
            for label, prop_value in labelled_props:
                if not prop_value:
                    continue
                line += f'  {label}: {prop_value}'
                if label == 'num':
                    # numeric values are shown together with their Python type
                    line += f' ({type(prop_value).__name__})'
            lines.append(line)
        mayan12 = '\U0001D2EC'
        egyptian600 = '𓍧'
        runic90 = '𐍁'
        klingon2 = '\uF8F2'
        for offset, c in enumerate(f'9九万萬百፲፱፻፸¾0²₂AⅫ⑫൵{runic90}{mayan12}{egyptian600}{klingon2}'):
            lines.append(f'NUM-EDGE: {NumEdge(offset, offset+1, c, self)}')
        for s in ('\u00bc', '\u0968'):
            lines.append(f'NUM-PROPS: {self.num_props[s]}')
        # every line is newline-terminated; print adds one more newline at the very end
        print('\n'.join(lines) + '\n')

    def test_romanization(self, **args):
        """A few full cases of romanization testing.

        First romanizes a handful of sample sentences (with language codes where applicable) and
        reports the results on STDERR. Then romanizes every single codepoint from U+0000 through
        U+F0000 and reports those whose romanization contains both whitespace and non-whitespace.
        Any keyword arguments are passed on to romanize_string for the sample sentences."""
        # (text, ISO 639-3 language code or None); the hiragana sample is Japanese, hence 'jpn'
        tests = [('ألاسكا', None), ('यह एक अच्छा अनुवाद है.', 'hin'), ('ちょっとまってください', 'jpn'),
                 ('Μπανγκαλόρ', 'ell'), ('Зеленський', 'ukr'), ('കേരളം', 'mal')]
        for s, lcode in tests:
            rom = self.romanize_string(s, lcode=lcode, **args)
            sys.stderr.write(f'ROM {s} -> {rom}\n')
        n_alerts = 0
        for codepoint in range(0xF0000 + 1):
            c = chr(codepoint)
            rom = self.romanize_string(c)
            # alert: the romanization of a single character should not contain internal/outer spaces
            if regex.search(r'\s', rom) and regex.search(r'\S', rom):
                name = self.chr_name(c)
                sys.stderr.write(f'U+{codepoint:04X} {c} {name}  {rom}\n')
                n_alerts += 1
        sys.stderr.write(f'{n_alerts} alerts for roms with spaces\n')

    def romanize_file(self, input_filename: str | None = None, output_filename: str | None = None,
                      lcode: str | None = None, direct_input: List[str] | None = None, **args):
        """Script to apply romanization to an entire file. Language code (lcode) recommended.

        Input: the lines in direct_input if it is non-empty and no input_filename is given;
        else the file input_filename; else STDIN. Output: the file output_filename, else STDOUT.
        Problems with opening files or with argument types are reported on STDERR; no romanization
        takes place then. Lines of the form '::lcode xxx text' are romanized with language code xxx,
        and the language code is carried over to the output line. Remaining keyword arguments
        (e.g. rom_format, max_lines, silent) are passed on to romanize_string.
        Files opened here are closed again, even if romanization raises an exception."""
        f_in, f_out = None, None
        f_in_to_be_closed, f_out_to_be_closed = False, False
        try:
            if direct_input and (input_filename is None):
                f_in = direct_input  # list of lines
            elif isinstance(input_filename, str):
                try:
                    f_in = open(input_filename)
                    f_in_to_be_closed = True
                except OSError:
                    sys.stderr.write(f'Error in romanize_file: Cannot open file {input_filename}\n')
            elif input_filename is None:
                f_in = sys.stdin
            else:
                sys.stderr.write(f"Error in romanize_file: argument 'input_filename' {input_filename} "
                                 f"is of wrong type: {type(input_filename)} (should be str)\n")
            if isinstance(output_filename, str):
                try:
                    f_out = open(str(output_filename), 'w')
                    f_out_to_be_closed = True
                except OSError:
                    sys.stderr.write(f'Error in romanize_file: Cannot write to file {output_filename}\n')
            elif output_filename is None:
                f_out = sys.stdout
            else:
                sys.stderr.write(f"Error in romanize_file: argument 'output_filename' {output_filename} "
                                 f"is of wrong type: {type(output_filename)} (should be str)\n")
            if f_in and f_out:
                max_lines = args.get('max_lines', None)
                progress_dots_output = False
                for line_number, line in enumerate(f_in, 1):
                    if m := regex.match(r'(::lcode\s+)([a-z]{3})(\s+)(.*?)\s*$', line):
                        lcode_kw, lcode2, space, snt = m.group(1, 2, 3, 4)
                        # Romanize once; the type of the result tells string output from edge output,
                        # also when no rom_format was passed in (string output being the default).
                        rom_result = self.romanize_string(snt, lcode2 or lcode, **args)
                        if isinstance(rom_result, str):
                            f_out.write(f"{lcode_kw}{lcode2}{space}{rom_result}\n")
                        else:
                            lcode_prefix = f'[0, 0, "", "lcode: {lcode2}"]'  # meta edge with lcode info
                            f_out.write(Edge.json_str([lcode_prefix] + rom_result) + '\n')
                    else:
                        f_out.write(Edge.json_str(self.romanize_string(line.rstrip(), lcode, **args)) + '\n')
                    if not args.get('silent'):
                        # progress report: a dot every 100 lines, the line number every 1000 lines
                        if line_number % 100 == 0:
                            if line_number % 1000 == 0:
                                sys.stderr.write(str(line_number))
                            else:
                                sys.stderr.write('.')
                            progress_dots_output = True
                            sys.stderr.flush()
                            gc.collect()
                    if max_lines and line_number >= max_lines:
                        break
                if progress_dots_output:
                    sys.stderr.write('\n')
                    sys.stderr.flush()
        finally:
            # Only close what was opened here (never STDIN, STDOUT or the direct_input list).
            if f_in_to_be_closed:
                f_in.close()
            if f_out_to_be_closed:
                f_out.close()

    @staticmethod
    def apply_any_offset_to_cached_rom_result(cached_rom_result: str | List[Edge], offset: int = 0) \
            -> str | List[Edge]:
        """Shift a cached romanization result by offset.

        Strings carry no positions and are returned unchanged, as are edge lists for offset 0.
        Otherwise a new list of Edge objects is built with start and end shifted by offset."""
        needs_shift = (offset != 0) and not isinstance(cached_rom_result, str)
        if not needs_shift:
            return cached_rom_result
        shifted_edges = []
        for cached_edge in cached_rom_result:
            shifted_edges.append(Edge(cached_edge.start + offset, cached_edge.end + offset,
                                      cached_edge.txt, cached_edge.type))
        return shifted_edges

    def romanize_string_core(self, s: str, lcode: str | None, rom_format: RomFormat, cache_p: bool,
                             offset: int = 0, **args) -> str | List[Edge]:
        """Script to support token-by-token romanization with caching for higher speed.

        Builds a lattice for s, fills it in a fixed sequence of steps and returns, depending on
        rom_format, all edges, the best edge path (optionally with alternatives) or the best
        romanization string. With cache_p, results are stored in and served from self.rom_cache
        under the key (s, lcode, rom_format). Edge positions are shifted by offset."""
        cache_key = (s, lcode, rom_format)
        if cache_p:
            cached_rom = self.rom_cache.get(cache_key, None)
            if cached_rom is not None:
                return self.apply_any_offset_to_cached_rom_result(cached_rom, offset)
        lat = Lattice(s, uroman=self, lcode=lcode)
        # The order of the following lattice-building steps matters.
        lat.pick_tibetan_vowel_edge(**args)
        lat.prep_braille(**args)
        lat.add_romanization(**args)
        lat.add_numbers(self, **args)
        lat.add_braille_numbers(**args)
        lat.add_rom_fall_back_singles(**args)
        if rom_format == RomFormat.LATTICE:
            edges = lat.all_edges(0, len(s))
            lat.add_alternatives(edges)
        else:
            edges = lat.best_rom_edge_path(0, len(s))
            if rom_format not in (RomFormat.EDGES, RomFormat.ALTS):
                # plain string output: no positions, so no offset to apply
                rom = lat.edge_path_to_surf(edges)
                if cache_p:
                    self.rom_cache[cache_key] = rom
                return rom
            if rom_format == RomFormat.ALTS:
                lat.add_alternatives(edges)
        if cache_p:
            self.rom_cache[cache_key] = edges
        return self.apply_any_offset_to_cached_rom_result(edges, offset)

    def romanize_string(self, s: str, lcode: str | None = None, rom_format: RomFormat = RomFormat.STR, **args) \
            -> str | List[Edge]:
        """Main entry point for romanizing a string. Recommended argument: lcode (language code).

        Unless the keyword argument no_caching is set, s is split at delimiters (runs of '.,; '
        that contain a space or an ideographic full stop); tokens and delimiters are then romanized
        one by one, so that the token-level cache can be used.
        Method returns a string or a list of edges (with start and end offsets)."""
        lcode = lcode or args.get('lcode', None)
        caching = not args.get('no_caching', False)
        if not caching:
            return self.romanize_string_core(s, lcode, rom_format, caching, 0, **args)
        split_pattern = r'(.*?)([.,; ]*[ 。][.,; ]*)(.*)$'
        romanized = '' if rom_format == RomFormat.STR else []
        remainder, position = s, 0
        while True:
            split = regex.match(split_pattern, remainder)
            if not split:
                break
            token, delimiter, remainder = split.group(1, 2, 3)
            # token first, then the delimiter that follows it; position keeps track of the offset in s
            for chunk in (token, delimiter):
                romanized += self.romanize_string_core(chunk, lcode, rom_format, caching, position, **args)
                position += len(chunk)
        romanized += self.romanize_string_core(remainder, lcode, rom_format, caching, position, **args)
        return romanized


class Edge:
    """This class defines edges that span part of a sentence with a specific romanization.
    There might be multiple edges for a given span. The edges in turn are part of the
    romanization lattice."""
    def __init__(self, start: int, end: int, s: str, annotation: str | None = None):
        self.start = start  # start position of the span
        self.end = end  # end position of the span
        self.txt = s  # text of the edge
        self.type = annotation  # annotation (edge type), if any

    def __str__(self):
        return f'[{self.start}-{self.end}] {self.txt} ({self.type})'

    def __repr__(self):
        return str(self)

    def json(self) -> str:  # start - end - text - annotation
        """Return the edge as a JSON array string: [start, end, text, annotation]."""
        return json.dumps([self.start, self.end, self.txt, self.type])

    @staticmethod
    def json_str(rom_result: List[Edge] | str) -> str:
        """Render a romanization result for output.

        A string is returned unchanged. A list is rendered as a JSON array: Edge elements
        contribute their JSON form, any other element (e.g. an already serialized meta edge) its
        str() form. Elements are separated by ', ' so that the result is valid JSON."""
        if isinstance(rom_result, str):
            return rom_result
        elements = (edge.json() if isinstance(edge, Edge) else str(edge) for edge in rom_result)
        return '[' + ', '.join(elements) + ']'


class NumEdge(Edge):
    """Edge for a number. Besides the span and text of a regular edge, it carries the numeric
    value, fraction, base and multiplier, script and type of the number."""
    def __init__(self, start: int, end: int, s: str, uroman: Uroman | None, active: bool = False):
        """For NumEdge, the s argument is in original language (not yet romanized).

        A single-character edge is initialized from uroman.num_props, if the character has an entry
        there; the edge is then marked active. Without a uroman instance, no properties are loaded."""
        # For speed, much of this processing should at some point be cached in data files.
        super().__init__(start, end, s)
        self.orig_txt, self.txt = s, s
        self.value, self.fraction, self.num_base, self.base_multiplier = None, None, None, None
        self.type, self.script, self.is_large_power, self.active = None, None, False, active
        self.n_decimals = None
        self.value_s = None     # precision for 3.14159265358979323846264338327950288419716939937510582097494
        if (start + 1 == end) and s and (uroman is not None):
            if d := uroman.num_props.get(s[0]):
                self.active = True
                self.value = d.get('value')
                fraction_list = d.get('fraction')
                self.fraction = Fraction(fraction_list[0], fraction_list[1]) if fraction_list else None
                self.num_base = d.get('base')
                self.base_multiplier = d.get('mult')
                self.type = d.get('type')
                self.script = d.get('script')
                self.is_large_power = d.get('is-large-power', False)
                self.update()

    def update(self,
               value: int | float | None = None,
               value_s: str | None = None,
               fraction: Fraction | None = None,
               n_decimals: int | None = None,
               num_base: int | None = None,
               base_multiplier: int | float | None = None,
               script: str | None = None,
               e_type: str | None = None,
               orig_txt: str | None = None) -> str:
        """Overwrite the attributes for which a non-None argument is given, then recompute and
        return the edge text: value string and fraction (separated by a space if both are present),
        or the original text if there is neither."""
        self.value = first_non_none(value, self.value)
        self.value_s = first_non_none(value_s, self.value_s)
        self.fraction = first_non_none(fraction, self.fraction)
        self.n_decimals = first_non_none(n_decimals, self.n_decimals)
        self.num_base = first_non_none(num_base, self.num_base)
        self.base_multiplier = first_non_none(base_multiplier, self.base_multiplier)
        self.script = first_non_none(script, self.script)
        self.type = first_non_none(e_type, self.type)
        self.orig_txt = first_non_none(orig_txt, self.orig_txt)
        if self.value_s is not None:
            # an explicit value string takes precedence (it preserves the original precision)
            value_s = self.value_s
        elif self.value is None:
            value_s = ''
        elif isinstance(self.value, float) and (self.n_decimals is not None):
            value_s = f'{self.value:0.{self.n_decimals}f}'
        else:
            value_s = str(self.value)
        fraction_s = '' if self.fraction is None else f'{self.fraction.numerator}/{self.fraction.denominator}'
        delimiter_s = ' ' if value_s and fraction_s else ''
        self.txt = (value_s + delimiter_s + fraction_s) or self.orig_txt
        return self.txt

    def __str__(self):
        """Compact display; inactive edges are marked by a leading ' *'.
        R: romanization, T: type, LP: large power, B: base (with multiplier), V: value,
        VS: value string, F: float format, S: script."""
        if self.num_base is not None:
            if self.base_multiplier is not None:
                b_clause = f'{self.base_multiplier}*{self.num_base}'
            else:
                b_clause = str(self.num_base)
        else:
            b_clause = None
        return (('' if self.active else ' *')
                + f'[{self.start}-{self.end}] {self.orig_txt} R:{self.txt} T:{self.type}'
                + (' LP' if self.is_large_power else '')
                + (f' B:{b_clause}' if (b_clause is not None) else '')
                + (f' V:{self.value}' if ((self.value is not None) and (str(self.value) != self.txt)) else '')
                + (f' VS:{self.value_s}' if ((self.value_s is not None) and (self.value_s != self.txt)) else '')
                + (f' F:.{self.n_decimals}f' if self.n_decimals else '')
                + (f' S:{self.script}' if self.script else ''))


class Lattice:
    """Lattice for a specific romanization instance. Has edges."""
    def __init__(self, s: str, uroman: Uroman, lcode: str = None):
        self.s = s
        self.lcode = lcode
        self.lattice = defaultdict(set)
        self.max_vertex = len(s)
        self.uroman = uroman
        self.props = {}
        self.simple_top_rom_cache = {}
        self.contains_script = defaultdict(bool)
        self.check_for_scripts()

    def check_for_scripts(self):
        for c in self.s:
            script_name = self.uroman.chr_script_name(c)
            self.contains_script[script_name] = True
            if regex.search(r'[\u2800-\u28FF]', self.s):
                self.contains_script['Braille'] = True

    def add_edge(self, edge: Edge):
        self.lattice[(edge.start, edge.end)].add(edge)
        self.lattice[(edge.start, 'right')].add(edge.end)
        self.lattice[(edge.end, 'left')].add(edge.start)

    def __str__(self):
        edges = []
        for start in range(self.max_vertex):
            for end in self.lattice[(start, 'right')]:
                for edge in self.lattice[(start, end)]:
                    edges.append(f'[{start}-{end}] {edge.txt} ({edge.type})')
        return ' '.join(edges)

    @staticmethod
    def char_is_braille(c: str) -> bool:
        return 0x2800 <= ord(c[0]) <= 0x28FF

    # Help Tibet
    def char_is_subjoined_letter(self, c: str) -> bool:
        return "SUBJOINED LETTER" in self.uroman.chr_name(c)

    def char_is_regular_letter(self, c: str) -> bool:
        char_name = self.uroman.chr_name(c)
        return ("LETTER" in char_name) and not ("SUBJOINED" in char_name)

    def char_is_letter(self, c: str) -> bool:
        return "LETTER" in self.uroman.chr_name(c)

    def char_is_vowel_sign(self, c: str) -> bool:
        return self.uroman.dict_bool[('is-vowel-sign', c)]

    def char_is_letter_or_vowel_sign(self, c: str) -> bool:
        return self.char_is_letter(c) or self.char_is_vowel_sign(c)

    def is_at_start_of_word(self, position: int) -> bool:
        # return not regex.match(r'(?:\pL|\pM)', self.s[position-1:position])
        first_char = self.s[position]
        first_char_is_braille = self.char_is_braille(first_char)
        end = position
        if (preceded_by_alpha := self.props.get(('preceded_by_alpha', end), None)) in (True, False):
            return not preceded_by_alpha
        for start in self.lattice[(end, 'left')]:
            for edge in self.lattice[(start, end)]:
                prev_letter = None if edge.txt == '' else edge.txt[-1]
                if len(edge.txt) and (prev_letter.isalpha() or (first_char_is_braille and (prev_letter in ["'"]))):
                    self.props[('preceded_by_alpha', position)] = True
                    return False
        self.props[('preceded_by_alpha', position)] = False
        return True

    def is_at_end_of_word(self, position: int) -> bool:
        if (cached_followed_by_alpha := self.props.get(('followed_by_alpha', position), None)) in (True, False):
            return not cached_followed_by_alpha
        start = position
        while (start+1 < self.max_vertex) \
                and self.uroman.char_is_nonspacing_mark(self.s[start]) \
                and ('NUKTA' in self.uroman.chr_name(self.s[start])):
            start += 1
        for end in range(start + 1, self.max_vertex + 1):
            s = self.s[start:end]
            if not self.uroman.dict_bool[('s-prefix', s)]:
                break
            for rom_rule in self.uroman.rom_rules[s]:
                rom = rom_rule['t']
                if (not rom_rule['use-only-at-start-of-word']) and regex.search(r'\pL', rom):
                    self.props[('followed_by_alpha', position)] = True
                    return False
        self.props[('followed_by_alpha', position)] = False
        return True

    def is_at_end_of_syllable(self, position: int) -> Tuple[bool, str]:
        """Heuristically decide whether a syllable ends at position (at least initially for Thai).

        position: vertex index; self.s[position] is the character that follows the boundary
            being tested and self.s[position-2] is used as the "previous" character.
        Returns a pair (is_end, rationale) where rationale is a short label naming the rule
        that decided the outcome (useful for debugging).

        The tests below are tried in order; the first one that applies determines the result.
        """
        prev_char = self.s[position-2] if position >= 2 else None
        # char = self.s[position-1] if position >= 1 else None
        next_char = self.s[position] if position < self.max_vertex else None
        # If the following character is a tone mark, look past it: adj_position is the
        # index of the character that is actually examined as "next".
        if self.uroman.dict_str[('tone-mark', next_char)]:
            adj_position = position + 1
            next_char = self.s[adj_position] if adj_position < self.max_vertex else None
            # print('TONE-MARK', position, next_char)
        else:
            adj_position = position
        next_char2 = self.s[adj_position + 1] if adj_position + 1 < self.max_vertex else None
        # No character two positions back (position < 2)
        if prev_char is None:
            return False, 'start-of-string'
        if not regex.search(r'(?:\pL|\pM)$', prev_char):  # start of token
            return False, 'start-of-token'
        # Characters tagged 'written-pre-consonant-spoken-post-consonant' in the syllable-info table
        if self.uroman.dict_str[('syllable-info', prev_char)] == 'written-pre-consonant-spoken-post-consonant':
            return False, 'pre-post-vowel-on-left'
        if self.uroman.dict_str[('syllable-info', next_char)] == 'written-pre-consonant-spoken-post-consonant':
            return True, 'pre-post-vowel-on-right'
        if adj_position >= self.max_vertex:  # end of string
            return True, 'end-of-string'
        # if not self.char_is_letter_or_vowel_sign(next_char):  # end of token
        if not regex.match(r'(?:\pL|\pM)', next_char):  # end of token
            return True, 'end-of-token'
        # NOTE(review): position >= 2 is already guaranteed here (prev_char is not None),
        # so this condition is always true at this point.
        if position > 0:
            left_edge = self.best_left_neighbor_edge(position-1)
            if left_edge and regex.search(r'[bcdfghjklmnpqrstvxz]$', left_edge.txt):
                return False, 'consonant-to-the-left'
        # Romanization of what follows: prefer a 2-character span, then 1 character, else "?".
        # simple_search=True returns the plain top candidate without the end-of-syllable
        # adjustment that would call back into this method.
        next_char_rom = first_non_none(self.simple_top_romanization_candidate_for_span(adj_position,
                                                                                       adj_position + 2,
                                                                                       simple_search=True),
                                       self.simple_top_romanization_candidate_for_span(adj_position,
                                                                                       adj_position + 1,
                                                                                       simple_search=True),
                                       "?")
        if not regex.match(r"[aeiou]", next_char_rom.lower()):  # followed by consonant
            return True, f'not-followed-by-vowel {next_char_rom}'
        if (next_char == '\u0E2D') and (next_char2 is not None):  # THAI CHARACTER O ANG
            next_char2_rom = first_non_none(self.simple_top_romanization_candidate_for_span(adj_position+1,
                                                                                            adj_position+2,
                                                                                            simple_search=True),
                                            "?")
            if regex.match(r"[aeiou]", next_char2_rom.lower()):
                return True, 'o-ang-followed-by-vowel'  # In that context Thai char. "o ang" is considered a consonant
        return False, 'not-at-syllable-end-by-default'

    def romanization_by_first_rule(self, s) -> str | None:
        try:
            return self.uroman.rom_rules[s][0]['t']
        except IndexError:
            return None

    def expand_rom_with_special_chars(self, rom: str, start: int, end: int, **args) \
            -> Tuple[str, int, int, str | None]:
        """This method contains a number of special romanization heuristics that typically modify

        an existing or preliminary edge based on context."""
        orig_start = start
        uroman = self.uroman
        full_string = self.s
        annot = None
        if rom == '':
            return rom, start, end, None
        prev_char = (full_string[start-1] if start >= 1 else '')
        first_char = full_string[start]
        last_char = full_string[end-1]
        next_char = (full_string[end] if end < len(full_string) else '')
        # \u2820 is the Braille character indicating that the next letter is upper case
        if (prev_char == '\u2820') and regex.match(r'[a-z]', rom):
            return rom[0].upper() + rom[1:], start-1, end, 'rom exp'
        # Normalize multi-upper case THessalonike -> Thessalonike, but don't change THESSALONIKE
        if start+1 == end and rom.isupper() and next_char.islower():
            ablation = args.get('ablation', '')     # VERBOSE
            if not ('nocap' in ablation):
                rom = rom.capitalize()
        # Japanese small tsu (and Gurmukhi addak) used as consonant doubler:
        if (prev_char and prev_char in 'っッ\u0A71') \
                and (uroman.chr_script_name(prev_char) == uroman.chr_script_name(prev_char)) \
                and (m_double_consonant := regex.match(r'(ch|[bcdfghjklmnpqrstwz])', rom)):
            # return m_double_consonant.group(1).replace('ch', 't') + rom, start-1, end, 'rom exp'
            # expansion might additional apply to the right
            if prev_char in 'っッ':  # for Japanese, per Hepburn, use tch
                rom = m_double_consonant.group(1).replace('ch', 't') + rom
            else:
                rom = m_double_consonant.group(1).replace('ch', 'c') + rom
            start = start-1
            first_char = full_string[start]
            prev_char = (full_string[start-1] if start >= 1 else '')
        # Thai
        if uroman.chr_script_name(first_char) == 'Thai':
            if (start+1 == end) and regex.match(r'[bcdfghjklmnpqrstvwxyz]+$', rom):
                if uroman.dict_str[('syllable-info', prev_char)] == 'written-pre-consonant-spoken-post-consonant':
                    for vowel_prefix_len in [1]:
                        if vowel_prefix_len <= start:
                            for vowel_suffix_len in [3, 2, 1]:
                                if end + vowel_suffix_len <= len(full_string):
                                    pattern = (full_string[start-vowel_prefix_len: start]
                                               + '–'
                                               + full_string[end:end+vowel_suffix_len])
                                    if uroman.rom_rules[pattern]:
                                        vowel_rom_rule = uroman.rom_rules[pattern][0]
                                        vowel_rom = vowel_rom_rule['t']
                                        # print(f" PATTERN {pattern} ({full_string[start:end]}/{rom}) {rom}{vowel_rom}")
                                        return rom + vowel_rom, start-vowel_prefix_len, end+vowel_suffix_len, 'rom exp'
            if (uroman.chr_script_name(prev_char) == 'Thai') \
                    and (uroman.dict_str[('syllable-info', prev_char)]
                         == 'written-pre-consonant-spoken-post-consonant') \
                    and regex.match(r'[bcdfghjklmnpqrstvwxyz]', rom) \
                    and (vowel_rom := self.romanization_by_first_rule(prev_char)):
                return rom + vowel_rom, start-1, end, 'rom exp'
            # THAI CHARACTER O ANG
            if (first_char == '\u0E2D') and (end - start == 1):
                prev_script = uroman.chr_script_name(prev_char)
                next_script = uroman.chr_script_name(next_char)
                prev_rom = self.find_rom_edge_path_backwards(0, start, 1, return_str=True)
                next_rom = self.romanization_by_first_rule(next_char)
                # if not recursive:
                #     lc = uroman.romanize_string(full_string[:start], lcode=self.lcode, recursive=True)
                #     rc = uroman.romanize_string(full_string[end:], lcode=self.lcode, recursive=True)
                #     print('PP', start, end, prev_script, next_script, prev_rom, next_rom, '  LC:', lc[-40:],
                #           '  RC:', rc[:40])
                # delete THAI CHARACTER O ANG unless it is surrounded on both sides by a Thai consonant
                if not ((prev_script == 'Thai') and (next_script == 'Thai')
                        and regex.match(r'[bcdfghjklmnpqrstvwxz]+$', prev_rom)
                        and regex.match(r'[bcdfghjklmnpqrstvwxz]+$', next_rom)):
                    # if not recursive:
                    #     print(f'* DELETE O ANG {first_char} {start}-{end}   LC: {lc[-40:]}  RC: {rc[:40]}')
                    return '', start, end, 'rom del'
        # Coptic: consonant + grace-accent = e + consonant
        if next_char and (next_char == "\u0300") and (uroman.chr_script_name(last_char) == "Coptic")\
                and (not self.simple_top_romanization_candidate_for_span(orig_start, end+1)):
            rom = 'e' + rom
            end = end+1
            last_char = full_string[end - 1]
            next_char = (full_string[end] if end < len(full_string) else '')
            annot = 'rom exp'
        # Japanese small y: ki + small ya = kya etc.
        if (next_char and next_char in 'ゃゅょャュョ') \
                and (uroman.chr_script_name(last_char) == uroman.chr_script_name(next_char)) \
                and regex.search(r'([bcdfghjklmnpqrstvwxyz]i$)', rom) \
                and (y_rom := self.romanization_by_first_rule(next_char)) \
                and (not self.simple_top_romanization_candidate_for_span(orig_start, end+1)) \
                and (not self.simple_top_romanization_candidate_for_span(start, end+1)):
            rom = rom[:-1] + y_rom
            end = end+1
            last_char = full_string[end - 1]
            next_char = (full_string[end] if end < len(full_string) else '')
            annot = 'rom exp'
        # Japanese vowel lengthener (U+30FC)
        last_rom_char = last_chr(rom)
        if (next_char == 'ー') \
                and (uroman.chr_script_name(last_char) in ('Hiragana', 'Katakana')) \
                and (last_rom_char in 'aeiou'):
            return rom + last_rom_char, start, end+1, 'rom exp'
        # Virama (in Indian languages)
        if self.uroman.dict_bool[('is-virama', next_char)]:
            return rom, start, end + 1, "rom exp"
        if rom.startswith(' ') and ((start == 0) or (prev_char == ' ')):
            rom = rom[1:]
        if rom.endswith(' ') and ((end == len(full_string)+1) or (next_char == ' ')):
            rom = rom[:-1]
        return rom, start, end, annot

    def prep_braille(self, **_args) -> None:
        if self.contains_script['Braille']:
            dots6 = '\u2820'  # characters in following word are upper case
            all_caps = False
            for i, c in enumerate(self.s):
                if (i >= 1) and (self.s[i-1] == dots6) and (c == dots6):
                    all_caps = True
                elif all_caps:
                    if c in '\u2800':  # Braille space
                        all_caps = False
                    else:
                        self.props[('is-upper', i)] = True

    def pick_tibetan_vowel_edge(self, **args) -> None:
        if not self.contains_script['Tibetan']:
            return None
        verbose = bool(args.get('verbose'))
        s = self.s
        uroman = self.uroman
        tibetan_syllable = []
        tibetan_letter_positions = []
        for start in range(self.max_vertex):
            c = s[start]
            if (uroman.chr_script_name(c) == 'Tibetan') and self.char_is_letter_or_vowel_sign(c):
                tibetan_letter_positions.append(start)
            else:
                if tibetan_letter_positions:
                    tibetan_syllable.append(tibetan_letter_positions)
                    tibetan_letter_positions = []
        if tibetan_letter_positions:
            tibetan_syllable.append(tibetan_letter_positions)
        for tibetan_letter_positions in tibetan_syllable:
            vowel_pos = None
            orig_txt = ''
            roms = []
            subjoined_letter_positions = []
            first_letter_position = tibetan_letter_positions[0]
            for i in tibetan_letter_positions:
                c = s[i]
                orig_txt += c
                rom = first_non_none(self.simple_top_romanization_candidate_for_span(i, i+1), "?")
                self.props[('edge-vowel', i)] = None
                if self.char_is_vowel_sign(c) or (rom and regex.match(r"[aeiou]+$", rom)):
                    vowel_pos = i
                    self.props[('edge-vowel', i)] = True
                    # delete any syllable initial ' before vowel
                    if roms == ["'"]:
                        self.props[('edge-delete', i-1)] = True
                elif self.char_is_subjoined_letter(c):
                    subjoined_letter_positions.append(i)
                    if i > first_letter_position:
                        if c == "\u0FB0":
                            vowel_pos = i-1
                            self.props[('edge-vowel', i-1)] = True
                        else:
                            self.props[('edge-vowel', i-1)] = False
                    rom = regex.sub(r'([bcdfghjklmnpqrstvwxyz].*)a$', r'\1', rom)
                elif c == "\u0F60":  # Tibetan letter -a (')
                    self.props[('edge-vowel', i)] = False
                    if i > first_letter_position:
                        vowel_pos = i-1
                        self.props[('edge-vowel', i-1)] = True
                        if i == tibetan_letter_positions[-1]:
                            self.props[('edge-delete', i)] = True
                    if roms and not (roms[-1] in "aeiou"):
                        rom = "a'"
                    else:
                        rom = "'"
                else:
                    rom = regex.sub(r'([bcdfghjklmnpqrstvwxyz].*)a$', r'\1', rom)
                roms.append(rom)
            if vowel_pos is not None:
                for i in tibetan_letter_positions:
                    if self.props.get(('edge-vowel', i)) is None:
                        self.props[('edge-vowel', i)] = False
            else:
                best_cost, best_vowel_pos, best_pre, best_post = math.inf, None, None, None
                n_letters = len(tibetan_letter_positions)
                for i in tibetan_letter_positions:
                    rel_pos = i - first_letter_position
                    pre, post = ''.join(roms[:rel_pos+1]), ''.join(roms[rel_pos+1:])
                    if self.props.get(('edge-vowel', i)) is False:
                        cost = 20
                        if cost < best_cost:
                            best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post
                    elif n_letters == 1:
                        cost = 0
                        if cost < best_cost:
                            best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post
                    elif n_letters == 2:
                        cost = 0 if i == 0 else 0.1
                        if cost < best_cost:
                            best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post
                    else:
                        good_suffix = regex.match(r"(?:|[bcdfghjklmnpqrstvwxz]|bh|bs|ch|cs|dd|ddh|"
                                                  r"dh|dz|dzh|gh|gr|gs|kh|khs|kss|n|nn|nt|ms|ng|ngs|ns|ph|"
                                                  r"rm|sh|ss|th|ts|tsh|tt|tth|zh|zhs)'?$", post)
                        good_prefix = regex.match(r"'?(?:.|bd|br|brg|brgy|bs|bsh|bst|bt|bts|by|bz|bzh|"
                                                  r"ch|db|dby|dk|dm|dp|dpy|dr|"
                                                  r"gl|gn|gr|gs|gt|gy|gzh|kh|khr|khy|kr|ky|ld|lh|lt|mkh|mny|mth|mtsh|"
                                                  r"ny|ph|phr|phy|rgy|rk|el|rn|rny|rt|rts|"
                                                  r"sk|skr|sky|sl|sm|sn|sny|sp|spy|sr|st|th|ts|tsh)$", pre)
                        subjoined_suffix = all([x in subjoined_letter_positions
                                                for x in tibetan_letter_positions[rel_pos+2:]])
                        # print('GOOD', good_suffix, good_prefix, subjoined_suffix, f'{pre}a{post}',
                        #       subjoined_letter_positions, tibetan_letter_positions[rel_pos+2:])
                        if good_suffix and good_prefix:
                            cost = len(pre) * 0.1
                        elif good_suffix:
                            cost = len(pre)
                        elif subjoined_suffix and good_prefix:
                            cost = len(pre) * 0.3
                        elif subjoined_suffix:
                            cost = len(pre) * 0.5
                        else:
                            cost = math.inf
                    if cost < best_cost:
                        best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post
                if best_vowel_pos is not None:
                    for i in tibetan_letter_positions:
                        if self.props.get(('edge-vowel', i)) is None:
                            value = (i == best_vowel_pos)
                            self.props[('edge-vowel', i)] = value
                if verbose:
                    best_cost = best_cost if isinstance(best_cost, int) else round(best_cost, 2)
                    sys.stderr.write(f'Tib. best cost: "{best_pre}a{best_post}"  o:{orig_txt}  c:{round(best_cost, 2)}'
                                     f'   p:{best_vowel_pos} {tibetan_letter_positions}\n')

    def add_default_abugida_vowel(self, rom: str, start: int, end: int, annotation: str = '') -> str:
        """Adds an abugida vowel (e.g. "a") where needed. Important for many languages in South Asia.

        rom: preliminary romanization of self.s[start:end].
        start, end: span of the edge in self.s.
        annotation: edge annotation; if it contains 'tail', rom is returned without a vowel decision.

        Returns rom, the bare consonant form (base_rom), or the consonant plus default vowel
        (base_rom_plus_vowel), depending on the script of the first character and on the
        neighboring characters. The context tests below are tried in order; the first one that
        applies decides.

        Best effort: any exception raised during the analysis is caught and rom is returned
        as passed in or as modified so far.
        """
        uroman = self.uroman
        s = self.s
        try:
            first_s_char = s[start]
            last_s_char = s[end-1]
            script_name = uroman.chr_script_name(first_s_char)
            script = self.uroman.scripts[script_name.lower()]
            # Scripts without default vowels are not treated as abugidas here.
            if not (abugida_default_vowels := script['abugida-default-vowels']):
                return rom
            # The split of rom into consonant base and vowel is cached per (script, rom).
            # NOTE(review): the cached mod_rom may have had a trailing '-' stripped, which
            # depends on (start+1 == end); the cache key does not include the span length.
            key = (script, rom)
            if key in uroman.abugida_cache:
                base_rom, base_rom_plus_vowel, mod_rom = uroman.abugida_cache[key]
                rom = mod_rom
            else:
                vowels_regex1 = '|'.join(abugida_default_vowels)   # e.g. 'a' or 'a|o'
                vowels_regex2 = '|'.join(map(lambda x: x + '+', abugida_default_vowels))   # e.g. 'a+' or 'a+|o+'
                # Case 1: (optional consonant +) y + one or more default vowels
                if m := regex.match(fr'([cfghkmnqrstxy]?y)({vowels_regex2})-?$', rom):
                    base_rom = m.group(1)
                    base_rom_plus_vowel = base_rom + m.group(2)
                # Case 2: consonant cluster + a single default vowel
                elif m := regex.match(fr'([bcdfghjklmnpqrstvwxyz]+)({vowels_regex1})-?$', rom):
                    base_rom = m.group(1)
                    base_rom_plus_vowel = base_rom + m.group(2)
                    if rom.endswith('-') and (start+1 == end) and rom[0].isalpha():
                        rom = rom[:-1]
                # Case 3: rom has no default vowel yet; the first default vowel would be added
                else:
                    base_rom = rom
                    base_rom_plus_vowel = base_rom + abugida_default_vowels[0]
                # Only pure consonant bases qualify (plus the apostrophe for Tibetan);
                # otherwise base_rom is set to None, meaning "leave rom alone".
                if (not regex.match(r"[bcdfghjklmnpqrstvwxyz]+$", base_rom)
                        and (not ((script_name == 'Tibetan') and (base_rom == "'")))):
                    base_rom, base_rom_plus_vowel = None, None
                uroman.abugida_cache[key] = (base_rom, base_rom_plus_vowel, rom)
            if base_rom is None:
                return rom
            if 'tail' in annotation:
                return rom
            prev_s_char = s[start-1] if start >= 1 else ''
            next_s_char = s[end] if len(s) > end else ''
            next2_s_char = s[end+1] if len(s) > end+1 else ''
            # Tibetan: the decision is read from the 'edge-delete'/'edge-vowel' entries in self.props.
            if script_name == 'Tibetan':
                if self.props.get(('edge-delete', start)):
                    return ''
                elif self.props.get(('edge-vowel', start)):
                    return base_rom_plus_vowel
                else:
                    return base_rom
            if (next_s_char and ((base_rom in "bcdfghklmnpqrstvwz") or (base_rom in ["ng"]))
                    and (next_s_char in "យ")):  # Khmer yo
                return base_rom
            # No default vowel if an explicit vowel sign, medial consonant sign, subjoined
            # letter or virama follows (directly, or after one nonspacing mark).
            if self.uroman.dict_bool[('is-vowel-sign', next_s_char)]:
                return base_rom
            if self.uroman.dict_bool[('is-medial-consonant-sign', next_s_char)]:
                return base_rom
            if self.char_is_subjoined_letter(next_s_char):
                return base_rom
            if self.uroman.char_is_nonspacing_mark(next_s_char) \
                    and self.uroman.dict_bool[('is-vowel-sign', next2_s_char)]:
                return base_rom
            if self.uroman.dict_bool[('is-virama', next_s_char)]:
                return base_rom
            if self.uroman.char_is_nonspacing_mark(next_s_char) \
                    and self.uroman.dict_bool[('is-virama', next2_s_char)]:
                return base_rom
            # Default vowel after a preceding virama and (usually) at the start of a word
            if self.uroman.dict_bool[('is-virama', prev_s_char)]:
                return base_rom_plus_vowel
            if self.is_at_start_of_word(start) and not regex.search('r[aeiou]', rom):
                return base_rom_plus_vowel
            # delete many final schwas from most Devanagari languages (except: Sanskrit)
            if self.is_at_end_of_word(end):
                if (script_name in ("Devanagari",)) and (self.lcode not in ('san',)):  # Sanskrit
                    return rom
                else:
                    return base_rom_plus_vowel
            if uroman.chr_script_name(prev_s_char) != script_name:
                return base_rom_plus_vowel
            if 'VOCALIC' in self.uroman.chr_name(last_s_char):
                return base_rom
            if uroman.chr_script_name(next_s_char) == script_name:
                return base_rom_plus_vowel
        except Exception:
            # Best effort: on any failure fall back to the romanization at hand.
            return rom
        else:
            pass
            # print('ABUGIDA', rom, start, script_name, script, abugida_default_vowels, prev_s_char, next_s_char)
        return rom

    def cand_is_valid(self, rom_rule: RomRule, start: int, end: int, rom: str) -> bool:
        if rom is None:
            return False
        if rom_rule['dont-use-at-start-of-word'] and self.is_at_start_of_word(start):
            return False
        if rom_rule['use-only-at-start-of-word'] and not self.is_at_start_of_word(start):
            return False
        if rom_rule['dont-use-at-end-of-word'] and self.is_at_end_of_word(end):
            return False
        if rom_rule['use-only-at-end-of-word'] and not self.is_at_end_of_word(end):
            return False
        if rom_rule['use-only-for-whole-word'] \
                and not (self.is_at_start_of_word(start) and self.is_at_end_of_word(end)):
            return False
        if (lcodes := rom_rule['lcodes']) and (self.lcode not in lcodes):
            return False
        return True

    # @profile
    def simple_sorted_romanization_candidates_for_span(self, start, end) -> List[str]:
        s = self.s[start:end]
        if not self.uroman.dict_bool[('s-prefix', s)]:
            return []
        rom_rule_candidates = []
        for rom_rule in self.uroman.rom_rules[s]:
            rom = rom_rule['t']
            if self.cand_is_valid(rom_rule, start, end, rom):
                rom_rule_candidates.append((rom_rule['n-restr'] or 0, rom_rule['t']))
        rom_rule_candidates.sort(reverse=True)
        return [x[1] for x in rom_rule_candidates]

    def simple_top_romanization_candidate_for_span(self, start, end, simple_search: bool = False) -> str | None:
        if (start < 0) or (end > self.max_vertex):
            return None
        span_range = (start, end)
        if (cached_result := self.simple_top_rom_cache.get(span_range)) is not None:
            return cached_result
        best_cand, best_n_restr, best_rom_rule = None, None, None
        for rom_rule in self.uroman.rom_rules[self.s[start:end]]:
            if self.cand_is_valid(rom_rule, start, end, rom_rule['t']):
                n_restr = rom_rule['n-restr'] or 0
                if best_n_restr is None or (n_restr > best_n_restr):
                    best_cand, best_n_restr, best_rom_rule = rom_rule['t'], n_restr, rom_rule
        if simple_search:
            return best_cand
        if best_rom_rule:
            t_at_end_of_syllable = best_rom_rule['t-at-end-of-syllable']
            if t_at_end_of_syllable is not None:
                is_at_end_of_syllable, rationale = self.is_at_end_of_syllable(end)
                if is_at_end_of_syllable:
                    best_cand = t_at_end_of_syllable
                # print(f"   SIMPLE {start}-{end} {best_cand} ({best_rom_rule['t']},{t_at_end_of_syllable}) "
                #       f"END:{is_at_end_of_syllable} ({rationale})")
        self.simple_top_rom_cache[span_range] = best_cand
        # if (best_rom_rule is not None) and ('cancel' in (prov := best_rom_rule['prov'])):
        #     sys.stderr.write(f'   Cancel {self.s} ({start}-{end}) {prov} {self.s[start:end]}\n')
        return best_cand

    def decomp_rom(self, char_position: int) -> str | None:
        """Input: decomposable character such as ﻼ or ½

        Output: la or 1/2"""
        full_string = self.s
        char = full_string[char_position]
        rom = None
        if ud_decomp_s := ud.decomposition(char):
            format_comps = []
            other_comps = []
            decomp_s = ''
            # name = self.uroman.chr_name(char)
            for ud_decomp_elem in ud_decomp_s.split():
                if ud_decomp_elem.startswith("<"):
                    format_comps.append(ud_decomp_elem)
                else:
                    try:
                        norm_char = chr(int(ud_decomp_elem, 16))
                    except ValueError:
                        other_comps.append(ud_decomp_elem)
                    else:
                        decomp_s += norm_char
            if (format_comps and (format_comps[0] not in ('<super>', '<sub>', '<noBreak>', '<compat>'))
                    and (not other_comps) and decomp_s):
                rom = self.uroman.romanize_string(decomp_s, self.lcode)
            # make sure to add a space for 23½ -> 23 1/2
            if rom and ud.numeric(char, None):
                rom = rom.replace('⁄', '/')
                if char_position >= 1 and ud.numeric(full_string[char_position-1], None):
                    rom = ' ' + rom
                if (char_position+1 < len(full_string)) and ud.numeric(full_string[char_position+1], None):
                    rom += ' '
        return rom

    def add_romanization(self, **args):
        """Adds romanization edges to the romanization lattice.

        For each start position, spans of increasing length are tried for as long as the span text
        is registered as an 's-prefix' in self.uroman.dict_bool. Every span that has a romanization
        candidate yields one edge, after Braille upper-casing, '+' tail handling, default abugida
        vowel insertion and special-character expansion have been applied.
        In addition, each single character can receive a Hangul romanization edge
        (U+AC00..U+D7A3) and a character-decomposition edge.

        Keyword arguments are forwarded to expand_rom_with_special_chars(). 'recursive'
        (default: False) is read from them; 'annotation' is always derived from the edge type."""
        # 'annotation' and 'recursive' are passed explicitly below. Forwarding them a second time
        # through ** would raise "TypeError: got multiple values for keyword argument".
        recursive = args.get('recursive', False)
        forwarded_args = {key: value for key, value in args.items() if key not in ('annotation', 'recursive')}
        for start in range(self.max_vertex):
            for end in range(start+1, self.max_vertex+1):
                # No longer span starting here can match once the current span is not a known prefix.
                if not self.uroman.dict_bool[('s-prefix', self.s[start:end])]:
                    break
                if (rom := self.simple_top_romanization_candidate_for_span(start, end)) is not None:
                    if self.contains_script['Braille'] and (start+1 == end):
                        if self.props.get(('is-upper', start)):
                            rom = rom.upper()
                    edge_annotation = 'rom'
                    # A leading '+' marks a tail romanization; the '+' itself is not part of the output.
                    if regex.match(r'\+(m|ng|n|h|r)', rom):
                        rom, edge_annotation = rom[1:], 'rom tail'
                    rom = self.add_default_abugida_vowel(rom, start, end, annotation=edge_annotation)
                    # The expansion may return a different span (start2, end2) and its own annotation.
                    rom, start2, end2, exp_edge_annotation \
                        = self.expand_rom_with_special_chars(rom, start, end, annotation=edge_annotation,
                                                             recursive=recursive, **forwarded_args)
                    edge_annotation = exp_edge_annotation or edge_annotation
                    self.add_edge(Edge(start2, end2, rom, edge_annotation))
            if start < len(self.s):
                char = self.s[start]
                cp = ord(char)
                # Korean Hangul characters
                if 0xAC00 <= cp <= 0xD7A3:
                    if rom := self.uroman.unicode_hangul_romanization(char):
                        self.add_edge(Edge(start, start+1, rom, 'rom'))
                # character decomposition
                if rom_decomp := self.decomp_rom(start):
                    self.add_edge(Edge(start, start + 1, rom_decomp, 'rom decomp'))

    @staticmethod
    def update_edge_list(edges, new_edge, old_edges) -> List[NumEdge]:
        new_edge_not_yet_added = True
        result = []
        for edge in edges:
            if edge in old_edges:
                edge.active = False
                if new_edge_not_yet_added:
                    result.append(new_edge)
                    new_edge_not_yet_added = False
            else:
                result.append(edge)
        if new_edge_not_yet_added:
            result.append(new_edge)
        return result

    @staticmethod
    def edge_is_digit(edge: Edge | None) -> bool:
        """Tells whether edge is a NumEdge of type 'digit' that spans exactly one character
        and carries an integer value between 0 and 9."""
        if not isinstance(edge, NumEdge):
            return False
        value = edge.value
        if value is None or not isinstance(value, int):
            return False
        if edge.type != 'digit':
            return False
        if not (0 <= value <= 9):
            return False
        return edge.end - edge.start == 1

    @staticmethod
    def is_gap_null_edge(edge: Edge) -> bool:
        """Tells whether edge is a NumEdge whose original text is one of the zero characters 零 or 〇."""
        if not isinstance(edge, NumEdge):
            return False
        return edge.orig_txt in ('零', '〇')

    @staticmethod
    def braille_digit(char: str) -> str | None:
        position = '\u281A\u2801\u2803\u2809\u2819\u2811\u280B\u281B\u2813\u280A'.find(char)  # Braille 0-9
        return str(position) if position >= 0 else None

    def add_braille_number(self, start: int, end: int, txt: str, **_args) -> None:
        """Builds a NumEdge of type 'number' with text txt for the span start-end
        and adds it to the lattice."""
        number_edge = NumEdge(start, end, txt, self.uroman)
        number_edge.type = 'number'
        self.add_edge(number_edge)

    def add_braille_numbers(self, **_args):
        """Finds Braille numbers in the string and adds one number edge for each of them.

        A number starts at a Braille number mark (U+283C) and continues over Braille digits,
        periods (U+2832) and commas (U+2802). It ends before the first other character
        or at the end of the string. The edge span includes the number mark."""
        if not self.contains_script['Braille']:
            return
        text = self.s
        punctuation = {'\u2832': '.', '\u2802': ','}  # Braille period, comma
        number_txt, number_start = '', None
        for position, char in enumerate(text):
            if char == '\u283C':  # number mark
                if number_start is None:
                    number_start = position
                continue
            if number_start is None:
                # outside of a number: nothing to collect
                continue
            if digit := self.braille_digit(char):
                number_txt += digit
            elif char in punctuation:
                number_txt += punctuation[char]
            else:
                # any other character terminates the current number
                self.add_braille_number(number_start, position, number_txt)
                number_txt, number_start = '', None
        if number_start is not None:
            self.add_braille_number(number_start, len(text), number_txt)

    def add_numbers(self, uroman, **args):
        """Adds numerical romanization edges (NumEdge) to the romanization lattice.

        Processing steps, in order (labels as used in the comments and edge types below):
          - every character with an entry in uroman.num_props gets its own single-character NumEdge
          - D1: runs of single digits, optionally with one decimal period, are merged (1234, 12.5)
          - G1: a "single digit" is multiplied with a following base (2*100=200, 3*10=30)
          - G2: G1 results are added up within a block (200+30+4=234)
          - G3: a block is multiplied with a following large power (234*1000=234000)
          - G4: G3 blocks are added up (234000+567=234567)
          - F1: a number that directly follows another number is prefixed with a separator
          - single-character edges with a value above 1000 and a few listed characters are deactivated
          - numeric characters that still have no NumEdge get a plain 'num' edge if they are digits 0-9;
            otherwise they are only counted in uroman.stats

        Args:
            uroman: object providing num_props and stats; it is also passed on to every new NumEdge.
            **args: only 'verbose' is read here; if true, new edges are printed as they are built."""
        verbose = bool(args.get('verbose'))
        s = self.s
        # num_edges lists the number edges of the string from left to right. Each merge step below
        # rebinds it to a new list (see update_edge_list) in which the merged sub-edges are replaced
        # by the new edge. A running 'for edge in num_edges' loop keeps iterating over the list it
        # started with, so it still sees superseded edges; those are deactivated by update_edge_list.
        num_edges = []
        for start in range(len(s)):
            char = s[start]
            if uroman.num_props[char]:
                new_edge = NumEdge(start, start + 1, char, uroman)
                num_edges.append(new_edge)
                if verbose:
                    print('NumEdge', new_edge)
                self.add_edge(new_edge)
        # D1 sequence of digits 1234
        for edge in num_edges:
            if self.edge_is_digit(edge) and edge.active:  # and (edge.value != 0):
                n_decimal_points = 0
                n_decimals = None  # number of digits after the decimal period; None if there is no period
                new_value_s = str(edge.value)
                sub_edges = [edge]
                prev_edge = edge
                while True:
                    right_edge = self.best_right_neighbor_edge(prev_edge.end)
                    if self.edge_is_digit(right_edge):
                        sub_edges.append(right_edge)
                        new_value_s += str(right_edge.value)
                        if n_decimals is not None:
                            n_decimals += 1
                        prev_edge = right_edge
                    # A period is part of the number only if it is the first one and a digit follows it.
                    elif ((prev_edge.end < len(s)) and (s[prev_edge.end] == '.') and (n_decimal_points == 0)
                            and (right_edge2 := self.best_right_neighbor_edge(prev_edge.end + 1))
                            and self.edge_is_digit(right_edge2)):
                        # The period might not have an edge of its own yet; create one so that it can be
                        # listed among the sub-edges.
                        if right_edge is None:
                            right_edge = Edge(prev_edge.end, prev_edge.end+1, s[prev_edge.end],
                                              'decimal period')
                            self.add_edge(right_edge)
                        sub_edges.append(right_edge)
                        sub_edges.append(right_edge2)
                        new_value_s += '.' + str(right_edge2.value)
                        n_decimal_points += 1
                        n_decimals = 1
                        prev_edge = right_edge2
                    else:
                        break
                if len(sub_edges) >= 2:
                    new_value = float(new_value_s) if '.' in new_value_s else int(new_value_s)
                    new_edge = NumEdge(sub_edges[0].start, sub_edges[-1].end, str(new_value), uroman, active=True)
                    new_edge.update(value=new_value, value_s=new_value_s, n_decimals=n_decimals, num_base=1, 
                                    e_type='D1', script=sub_edges[-1].script)
                    self.add_edge(new_edge)
                    num_edges = self.update_edge_list(num_edges, new_edge, sub_edges)
                    if verbose:
                        print(new_edge.type, new_edge)
        # G1 combine (*) "single digits" 2*100=200, 3*10= 30
        for edge in num_edges:
            if (isinstance(edge, NumEdge) and edge.active and (edge.num_base == 1)
                    and isinstance(edge.value, int) and (edge.value >= 1)):
                right_edge = self.best_right_neighbor_edge(edge.end, skip_num_edge=False)
                # The multiplier must be an active integer base above 1 that is not a large power
                # (large powers are handled in G3).
                if (right_edge
                        and isinstance(right_edge, NumEdge)
                        and right_edge.active
                        and isinstance(right_edge.value, int)
                        and (right_edge.num_base > 1)
                        and (not right_edge.is_large_power)):
                    new_value = edge.value * right_edge.value
                    new_edge = NumEdge(edge.start, right_edge.end, str(new_value), uroman, active=True)
                    new_edge.update(value=new_value, num_base=right_edge.num_base, e_type='G1',
                                    orig_txt=edge.orig_txt + right_edge.orig_txt,
                                    script=right_edge.script)
                    self.add_edge(new_edge)
                    num_edges = self.update_edge_list(num_edges, new_edge, [edge, right_edge])
                    if verbose:
                        print(new_edge.type, new_edge)
        # G2 combine (+) G1 "single digits" 200+30+4=234 (within larger blocks of 1000, 1000000)
        for edge in num_edges:
            if isinstance(edge, NumEdge) and edge.active and isinstance(edge.value, int) and not edge.is_large_power:
                sub_edges = [edge]
                prev_edge = edge
                # prev_non_edge: the most recent edge that is not a gap null edge (零, 〇)
                prev_non_edge = edge  # None if (edge.orig_txt in '零') else prev_edge
                # Keep adding right neighbors as long as their value and base are both smaller than
                # the base of the last non-gap edge; after a gap null edge any neighbor is accepted.
                while (prev_edge
                       and (right_edge := self.best_right_neighbor_edge(prev_edge.end, skip_num_edge=False))
                       and isinstance(right_edge, NumEdge)
                       and right_edge.active
                       and isinstance(right_edge.value, int)
                       and (not right_edge.is_large_power)
                       and (self.is_gap_null_edge(prev_non_edge)
                            or ((prev_non_edge.num_base > right_edge.value)
                                and (prev_non_edge.num_base > right_edge.num_base)))):
                    sub_edges.append(right_edge)
                    prev_edge = right_edge
                    if not self.is_gap_null_edge(right_edge):
                        prev_non_edge = right_edge
                if len(sub_edges) >= 2:
                    new_value = sum([e.value for e in sub_edges])
                    new_edge = NumEdge(sub_edges[0].start, sub_edges[-1].end, str(new_value), uroman, active=True)

                    new_edge.update(value=new_value, num_base=sub_edges[-1].num_base, e_type='G2',
                                    orig_txt=''.join([e.orig_txt for e in sub_edges]),
                                    script=sub_edges[-1].script)
                    self.add_edge(new_edge)
                    num_edges = self.update_edge_list(num_edges, new_edge, sub_edges)
                    # NOTE(review): presumably redundant with update(e_type='G2') above; the other
                    # steps do not set the type a second time -- confirm against NumEdge.update.
                    new_edge.type = 'G2'
                    if verbose:
                        print(new_edge.type, new_edge)
        # G3 combine (*) G2 blocks with large powers, e.g. 234*1000 = 234000
        for edge in num_edges:
            if (isinstance(edge, NumEdge) and edge.active and (not edge.is_large_power)
                    and (isinstance(edge.value, int) or isinstance(edge.value, float))):
                right_edge = self.best_right_neighbor_edge(edge.end, skip_num_edge=False)
                if (right_edge
                        and isinstance(right_edge, NumEdge)
                        and right_edge.active
                        and isinstance(right_edge.value, int)
                        and (right_edge.num_base > 1)
                        and right_edge.is_large_power):
                    # Round to 5 decimals; a float product that is integral is turned back into an int.
                    new_value = round(edge.value * right_edge.value, 5)
                    if isinstance(new_value, float) and new_value.is_integer():
                        new_value = int(new_value)
                    new_edge = NumEdge(edge.start, right_edge.end, str(new_value), uroman, active=True)
                    new_edge.update(value=new_value, num_base=right_edge.num_base, e_type='G3',
                                    orig_txt=edge.orig_txt + right_edge.orig_txt,
                                    script=right_edge.script)
                    self.add_edge(new_edge)
                    num_edges = self.update_edge_list(num_edges, new_edge, [edge, right_edge])
                    if verbose:
                        print(new_edge.type, new_edge)
        # G4 combine (+) G3 blocks 234000+567=234567
        for edge in num_edges:
            if isinstance(edge, NumEdge) and edge.active and isinstance(edge.value, int):
                sub_edges = [edge]
                # Keep adding right neighbors whose value and base are both smaller than the base of
                # the previously added edge.
                while ((prev_edge := sub_edges[-1])
                       and (right_edge := self.best_right_neighbor_edge(prev_edge.end, skip_num_edge=False))
                       and isinstance(right_edge, NumEdge)
                       and right_edge.active
                       and isinstance(right_edge.value, int)
                       and (prev_edge.num_base > right_edge.value)
                       and (prev_edge.num_base > right_edge.num_base)):
                    # CJK special case: a single-character digit 1-9 directly after a power of ten
                    # >= 1000 is read as a multiple of the next lower power of ten. The digit edge is
                    # modified in place and tagged 'G4tag'; a tagged edge does not trigger this again.
                    if ((prev_edge.script == 'CJK')
                            and (prev_edge.num_base >= 1000)
                            and ('tag' not in prev_edge.type)
                            and regex.match('10+$', str(prev_edge.num_base))
                            and (1 <= right_edge.value <= 9)
                            and (right_edge.start + 1 == right_edge.end)):
                        new_num_base = prev_edge.num_base // 10
                        new_value = new_num_base * right_edge.value
                        # print('DIGIT TAG', prev_edge, right_edge, new_value)
                        right_edge.value = new_value
                        right_edge.num_base = new_num_base
                        right_edge.type = 'G4tag'
                    sub_edges.append(right_edge)
                if len(sub_edges) >= 2:
                    new_value = sum([e.value for e in sub_edges])
                    new_edge = NumEdge(sub_edges[0].start, sub_edges[-1].end, str(new_value), uroman, active=True)
                    new_edge.update(value=new_value, num_base=sub_edges[-1].num_base, e_type='G4',
                                    orig_txt=''.join([e.orig_txt for e in sub_edges]),
                                    script=sub_edges[-1].script)
                    self.add_edge(new_edge)
                    num_edges = self.update_edge_list(num_edges, new_edge, sub_edges)
                    if verbose:
                        print(new_edge.type, new_edge)
        # F1
        for edge in num_edges:
            # cushion fractions with spaces as needed: e.g. 23½ -> 23 1/2 or 十一五 -> 11 5
            # Applies when this edge's text starts with a digit and the text of its best left
            # neighbor ends in a digit: fractions get a space, all other numbers a middle dot.
            if isinstance(edge, NumEdge) and regex.match(r'\d', edge.txt):
                left_edge = self.best_left_neighbor_edge(edge.start)
                if left_edge and regex.search(r'\d$', left_edge.txt):
                    if edge.fraction:
                        sep = ' '
                    else:
                        sep = '·'
                    edge.txt = sep + edge.txt

        # Deactivate single-character edges with a value above 1000 and the characters listed below.
        # NOTE(review): "orig_txt in '兩參参伍陆陸什'" is a substring test on a string, so an empty
        # orig_txt or a multi-character substring would match as well -- confirm that this is intended.
        for edge in num_edges:
            if (isinstance(edge, NumEdge) and edge.active and (edge.value is not None)
                    and (((edge.value > 1000) and (edge.start + 1 == edge.end))
                         or (edge.orig_txt in '兩參参伍陆陸什')
                         or (edge.orig_txt in ('京兆', )))):
                edge.active = False
        if verbose:  # or (num_edges and any([e.type in ['G1', 'G2', 'G3', 'G4'] for e in num_edges])):
            if num_edges:
                print('actives:')
            for num_edge in num_edges:
                print(num_edge)
        # Fallback for numeric characters whose single-character span has no NumEdge as best edge.
        for start in range(len(s)):
            start_char = s[start]
            if (best_edge := self.best_edge_in_span(start, start+1)) and isinstance(best_edge, NumEdge):
                continue
            if (num := ud_numeric(start_char)) is not None:
                name = self.uroman.chr_name(start_char)
                # Characters named as digits with an integer value 0-9 are romanized as that digit.
                if ("DIGIT" in name) and isinstance(num, int) and (0 <= num <= 9):
                    # if start_char not in '0123456789': print('DIGIT', s[start], num, name)
                    self.add_edge(Edge(start, start + 1, str(num), 'num'))
                else:
                    # Other numeric characters get no edge here; they are only counted.
                    uroman.stats[('*NUM', start_char, num)] += 1

    def add_rom_fall_back_singles(self, **_args):
        """Adds a fallback edge for every single character that has no edge of its own yet.

        Nonspacing marks ('Mn'), format characters ('Cf') and private-use characters ('Co') are
        romanized as the empty string. A plain space is kept as is ('orig'). Any other character
        gets its single-character romanization if there is one ('rom single', with a leading '+'
        of a tail romanization removed), and is otherwise copied unchanged ('orig')."""
        for position in range(self.max_vertex):
            next_position = position + 1
            char = self.s[position]
            if self.lattice[(position, next_position)]:
                continue  # already covered by a romanization or number edge
            if self.uroman.char_is_nonspacing_mark(char):
                fallback, annotation = '', 'Mn'
            elif self.uroman.char_is_format_char(char):  # e.g. zero-width non-joiner, zero-width joiner
                fallback, annotation = '', 'Cf'
            elif ud.category(char) == 'Co':
                fallback, annotation = '', 'Co'
            elif char == ' ':
                fallback, annotation = char, 'orig'
            else:
                candidate = self.simple_top_romanization_candidate_for_span(position, next_position)
                if candidate is None:
                    # no romanization available: keep the original character
                    fallback, annotation = char, 'orig'
                else:
                    is_tail = regex.match(r'\+(m|ng|n|h|r)', candidate)
                    fallback = candidate[1:] if is_tail else candidate
                    annotation = 'rom single'
            self.add_edge(Edge(position, next_position, fallback, annotation))

    @staticmethod
    def add_new_edge(old_edges: List[Edge], start: int, end: int, new_rom: str, new_type: str, position: int | None,

                     old_edge_dict: dict)\
            -> None:
        if (start, end, new_rom) not in old_edge_dict:
            new_edge = Edge(start, end, new_rom, new_type)
            if position is None:
                old_edges.append(new_edge)
            else:
                old_edges.insert(position + 1, new_edge)
            old_edge_dict[(start, end, new_rom)] = new_edge
            # print(f'  ALT {start}-{end} {new_rom}')

    def add_alternatives(self, old_edges: List[Edge]) -> None:
        """Extends old_edges in place with alternative romanization edges.

        For every edge that is not itself an alternative (type starting with 'rom-alt'), the valid
        romanization rules of the edge's source span are consulted:
          - 'rom-alt':  the rule's alternatives ('t-alts'), if the edge carries the rule's main romanization
          - 'rom-alt2': the main romanization, if the edge carries it and the rule has an end-of-syllable form
          - 'rom-alt3': the main romanization, if the edge carries the rule's end-of-syllable form
        Alternatives are inserted directly behind the edge they belong to; duplicates are skipped."""
        known_edges = {(edge.start, edge.end, edge.txt): edge for edge in old_edges}
        # old_edges grows during the iteration; inserted edges are skipped by the type check below.
        for index, base_edge in enumerate(old_edges):
            if base_edge.type.startswith('rom-alt'):
                continue   # not old
            start, end = base_edge.start, base_edge.end
            base_rom = base_edge.txt
            for rule in self.uroman.rom_rules[self.s[start:end]]:
                main_rom = rule['t']
                if not self.cand_is_valid(rule, start, end, main_rom):
                    continue
                alternatives = rule['t-alts']
                end_of_syllable_rom = rule['t-at-end-of-syllable']
                if main_rom == base_rom:
                    if alternatives:
                        for alternative in alternatives:
                            self.add_new_edge(old_edges, start, end, alternative, 'rom-alt', index,
                                              known_edges)
                    if end_of_syllable_rom:
                        self.add_new_edge(old_edges, start, end, main_rom, 'rom-alt2', index, known_edges)
                if end_of_syllable_rom == base_rom:
                    self.add_new_edge(old_edges, start, end, main_rom, 'rom-alt3', index, known_edges)

    def all_edges(self, start: int, end: int) -> List[Edge]:
        """Returns all lattice edges that lie completely within the span start-end.

        Args:
            start: first position of the span (inclusive).
            end: end position of the span; only edges ending at or before it are returned.

        Returns:
            The edges ordered by start position and, for the same start position,
            from the longest edge to the shortest."""
        result = []
        for start2 in range(start, end):
            for end2 in sorted(self.lattice[(start2, 'right')], reverse=True):
                # End positions are visited from the largest to the smallest, so an edge that
                # reaches beyond the span must be skipped, not treated as a stop signal:
                # the shorter edges that follow may still fit.
                if end2 > end:
                    continue
                result.extend(self.lattice[(start2, end2)])
        return result

    def best_edge_in_span(self, start: int, end: int, skip_num_edge: bool = False) -> Edge | None:
        """Picks the preferred edge among the edges that cover exactly the span start-end.

        The first active NumEdge wins outright, unless skip_num_edge is set, in which case all
        NumEdges are ignored. Otherwise the first edge of each of the following groups is
        remembered and returned in this order of preference: edges whose type starts with 'rom'
        or 'num' (other than 'rom decomp'), then 'rom decomp' edges, then any other edge.
        Returns None if no edge qualifies."""
        first_rom = first_decomp = first_other = None
        for candidate in self.lattice[(start, end)]:
            if isinstance(candidate, NumEdge):
                if skip_num_edge:
                    continue
                if candidate.active:
                    return candidate
            edge_type = candidate.type
            if edge_type.startswith('rom decomp'):
                if first_decomp is None:
                    first_decomp = candidate
            elif regex.match(r'(?:rom|num)', edge_type):
                if first_rom is None:
                    first_rom = candidate
            elif first_other is None:
                first_other = candidate
        return first_rom or first_decomp or first_other

    def best_right_neighbor_edge(self, start: int, skip_num_edge: bool = False) -> Edge | None:
        """Returns the best edge that starts at position start, preferring longer spans
        over shorter ones; None if there is no such edge."""
        ends_longest_first = sorted(self.lattice[(start, 'right')], reverse=True)
        # lazily evaluated, so spans are only examined until the first hit
        candidates = (self.best_edge_in_span(start, end, skip_num_edge=skip_num_edge)
                      for end in ends_longest_first)
        return next((edge for edge in candidates if edge), None)

    def best_left_neighbor_edge(self, end: int, skip_num_edge: bool = False) -> Edge | None:
        """Returns the best edge that ends at position end, preferring spans that start
        further to the left (i.e. longer spans); None if there is no such edge."""
        starts_leftmost_first = sorted(self.lattice[(end, 'left')])
        # lazily evaluated, so spans are only examined until the first hit
        candidates = (self.best_edge_in_span(start, end, skip_num_edge=skip_num_edge)
                      for start in starts_leftmost_first)
        return next((edge for edge in candidates if edge), None)

    def best_rom_edge_path(self, start: int, end: int, skip_num_edge: bool = False) -> List[Edge]:
        """Finds the best romanization edge path through the romanization lattice, including
        non-romanized pieces such as ASCII and non-ASCII punctuation.

        Starting at start, the best edge to the right is taken repeatedly until end is reached.
        A position without any edge is skipped."""
        path = []
        position = start
        while position < end:
            edge = self.best_right_neighbor_edge(position, skip_num_edge=skip_num_edge)
            if edge:
                path.append(edge)
                position = edge.end
            else:  # should not happen
                position += 1
        return path

    def find_rom_edge_path_backwards(self, start: int, end: int, min_char: int | None = None,
                                     return_str: bool = False, skip_num_edge: bool = False) -> List[Edge] | str:
        """Finds a partial best path on the left from a start position to provide left contexts for
        romanization rules. Can return a string or a list of edges. Is typically used for a short context,
        as specified by min_char.

        Args:
            start: leftmost position the path may reach.
            end: position at which the path ends; the path is built from here to the left.
            min_char: if set, stop as soon as the collected romanization has at least this many characters.
            return_str: return the concatenated romanization instead of the list of edges.
            skip_num_edge: passed on to best_left_neighbor_edge().

        Returns:
            The edges of the path in left-to-right order, or their concatenated text."""
        result_edges = []
        rom = ''
        end2 = end
        while start < end2:
            old_end2 = end2
            if new_edge := self.best_left_neighbor_edge(end2, skip_num_edge=skip_num_edge):
                result_edges.insert(0, new_edge)
                rom = new_edge.txt + rom
                end2 = new_edge.start
            if min_char and len(rom) >= min_char:
                break
            # Step over one character only if no edge moved the position to the left.
            # (Stepping unconditionally would skip the character directly left of every edge found.)
            if end2 >= old_end2:
                end2 -= 1
        if return_str:
            return rom
        else:
            return result_edges

    @staticmethod
    def edge_path_to_surf(edges) -> str:
        result = ''
        for edge in edges:
            result += edge.txt
        return result


# @timer
def main():
    """This function provides a user interface, either using argparse for a command line interface,
    or providing direct function calls.

    First, a uroman object will have to be created, loading uroman data (directory must be provided,
    listed as default). This only needs to be done once.
    After that you can romanize from file to file, or just romanize a string.

    Positional arguments are romanized as strings. File/stdin romanization takes place if an input
    or output filename is given, or if there is nothing else to do. With --profile, profiling
    statistics are written to the given file and garbage collector statistics are printed."""

    # Compute data_dir based on the location of this executable script.
    src_dir = os.path.dirname(os.path.realpath(__file__))
    root_dir = os.path.dirname(src_dir)
    data_dir = os.path.join(root_dir, "data")
    # print(src_dir, root_dir, data)

    parser = argparse.ArgumentParser()
    parser.add_argument('direct_input', nargs='*', type=str)
    parser.add_argument('--data_dir', type=Path, default=data_dir, help='uroman resource dir')
    parser.add_argument('-i', '--input_filename', type=str, help='default: sys.stdin')
    parser.add_argument('-o', '--output_filename', type=str, help='default: sys.stdout')
    parser.add_argument('-l', '--lcode', type=str, default=None,
                        help='ISO 639-3 language code, e.g. eng')
    # parser.add_argument('-f', '--rom_format', type=RomFormat, default=RomFormat.STR, help:'alt: RomFormat.EDGES')
    parser.add_argument('-f', '--rom_format', type=RomFormat, default=RomFormat.STR,
                        choices=list(RomFormat), help="Output format of romanization. 'edges' provides offsets")
    # The remaining arguments are mostly for development and test
    parser.add_argument('--max_lines', type=int, default=None, help='limit uroman to first n lines')
    parser.add_argument('--load_log', action='count', default=0, help='report load stats')
    parser.add_argument('--test', action='count', default=0, help='perform/display a few tests')
    parser.add_argument('-v', '--verbose', action='count', default=0)
    parser.add_argument('--rebuild_ud_props', action='count', default=0,
                        help='rebuild UnicodeDataProps files (for development mode only)')
    parser.add_argument('--rebuild_num_props', action='count', default=0,
                        help='rebuild NumProps file (for development mode only)')
    parser.add_argument('--no_caching', action='count', default=0, help='for development mode: speed')
    parser.add_argument('--silent', action='count', default=0, help='suppress ... progress')
    parser.add_argument('-a', '--ablation', type=str, default='', help='for development mode: nocap')
    parser.add_argument('--stats', action='count', default=0, help='for development mode: numbers')
    parser.add_argument('--ignore_args', action='count', default=0, help='for usage illustration only')
    parser.add_argument(PROFILE_FLAG, type=argparse.FileType('w', encoding='utf-8', errors='ignore'),
                        default=None, metavar='PROFILE-FILENAME', help='(optional output for performance analysis)')
    args = parser.parse_args()
    # copy selected (minor) args from argparse.Namespace to dict
    args_dict = {'rom_format': args.rom_format, 'load_log': args.load_log, 'test': args.test, 'stats': args.stats,
                 'no_caching': args.no_caching, 'max_lines': args.max_lines, 'verbose': args.verbose,
                 'rebuild_ud_props': args.rebuild_ud_props, 'rebuild_num_props': args.rebuild_num_props,
                 'ablation': args.ablation, 'silent': args.silent}
    pr = None
    if args.profile:
        # Imported here because the module-level import only takes place if sys.argv contains the
        # exact token PROFILE_FLAG; argparse also accepts '--profile=FILE' and unambiguous prefixes
        # such as '--prof FILE', which would otherwise end in a NameError.
        import cProfile
        gc.enable()
        # Combine the flags in one call: a second gc.set_debug() call would replace the flags of the first.
        gc.set_debug(gc.DEBUG_STATS | gc.DEBUG_LEAK)
        pr = cProfile.Profile()
        pr.enable()
    '''Sample calls:

uroman.py --help

uroman.py -i ../test/multi-script.txt -o ../test/multi-script-out2.txt

uroman.py  < ../test/multi-script.txt  > ../test/multi-script-out2.txt

uroman.py Игорь

uroman.py Игорь --lcode ukr

uroman.py ألاسكا 서울 Καλιφόρνια

uroman.py ちょっとまってください -f edges

uroman.py "महात्मा गांधी" -f lattice

uroman.py สวัสดี --load_log

uroman.py --test

uroman.py --ignore_args

uroman.py Բարեւ -o ../test/tmp-out.txt -f edges

# In double input cases such as in the line below,

# the input-file's romanization is sent to stdout, while the direct-input romanization is sent to stderr

uroman.py ⴰⵣⵓⵍ -i ../test/multi-script.txt > ../test/multi-script-out2.txt

    '''

    if args.ignore_args:
        # minimal calls
        uroman = Uroman(args.data_dir)
        s, s2, s3, s4 = 'Игорь', 'ちょっとまってください', 'ka‍n‍ne', 'महात्मा गांधी'
        print(s, uroman.romanize_string(s))
        print(s, uroman.romanize_string(s, lcode='ukr'))
        print(s2, Edge.json_str(uroman.romanize_string(s2, rom_format=RomFormat.EDGES)))
        print(s3, Edge.json_str(uroman.romanize_string(s3, rom_format=RomFormat.EDGES)))
        print(s4, Edge.json_str(uroman.romanize_string(s4, rom_format=RomFormat.LATTICE)))
        # Note that ../test/multi-script.txt has several lines starting with ::lcode eng etc.
        # This allows users to select specific language codes to specific lines, overwriting the overall --lcodes
        uroman.romanize_file(input_filename='../test/multi-script.txt',
                             output_filename='../test/multi-script-out3.txt')
    else:
        # build a Uroman object (once for many applications and different scripts and languages)
        uroman = Uroman(args.data_dir, load_log=args.load_log, rebuild_ud_props=args.rebuild_ud_props,
                        rebuild_num_props=args.rebuild_num_props)
        # Romanize a file (or stdin/stdout) if a filename is given or if nothing else was requested.
        romanize_file_p = (args.input_filename or args.output_filename
                           or not (args.direct_input or args.test or args.ignore_args
                                   or args.rebuild_ud_props or args.rebuild_num_props))
        # Romanize any positional arguments, interpreted as strings to be romanized.
        for s in args.direct_input:
            result = uroman.romanize_string(s.rstrip(), lcode=args.lcode, **args_dict)
            result_json = Edge.json_str(result)
            if romanize_file_p:
                # input from both file/stdin (to file/stdout) and direct-input (to stderr)
                if args.input_filename:
                    sys.stderr.write(result_json + '\n')
                # input from direct-input (but not from file/stdin) to stdout
                # else pass
            # no file/stdin or file/stdout, so we write romanization of direct-input to stdout
            else:
                print(result_json)
        # If provided, apply romanization to an entire file.
        if romanize_file_p:
            uroman.romanize_file(args.input_filename, args.output_filename, lcode=args.lcode,
                                 direct_input=args.direct_input, **args_dict)
        if args.test:
            uroman.test_output_of_selected_scripts_and_rom_rules()
            uroman.test_romanization()
        if uroman.stats and args.stats:
            # report at most the first 100 stats entries
            stats100 = {k: uroman.stats[k] for k in list(dict(uroman.stats))[:100]}
            sys.stderr.write(f'Stats: {stats100} ...\n')
    if args.profile:
        if pr:
            pr.disable()
            ps = pstats.Stats(pr, stream=args.profile).sort_stats(pstats.SortKey.TIME)
            ps.print_stats()
        print(gc.get_stats())


# Run the command line interface only if this file is executed as a script (not when it is imported).
if __name__ == "__main__":
    main()