# (removed garbled extraction artifacts that preceded the shebang)
#!/usr/bin/env python | |
""" | |
Written by Ulf Hermjakob, USC/ISI March-April 2024 | |
uroman is a universal romanizer. It converts text in any script to the Latin alphabet. | |
This script is a Python reimplementation of an earlier Perl script, with some improvements. | |
The tool has been tested on 250 languages, with 100 or more sentences each. | |
This script is still under development and large-scale testing. Feedback welcome. | |
This script provides token-size caching (for faster runtimes). | |
Output formats include | |
(1) best romanization string | |
(2) best romanization edges ("best path"; incl. start and end positions with respect to the original string) | |
(3) best romanization with alternatives (as applicable for ambiguous romanization) | |
(4) best romanization full lattice (all edges, including superseded sub-edges) | |
See below for 'sample calls' under main() | |
""" | |
from __future__ import annotations | |
import argparse
from collections import defaultdict
# from memory_profiler import profile
import datetime
from enum import Enum
from fractions import Fraction
import functools
import gc
import json
import math
import os
import pathlib
from pathlib import Path
import pstats
import regex
import sys
from typing import List, Tuple
import unicodedata as ud
PROFILE_FLAG = "--profile" # also used in argparse processing | |
if PROFILE_FLAG in sys.argv: | |
import cProfile | |
# UTILITIES | |
def timer(func):
    """Decorator that logs a function's call, start/end times, and wall-clock duration.

    Fix: uses functools.wraps so the decorated function keeps its
    __name__/__doc__ metadata (the original wrapper hid them)."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = datetime.datetime.now()
        print(f"Calling: {func.__name__}{args}")
        print(f"Start time: {start_time:%A, %B %d, %Y at %H:%M}")
        result = func(*args, **kwargs)
        end_time = datetime.datetime.now()
        time_diff = (end_time-start_time).total_seconds()
        print(f"End time: {end_time:%A, %B %d, %Y at %H:%M}")
        print(f"Duration: {time_diff} seconds")
        return result
    return wrapper
def slot_value_in_double_colon_del_list(line: str, slot: str, default: str | list | None = None) -> str | list | None:
    """For a given slot, e.g. 'cost', get its value from a line such as '::s1 of course ::s2 ::cost 0.3' -> 0.3
    The value can be an empty string, as for ::s2 in the example above."""
    match = regex.match(fr'(?:.*\s)?::{slot}(|\s+\S.*?)(?:\s+::\S.*|\s*)$', line)
    if match:
        return match.group(1).strip()
    return default
def has_value_in_double_colon_del_list(line: str, slot: str) -> bool:
    """True iff ::slot occurs in line (its value may be the empty string)."""
    value = slot_value_in_double_colon_del_list(line, slot)
    return isinstance(value, str)
def dequote_string(s: str) -> str:
    """Strip one pair of matching surrounding quotes ('', "" or “”), if present; otherwise return s unchanged."""
    if not isinstance(s, str):
        return s
    m = regex.match(r'''\s*(['"“])(.*)(['"”])\s*$''', s)
    if m and ((m.group(1) + m.group(3)) in ("''", '""', '“”')):
        return m.group(2)
    return s
def last_chr(s: str) -> str:
    """Return the last character of s, or '' for an empty string.

    Fix: the original's else-branch evaluated '' without returning it,
    so empty input yielded None instead of the documented str."""
    return s[-1] if s else ''
def ud_numeric(char: str) -> int | float | None:
    """Numeric value of a character per Unicode; whole numbers come back as int,
    other values as float, and non-numeric characters as None."""
    try:
        value = ud.numeric(char)
    except (ValueError, TypeError):
        return None
    return int(value) if value.is_integer() else value
def robust_str_to_num(num_s: str, filename: str = None, line_number: int | None = None, silent: bool = False) \
        -> int | float | None:
    """Convert a string to int (or float if it contains '.'); pass numbers through unchanged.

    On failure returns None and, unless silent, writes a diagnostic to stderr.
    Fix: the diagnostic now actually interpolates the filename (the original
    f-string contained no placeholder)."""
    if isinstance(num_s, str):
        try:
            return float(num_s) if "." in num_s else int(num_s)
        except ValueError:
            if not silent:
                sys.stderr.write(f'Cannot convert "{num_s}" to a number')
                if line_number:
                    sys.stderr.write(f' line: {line_number}')
                if filename:
                    sys.stderr.write(f' file: {filename}')
                sys.stderr.write('\n')
    elif isinstance(num_s, (float, int)):
        return num_s
    return None
def first_non_none(*args):
    """Return the first argument that is not None, or None if there is no such argument."""
    return next((arg for arg in args if arg is not None), None)
def any_not_none(*args) -> bool:
    """True iff at least one argument is not None."""
    return any(arg is not None for arg in args)
def add_non_none_to_dict(d: dict, key: str, value) -> None:
    """Store value under key in d, silently skipping None values."""
    if value is None:
        return
    d[key] = value
def fraction_char2fraction(fraction_char: str, fraction_value: float | None = None,
                           uroman: Uroman | None = None) -> Fraction | None:
    """Convert a vulgar-fraction character (e.g. '½') to a Fraction (e.g. Fraction(1, 2)).

    First tries the character's Unicode decomposition (e.g. '<fraction> 0031 2044 0032');
    if that yields nothing and a Uroman instance plus a numeric value are given,
    falls back to mapping the float value back to a numerator/denominator pair.
    Returns None if neither source yields a fraction."""
    s = ''
    fraction = None
    # Rebuild the decomposition as a string, turning hex codepoint tokens back into
    # characters; non-codepoint tokens such as '<fraction>' are appended verbatim.
    for ud_decomp_elem in ud.decomposition(fraction_char).split():
        try:
            s += chr(int(ud_decomp_elem, 16))
        except ValueError:
            s += ud_decomp_elem
    # NOTE: the slash in the pattern is U+2044 FRACTION SLASH, not the ASCII '/'.
    if m := regex.match(r'<fraction>(\d+)⁄(\d+)$', s):
        numerator_s, denominator_s = m.group(1, 2)
        try:
            fraction = Fraction(int(numerator_s), int(denominator_s))
        except ValueError:
            fraction = None
    # Fallback: derive a (numerator, denominator) pair from the numeric value.
    if (fraction is None) and uroman and fraction_value:
        if num_denom := uroman.unicode_float2fraction(fraction_value):
            try:
                fraction = Fraction(num_denom[0], num_denom[1])
            except ValueError:
                fraction = None
    return fraction
def chr_name(char: str) -> str:
    """robust version of ud.name; see related Uroman.char_name() that includes names not included in UnicodeData.txt"""
    try:
        # ud.name's second argument is returned for unnamed codepoints (instead of ValueError)
        return ud.name(char, '')
    except TypeError:
        # non-character input, e.g. a multi-char string or a non-str
        return ''
def args_get(key: str, args: argparse.Namespace | None = None):
    """Look up key in an argparse namespace; None if the key is absent or args is None."""
    if args and (key in args):
        return vars(args)[key]
    return None
class DictClass:
    """Lightweight attribute container used for uroman data records.

    Keyword arguments whose value is None, [] or False (or equal to them, e.g. 0)
    are dropped; underscores in keyword names are stored as hyphens.
    Supports dict-style item access (missing keys yield None) and is truthy
    iff it holds at least one attribute."""
    def __init__(self, **kw_args):
        for key, value in kw_args.items():
            if value not in (None, [], False):
                self.__dict__[key.replace('_', '-')] = value

    def __repr__(self):
        return str(self.__dict__)

    def __getitem__(self, key, default=None):
        return self.__dict__.get(key, default)

    def __bool__(self):
        return bool(self.__dict__)
class RomRule(DictClass):
    """A single romanization rule, stored in Uroman.rom_rules keyed by its source string.

    Typical attributes: s (source), t (target), prov (provenance), lcodes (language codes),
    t_alts (target alternatives), num (numeric value), and positional restrictions such as
    use_only_at_start_of_word, dont_use_at_start_of_word, use_only_at_end_of_word,
    dont_use_at_end_of_word, use_only_for_whole_word."""
    pass
class Script(DictClass):
    """Description of a writing system, stored in Uroman.scripts keyed by lower-cased script name.

    Typical attributes: script_name, direction, abugida_default_vowels,
    alt_script_names, languages."""
    pass
class RomFormat(Enum):
    """Output format of romanization"""
    STR = 'str'          # simple string
    EDGES = 'edges'      # list of edges (includes character offsets in original string)
    ALTS = 'alts'        # lattice including alternative edges
    LATTICE = 'lattice'  # lattice including alternative and superseded edges

    def __str__(self):
        # argparse/print friendly: show the raw value, e.g. 'edges' rather than 'RomFormat.EDGES'
        return self.value
class Uroman:
    """This class loads and maintains uroman data independent of any specific text corpus.
    Typically, only a single instance will be used. (In contrast to multiple lattice instances, one per text.)
    Methods include some testing. And finally methods to romanize a string (romanize_string()) or an entire file
    (romanize_file())."""
def __init__(self, data_dir: Path, **args):  # args: load_log, rebuild_ud_props, rebuild_num_props
    """Initialize all lookup tables and caches, then load the resource files from data_dir."""
    self.data_dir = data_dir
    self.rom_rules = defaultdict(list)    # key: source string -> list of RomRule
    self.scripts = defaultdict(Script)    # key: lower-cased script name -> Script
    self.dict_bool = defaultdict(bool)    # misc boolean properties, keyed by (property, string)
    self.dict_str = defaultdict(str)      # misc string properties, keyed by (property, string)
    self.dict_int = defaultdict(int)      # misc integer properties
    self.dict_num = defaultdict(lambda: None)  # values are int (most common), float, or str ("1/2")
    # num_props key: txt
    # values: {"txt": "\u137b", "rom": "100", "value": 100, "type": "base", "mult": 1, "script": "Ethiopic"}
    self.num_props = defaultdict(dict)
    self.dict_set = defaultdict(set)
    self.float2fraction = {}  # caching
    # GC toggled off during the bulk load — presumably a load-speed optimization; verify before changing.
    gc.disable()
    self.load_resource_files(data_dir, args.get('load_log', False),
                             args.get('rebuild_ud_props', False),
                             args.get('rebuild_num_props', False))
    gc.enable()
    self.hangul_rom = {}   # cache: Hangul char -> romanization
    self.rom_cache = {}    # key: (s, lcode) value: t
    self.stats = defaultdict(int)  # stats, e.g. for unprocessed numbers
    self.abugida_cache = {}  # key: (script, char_rom) value: (base_rom, base_rom_plus_abugida_vowel, modified rom)
def second_rom_filter(self, c: str, rom: str, name: str | None) -> Tuple[str | None, str]:
    """Much of this code will eventually move the old Perl code to generate cleaner primary data"""
    # Only multi-word romanizations need this cleanup pass.
    if not (rom and (' ' in rom)):
        return None, name
    if name is None:
        name = self.chr_name(c)
    if "MYANMAR VOWEL SIGN KAYAH" in name:
        if m := regex.search(r'kayah\s+(\S+)\s*$', rom):
            return m.group(1), name
    if "MENDE KIKAKUI SYLLABLE" in name:
        if m := regex.search(r'm\d+\s+(\S+)\s*$', rom):
            return m.group(1), name
    # Multi-word rom with no specific fix: fall back to the original character.
    if regex.search(r'\S\s+\S', rom):
        return c, name
    return None, name
def load_rom_file(self, filename: str, provenance: str, file_format: str = None, load_log: bool = True):
    """Reads in and processes the 3 main romanization data files: (1) romanization-auto-table.txt
    which was automatically generated from UnicodeData.txt (2) UnicodeDataOverwrite.txt that "corrects"
    some entries in romanization-auto-table.txt and (3) romanization-table.txt which was largely manually
    created and allows complex romanization rules, some for specific languages, some for specific contexts.

    Fixes: (1) the stderr messages now interpolate the filename (the originals
    contained no placeholder); (2) the eval() over local-variable names has been
    replaced with an explicit key/value mapping."""
    n_entries = 0
    try:
        f = open(filename)
    except FileNotFoundError:
        sys.stderr.write(f'Cannot open file {filename}\n')
        return
    with f:
        for line_number, line in enumerate(f, 1):
            if line.startswith('#'):  # comment line
                continue
            if regex.match(r'^\s*$', line):  # blank line
                continue
            line = regex.sub(r'\s{2,}#.*$', '', line)  # strip trailing comment
            if file_format == 'u2r':
                # UnicodeDataOverwrite format: ::u <hex codepoint> ::r <rom> plus misc char properties
                t_at_end_of_syllable = None
                u = dequote_string(slot_value_in_double_colon_del_list(line, 'u'))
                try:
                    cp = int(u, 16)
                    s = chr(cp)
                except ValueError:
                    continue
                t = dequote_string(slot_value_in_double_colon_del_list(line, 'r'))
                if name := slot_value_in_double_colon_del_list(line, 'name'):
                    self.dict_str[('name', s)] = name
                if pic := slot_value_in_double_colon_del_list(line, 'pic'):
                    self.dict_str[('pic', s)] = pic
                if tone_mark := slot_value_in_double_colon_del_list(line, 'tone-mark'):
                    self.dict_str[('tone-mark', s)] = tone_mark
                if syllable_info := slot_value_in_double_colon_del_list(line, 'syllable-info'):
                    self.dict_str[('syllable-info', s)] = syllable_info
            else:
                # standard romanization-table format: ::s <source> ::t <target> plus restrictions
                s = dequote_string(slot_value_in_double_colon_del_list(line, 's'))
                t = dequote_string(slot_value_in_double_colon_del_list(line, 't'))
                t_at_end_of_syllable = dequote_string(slot_value_in_double_colon_del_list(line,
                                                                                         't-end-of-syllable'))
            if (num_s := slot_value_in_double_colon_del_list(line, 'num')) is not None:
                num = robust_str_to_num(num_s)
                self.dict_num[s] = (num_s if (num is None) else num)
            is_minus_sign = has_value_in_double_colon_del_list(line, 'is-minus-sign')
            is_plus_sign = has_value_in_double_colon_del_list(line, 'is-plus-sign')
            is_decimal_point = has_value_in_double_colon_del_list(line, 'is-decimal-point')
            is_large_power = has_value_in_double_colon_del_list(line, 'is-large-power')
            fraction_connector = slot_value_in_double_colon_del_list(line, 'fraction-connector')
            percentage_marker = slot_value_in_double_colon_del_list(line, 'percentage-marker')
            int_frac_connector = slot_value_in_double_colon_del_list(line, 'int-frac-connector')
            lcode_s = slot_value_in_double_colon_del_list(line, 'lcode')
            lcodes = regex.split(r'[,;]\s*', lcode_s) if lcode_s else []
            use_only_at_start_of_word = has_value_in_double_colon_del_list(line, 'use-only-at-start-of-word')
            dont_use_at_start_of_word = has_value_in_double_colon_del_list(line, 'dont-use-at-start-of-word')
            use_only_at_end_of_word = has_value_in_double_colon_del_list(line, 'use-only-at-end-of-word')
            dont_use_at_end_of_word = has_value_in_double_colon_del_list(line, 'dont-use-at-end-of-word')
            use_only_for_whole_word = has_value_in_double_colon_del_list(line, 'use-only-for-whole-word')
            num_s = slot_value_in_double_colon_del_list(line, 'num')
            num = robust_str_to_num(num_s, filename, line_number, silent=False)
            t_alt_s = slot_value_in_double_colon_del_list(line, 't-alt')
            t_alts = regex.split(r'[,;]\s*', t_alt_s) if t_alt_s else []
            t_alts = list(map(dequote_string, t_alts))
            t_mod, name2 = self.second_rom_filter(s, t, None)
            if t_mod and (t_mod != t):
                if t != s:
                    pass  # sys.stderr.write(f'UPDATE: {s} {name2} {t} -> {t_mod}\n')
                t = t_mod
            if s is not None:
                # Explicit mapping replaces the original eval() over local-variable names.
                for bool_key, bool_value in (('is-large-power', is_large_power),
                                             ('is-minus-sign', is_minus_sign),
                                             ('is-plus-sign', is_plus_sign),
                                             ('is-decimal-point', is_decimal_point)):
                    if bool_value:
                        self.dict_bool[(bool_key, s)] = True
                if any_not_none(t, num, is_minus_sign, is_plus_sign, is_decimal_point, is_large_power,
                                fraction_connector, percentage_marker, int_frac_connector):
                    self.register_s_prefix(s)
                    n_entries += 1
                    # if regex.match(r'[\u2800-\u28FF]', s): print("Braille", s, t)
                    restrictions = [lcodes, use_only_at_start_of_word, dont_use_at_start_of_word,
                                    use_only_at_end_of_word, dont_use_at_end_of_word, use_only_for_whole_word]
                    n_restrictions = len([restr for restr in restrictions if restr])
                    provenance2 = provenance
                    # a rule with a numeric value but no target is a pure number rule
                    if (t is None) and (num is not None) and (provenance2 == "rom"):
                        provenance2 = "num"
                    new_rom_rule = RomRule(s=s, t=t, prov=provenance2, lcodes=lcodes, t_alts=t_alts, num=num,
                                           use_only_at_start_of_word=use_only_at_start_of_word,
                                           dont_use_at_start_of_word=dont_use_at_start_of_word,
                                           use_only_at_end_of_word=use_only_at_end_of_word,
                                           dont_use_at_end_of_word=dont_use_at_end_of_word,
                                           use_only_for_whole_word=use_only_for_whole_word,
                                           t_at_end_of_syllable=t_at_end_of_syllable,
                                           n_restr=n_restrictions,
                                           is_minus_sign=is_minus_sign,
                                           is_plus_sign=is_plus_sign,
                                           is_decimal_point=is_decimal_point,
                                           fraction_connector=fraction_connector,
                                           percentage_marker=percentage_marker,
                                           int_frac_connector=int_frac_connector,
                                           is_large_power=is_large_power)
                    # An unrestricted new rule replaces a lone automatic (ud/ow) rule; otherwise it is appended.
                    old_rom_rules = self.rom_rules[s]
                    if ((len(old_rom_rules) == 1) and (old_rom_rules[0]['prov'] in ('ud', 'ow'))
                            and not (lcodes or use_only_at_start_of_word or dont_use_at_start_of_word
                                     or use_only_at_end_of_word or dont_use_at_end_of_word
                                     or use_only_for_whole_word)):
                        self.rom_rules[s] = [new_rom_rule]  # overwrite
                    else:
                        self.rom_rules[s].append(new_rom_rule)
    # Thai
    thai_cancellation_mark = '\u0E4C'
    # cancellation applies to preceding letter incl. any vowel modifier letter (e.g. ศักดิ์สิทธิ์ -> saksit)
    for cp in range(0x0E01, 0x0E4C):  # Thai
        c = chr(cp)
        s = c + thai_cancellation_mark
        new_rom_rule = RomRule(s=s, t='', prov='auto cancel letter')
        if not self.rom_rules[s]:
            self.rom_rules[s] = [new_rom_rule]
            self.register_s_prefix(s)
    thai_consonants = list(map(chr, range(0x0E01, 0x0E2F)))
    thai_vowel_modifiers = ['\u0E31', '\u0E47'] + list(map(chr, range(0x0E33, 0x0E3B)))
    for c1 in thai_consonants:
        for v in thai_vowel_modifiers:
            s = c1 + v + thai_cancellation_mark
            new_rom_rule = RomRule(s=s, t='', prov='auto cancel syllable')
            if not self.rom_rules[s]:
                self.rom_rules[s] = [new_rom_rule]
                self.register_s_prefix(s)
    if load_log:
        sys.stderr.write(f'Loaded {n_entries} from {filename}\n')
def load_script_file(self, filename: str, load_log: bool = True):
    """Reads in (typically from Scripts.txt) information about various scripts such as Devanagari,
    incl. information such as the default abugida vowel letter (e.g. "a").

    Fix: the stderr messages now interpolate the filename (the originals
    contained no placeholder)."""
    n_entries, max_n_script_name_components = 0, 0
    try:
        f = open(filename)
    except FileNotFoundError:
        sys.stderr.write(f'Cannot open file {filename}\n')
        return
    with f:
        for line_number, line in enumerate(f, 1):
            if line.startswith('#'):
                continue
            if regex.match(r'^\s*$', line):  # blank line
                continue
            line = regex.sub(r'\s{2,}#.*$', '', line)
            if script_name := slot_value_in_double_colon_del_list(line, 'script-name'):
                lc_script_name = script_name.lower()
                if lc_script_name in self.scripts:
                    sys.stderr.write(f'** Ignoring duplicate script "{script_name}" '
                                     f'in line {line_number} of {filename}\n')
                else:
                    n_entries += 1
                    direction = slot_value_in_double_colon_del_list(line, 'direction')
                    abugida_default_vowel_s = slot_value_in_double_colon_del_list(line,
                                                                                 'abugida-default-vowel')
                    abugida_default_vowels = regex.split(r'[,;]\s*', abugida_default_vowel_s) \
                        if abugida_default_vowel_s else []
                    alt_script_name_s = slot_value_in_double_colon_del_list(line, 'alt-script-name')
                    alt_script_names = regex.split(r'[,;]\s*', alt_script_name_s) if alt_script_name_s else []
                    language_s = slot_value_in_double_colon_del_list(line, 'language')
                    languages = regex.split(r'[,;]\s*', language_s) if language_s else []
                    new_script = Script(script_name=script_name, alt_script_names=alt_script_names,
                                        languages=languages, direction=direction,
                                        abugida_default_vowels=abugida_default_vowels)
                    self.scripts[lc_script_name] = new_script
                    for language in languages:
                        self.dict_set[('scripts', language)].add(script_name)
                    # alternative script names point to the same Script object
                    for alt_script_name in alt_script_names:
                        lc_alt_script_name = alt_script_name.lower()
                        if lc_alt_script_name in self.scripts:
                            sys.stderr.write(f'** Ignoring duplicate alternative script name "{script_name}" '
                                             f'in line {line_number} of {filename}\n')
                        else:
                            self.scripts[lc_alt_script_name] = new_script
                    # track the longest script name (in words), used when parsing char names
                    n_script_name_components = len(script_name.split())
                    if n_script_name_components > max_n_script_name_components:
                        max_n_script_name_components = n_script_name_components
    if max_n_script_name_components:
        self.dict_int['max_n_script_name_components'] = max_n_script_name_components
    if load_log:
        sys.stderr.write(f'Loaded {n_entries} script descriptions from {filename}'
                         f' (max_n_scripts_name_components: {max_n_script_name_components})\n')
def extract_script_name(self, script_name_plus: str, full_char_name: str = None) -> str | None: | |
"""Using info from Scripts.txt, this script selects the script name from a Unicode, | |
e.g. given "OLD HUNGARIAN CAPITAL LETTER A", extract "Old Hungarian".""" | |
if full_char_name and script_name_plus == full_char_name: | |
return None | |
while script_name_plus: | |
if script_name_plus.lower() in self.scripts: | |
if script := self.scripts[script_name_plus.lower()]: | |
if script_name := script['script-name']: | |
return script_name | |
script_name_plus = regex.sub(r'\s*\S*\s*$', '', script_name_plus) | |
return None | |
def load_unicode_data_props(self, filename: str, load_log: bool = True):
    """Loads Unicode derived data from (1) UnicodeDataProps.txt, (2) UnicodeDataPropsHangul.txt
    and UnicodeDataPropsCJK.txt with a list of valid script-specific characters.

    Fix: the stderr messages now interpolate the filename (the originals
    contained no placeholder)."""
    n_script, n_script_char, n_script_vowel_sign, n_script_medial_consonant_sign, n_script_virama = 0, 0, 0, 0, 0
    try:
        f = open(filename)
    except FileNotFoundError:
        sys.stderr.write(f'Cannot open file {filename}\n')
        return
    with f:
        for line_number, line in enumerate(f, 1):
            if line.startswith('#'):
                continue
            if regex.match(r'^\s*$', line):  # blank line
                continue
            line = regex.sub(r'\s{2,}#.*$', '', line)
            if script_name := slot_value_in_double_colon_del_list(line, 'script-name'):
                n_script += 1
                # each slot value below is a string of characters; iterate char by char
                for char in slot_value_in_double_colon_del_list(line, 'char', []):
                    self.dict_str[('script', char)] = script_name
                    n_script_char += 1
                for char in slot_value_in_double_colon_del_list(line, 'numeral', []):
                    self.dict_str[('script', char)] = script_name
                    n_script_char += 1
                for char in slot_value_in_double_colon_del_list(line, 'vowel-sign', []):
                    self.dict_bool[('is-vowel-sign', char)] = True
                    n_script_vowel_sign += 1
                for char in slot_value_in_double_colon_del_list(line, 'medial-consonant-sign', []):
                    self.dict_bool[('is-medial-consonant-sign', char)] = True
                    n_script_medial_consonant_sign += 1
                for char in slot_value_in_double_colon_del_list(line, 'sign-virama', []):
                    self.dict_bool[('is-virama', char)] = True
                    n_script_virama += 1
    if load_log:
        sys.stderr.write(f'Loaded from {filename} mappings of {n_script_char:,d} characters '
                         f'to {n_script} script{"" if n_script == 1 else "s"}')
        if n_script_vowel_sign or n_script_virama or n_script_medial_consonant_sign:
            sys.stderr.write(f', with a total of {n_script_vowel_sign} vowel signs, '
                             f'{n_script_medial_consonant_sign} medial consonant signs '
                             f'and {n_script_virama} viramas')
        sys.stderr.write('.\n')
def load_num_props(self, filename: str, load_log: bool = True):
    """Loads numeric-character properties from NumProps.jsonl (one JSON object per line,
    keyed by its 'txt' field).

    Fixes: (1) the original docstring was a copy-paste of load_unicode_data_props's;
    (2) the stderr messages now interpolate the filename."""
    n_entries = 0
    try:
        f = open(filename)
    except FileNotFoundError:
        sys.stderr.write(f'Cannot open file {filename}\n')
        return
    with f:
        for line_number, line in enumerate(f, 1):
            if line.startswith('#'):
                continue
            if regex.match(r'^\s*$', line):  # blank line
                continue
            d = json.loads(line)
            if isinstance(d, dict):
                if txt := d.get('txt'):
                    self.num_props[txt] = d
                    n_entries += 1
                else:
                    sys.stderr.write(f'Missing txt in l.{line_number} in file {filename}: {line.strip()}\n')
                # propagate selected boolean flags to the general boolean dictionary
                for bool_key in ('is-large-power',):
                    if d.get(bool_key):
                        self.dict_bool[(bool_key, txt)] = True
            else:
                sys.stderr.write(f'json in l.{line_number} in file {filename} not a dict: {line.strip()}\n')
    if load_log:
        sys.stderr.write(f'Loaded {n_entries} entries from {filename}\n')
def de_accent_pinyin(s: str) -> str: | |
"""De-accents a string from "liú" to "liu" and "ü" to "u" (to help process file Chinese_to_Pinyin.txt).""" | |
result = '' | |
for char in s: | |
if decomp := ud.decomposition(char).split(): | |
try: | |
decomp_chars = [chr(int(x, 16)) for x in decomp] | |
letters = [x for x in decomp_chars if ud.category(x).startswith('L')] | |
except ValueError: | |
sys.stderr.write(f'Cannot decode {decomp}\n') | |
continue | |
if len(letters) == 1: | |
result += letters[0] | |
else: | |
sys.stderr.write(f'Cannot decode {decomp} (expected 1 letter)\n') | |
else: | |
result += char | |
result = result.replace('ü', 'u') | |
return result | |
def register_s_prefix(self, s: str): | |
for prefix_len in range(1, len(s) + 1): | |
self.dict_bool[('s-prefix', s[:prefix_len])] = True | |
def load_chinese_pinyin_file(self, filename: str, load_log: bool = True):
    """Loads file Chinese_to_Pinyin.txt which maps Chinese characters to their Latin form.

    Fixes: (1) the stderr messages now interpolate the filename; (2) the load-log
    message no longer describes the entries as "script descriptions" (copy-paste
    from load_script_file)."""
    n_entries = 0
    try:
        f = open(filename)
    except FileNotFoundError:
        sys.stderr.write(f'Cannot open file {filename}\n')
        return
    with f:
        for line_number, line in enumerate(f, 1):
            if line.startswith('#'):
                continue
            if regex.match(r'^\s*$', line):  # blank line
                continue
            try:
                chinese, pinyin = line.rstrip().split()
                rom = self.de_accent_pinyin(pinyin)
            except ValueError:
                sys.stderr.write(f'Cannot process line {line_number} in file {filename}: {line}')
            else:
                s = chinese
                new_rom_rule = RomRule(s=s, t=rom, prov='rom pinyin', lcodes=[])
                self.rom_rules[chinese].append(new_rom_rule)
                self.register_s_prefix(s)
                n_entries += 1
    if load_log:
        sys.stderr.write(f'Loaded {n_entries} Chinese-pinyin entries from {filename}\n')
def add_char_to_rebuild_unicode_data_dict(d: dict, script_name: str, prop_class: str, char: str): | |
d['script-names'].add(script_name) | |
key = (script_name, prop_class) | |
if key in d: | |
d[key].append(char) | |
else: | |
d[key] = [char] | |
def rebuild_unicode_data_props(self, out_filename: str, cjk: str = None, hangul: str = None):
    """This functions rebuilds UnicodeDataProps*.txt This might be useful when a new UnicodeData.txt
    version is released, or additional information is extracted from Unicode to UnicodeDataProps.txt
    Regular users normally never have to call this function."""
    # d maps (script_name, prop_class) -> list of chars; the special key
    # 'script-names' collects the set of all script names encountered.
    d = {'script-names': set()}
    n_script_refs = 0
    codepoint = -1
    prop_classes = {'char'}
    # Pass 1: scan all Unicode codepoints and bucket each named char by script and property class.
    while codepoint < 0xF0000:
        codepoint += 1
        c = chr(codepoint)
        if not (char_name := self.chr_name(c)):
            continue
        # Property-specific passes; each entry is one property-name component
        # (or a tuple of synonymous components, whose FIRST member names the class).
        for prop_name_comp2 in ('VOWEL SIGN',
                                ('MEDIAL CONSONANT SIGN', 'CONSONANT SIGN MEDIAL', 'CONSONANT SIGN SHAN MEDIAL',
                                 'CONSONANT SIGN MON MEDIAL'),
                                ('SIGN VIRAMA', 'SIGN ASAT', 'AL-LAKUNA', 'SIGN COENG', 'SIGN PAMAAEH',
                                 'CHARACTER PHINTHU'),
                                ('NUMERAL', 'NUMBER', 'DIGIT', 'FRACTION')):
            if prop_name_comp2 and isinstance(prop_name_comp2, tuple):
                prop_list = prop_name_comp2
            else:
                prop_list = (prop_name_comp2,)
            for prop_name_comp in prop_list:
                # e.g. 'VOWEL SIGN' -> prop class 'vowel-sign'
                prop_class = prop_list[0].lower().replace(' ', '-')
                if prop_class not in prop_classes:
                    prop_classes.add(prop_class)
                # Strip the property suffix from the char name to get a script-name candidate.
                script_name_cand = regex.sub(fr'\s+{prop_name_comp}\b.*$', '', char_name)
                if script_name := self.extract_script_name(script_name_cand, char_name):
                    self.add_char_to_rebuild_unicode_data_dict(d, script_name, prop_class, c)
        # Generic 'char' pass: strip common name suffixes to isolate the script name.
        script_name_cand = regex.sub(r'\s+(CONSONANT|LETTER|LIGATURE|SIGN|SYLLABLE|SYLLABICS|VOWEL|'
                                     r'IDEOGRAPH|HIEROGLYPH|POINT|ACCENT|CHARACTER|TIPPI|ADDAK|IRI|URA|'
                                     r'SYMBOL GENITIVE|SYMBOL COMPLETED|SYMBOL LOCATIVE|SYMBOL AFOREMENTIONED|'
                                     r'AU LENGTH MARK)\b.*$', '',
                                     char_name)
        if script_name := self.extract_script_name(script_name_cand, char_name):
            self.add_char_to_rebuild_unicode_data_dict(d, script_name, 'char', c)
            n_script_refs += 1
    # print(sorted(d['script-names']))
    prop_classes = sorted(prop_classes)
    # Pass 2: write out the collected data. CJK and Hangul entries go to their own
    # files when separate filenames are given (they fall back to out_filename otherwise).
    out_filenames = [x for x in [out_filename, cjk, hangul] if x]
    cjk2 = cjk if cjk else out_filename
    hangul2 = hangul if hangul else out_filename
    for out_file in out_filenames:
        try:
            f_out = open(out_file, 'w')
        except OSError:
            sys.stderr.write(f'Cannot write to file {out_file}\n')
            continue
        with f_out:
            for script_name in sorted(d['script-names']):
                if script_name == 'CJK':
                    if out_file != cjk2:
                        continue
                elif script_name == 'Hangul':
                    if out_file != hangul2:
                        continue
                else:
                    if out_file != out_filename:
                        continue
                # one ::script-name line per script, with one ::<prop-class> slot per property
                prop_components = [f"::script-name {script_name}"]
                for prop_class in prop_classes:
                    key = (script_name, prop_class)
                    if key in d:
                        if chars := ''.join(d[key]):
                            if prop_class in ('char',):
                                prop_components.append(f"::n-{prop_class} {len(chars)}")
                            prop_components.append(f"::{prop_class} {chars}")
                f_out.write(f"{' '.join(prop_components)}\n")
    sys.stderr.write(f"Rebuilt {out_filenames} with {n_script_refs} characters "
                     f"for {len(d['script-names'])} scripts.\n")
def rebuild_num_props(self, out_filename: str, err_filename: str):
    """Rebuild NumProps.jsonl (one JSON object per numeric character) from Unicode data
    plus uroman's own tables; characters that cannot be classified go to err_filename.
    Regular users normally never have to call this function."""
    n_out, n_err = 0, 0
    with open(out_filename, 'w') as f_out, open(err_filename, 'w') as f_err:
        codepoint = -1
        while codepoint < 0xF0000:
            codepoint += 1
            char = chr(codepoint)
            num = first_non_none(ud_numeric(char),  # robust ud.numeric
                                 self.num_value(char))  # uroman table includes extra num values, e.g. for Egyptian
            if num is None:
                continue
            result_dict = {}
            orig_txt = char
            value: int | float | None = None  # non-fraction-value(3 1/2) = 3
            fraction: Fraction | None = None  # fraction(3 1/2) = Fraction(1, 2)
            num_base = None  # num_base(500) = 100
            base_multiplier = None  # base_multiplier(500) = 5
            script = None
            is_large_power = self.dict_bool[('is-large-power', char)]
            # num_base is typically a power of 10: 1, 10, 100, 1000, 10000, 100000, 1000000, ...
            # exceptions might include 12 for the 'dozen' in popular English 'two dozen and one' (2*12+1=25)
            # exceptions might include 20 for the 'score' in archaic English 'four score and seven' (4*20+7=87)
            # exceptions might include 20 for the 'vingt' as in standard French 'quatre-vingt-treize' (4*20+13=93)
            if script_name := self.chr_script_name(char):
                script = script_name
            elif char in '0123456789':
                script = 'ascii-digit'
            name = self.chr_name(char)
            exclude_from_number_processing = False
            # Styled/compatibility variants (superscripts, circled digits, Roman numerals, ...)
            # are tagged with a '*'-prefixed pseudo-script and excluded from number processing.
            for scrypt_type in ('SUPERSCRIPT', 'SUBSCRIPT',
                                'CIRCLED', 'PARENTHESIZED', 'SEGMENTED', 'MATHEMATICAL', 'ROMAN NUMERAL',
                                'FULL STOP', 'COMMA'):
                if scrypt_type in name:
                    script = '*' + scrypt_type.lower().replace(' ', '-')
                    exclude_from_number_processing = True
                    break
            for scrypt_type in ('VULGAR FRACTION',):
                if scrypt_type in name:
                    script = scrypt_type.lower().replace(' ', '-')
                    break
            if exclude_from_number_processing:
                continue
            if isinstance(num, int):
                value = num
                if 0 <= num <= 9:
                    num_base = 1
                    base_multiplier = num
                    if "DIGIT" in name:
                        num_type = 'digit'
                    else:
                        # Chinese numbers 零 (0), 一 (1), ... 九 (9) have numeric values,
                        # but are NOT (full) digits
                        num_type = 'digit-like'
                elif m := regex.match(r'([0-9]+?)(0*)$', str(num)):
                    # split e.g. 500 into multiplier 5 and base 100
                    base_multiplier = int(m.group(1))  # non_base_value(500) = 5
                    num_base = int('1' + m.group(2))
                    num_type = 'base' if base_multiplier == 1 else 'multi'
                else:
                    num_type = 'other-int'  # Do such cases exist?
            elif ("FRACTION" in name) and (fraction := fraction_char2fraction(char, num, self)):
                fraction = fraction  # (redundant: the walrus above already bound fraction)
                num_type = 'fraction'
            else:
                num_type = 'other-num'  # Do such cases exist? Yes. Bengali currency numerators, ...
            # Build the JSON record; rom is e.g. '3 1/2', falling back to the char itself.
            value_s = '' if value is None else str(value)
            fraction_s = '' if fraction is None else f'{fraction.numerator}/{fraction.denominator}'
            fraction_list = None if fraction is None else [fraction.numerator, fraction.denominator]
            delimiter_s = ' ' if value_s and fraction_s else ''
            rom = (value_s + delimiter_s + fraction_s) or orig_txt
            add_non_none_to_dict(result_dict, 'txt', orig_txt)
            add_non_none_to_dict(result_dict, 'rom', rom)
            add_non_none_to_dict(result_dict, 'value', value)
            add_non_none_to_dict(result_dict, 'fraction', fraction_list)
            add_non_none_to_dict(result_dict, 'type', num_type)
            if is_large_power:
                result_dict['is-large-power'] = True
            add_non_none_to_dict(result_dict, 'base', num_base)
            add_non_none_to_dict(result_dict, 'mult', base_multiplier)
            add_non_none_to_dict(result_dict, 'script', script)
            if num_type.startswith('other'):
                # unclassified numeric characters go to the rejects file, with name for debugging
                add_non_none_to_dict(result_dict, 'name', name)
                f_err.write(json.dumps(result_dict) + '\n')
                n_err += 1
            else:
                if not script:
                    add_non_none_to_dict(result_dict, 'name', name)
                f_out.write(json.dumps(result_dict) + '\n')
                n_out += 1
    sys.stderr.write(f'Processed {codepoint} codepoints,\n wrote {n_out} lines to {out_filename}\n'
                     f' and {n_err} lines to {err_filename}\n')
def load_resource_files(self, data_dir: Path, load_log: bool = False,
                        rebuild_ud_props: bool = False, rebuild_num_props: bool = False):
    """Loads all resource files needed for romanization.

    Fix: removed the no-op self-assignment 'data_dir = data_dir'."""
    if not isinstance(data_dir, pathlib.Path):
        sys.stderr.write(f'Error: data_dir is of {type(data_dir)}, not a Path.\n'
                         f' Cannot load any resource files.\n')
        return
    # Load order matters: a later unrestricted rule (e.g. from the manual
    # romanization-table.txt) overwrites a lone automatic 'ud'/'ow' rule
    # (see the overwrite logic in load_rom_file).
    self.load_rom_file(os.path.join(data_dir, "romanization-auto-table.txt"),
                       'ud', file_format='rom', load_log=load_log)
    self.load_rom_file(os.path.join(data_dir, "UnicodeDataOverwrite.txt"),
                       'ow', file_format='u2r', load_log=load_log)
    self.load_rom_file(os.path.join(data_dir, "romanization-table.txt"),
                       'man', file_format='rom', load_log=load_log)
    self.load_chinese_pinyin_file(os.path.join(data_dir, "Chinese_to_Pinyin.txt"), load_log=load_log)
    self.load_script_file(os.path.join(data_dir, "Scripts.txt"), load_log=load_log)
    self.load_num_props(os.path.join(data_dir, "NumProps.jsonl"), load_log=load_log)
    for base_file in ("UnicodeDataProps.txt", "UnicodeDataPropsCJK.txt", "UnicodeDataPropsHangul.txt"):
        self.load_unicode_data_props(os.path.join(data_dir, base_file), load_log=load_log)
    if rebuild_ud_props:
        self.rebuild_unicode_data_props(os.path.join(data_dir, "UnicodeDataProps.txt"),
                                        cjk=os.path.join(data_dir, "UnicodeDataPropsCJK.txt"),
                                        hangul=os.path.join(data_dir, "UnicodeDataPropsHangul.txt"))
    if rebuild_num_props:
        self.rebuild_num_props(os.path.join(data_dir, "NumProps.jsonl"),
                               os.path.join(data_dir, "NumPropsRejects.jsonl"))
def unicode_hangul_romanization(self, s: str, pass_through_p: bool = False):
    """Special algorithmic solution to convert (Korean) Hangul characters to the Latin alphabet.

    Decomposes each precomposed Hangul syllable (U+AC00-U+D7A3) into its lead
    consonant, vowel and tail consonant and maps each part to a Latin string.
    Non-Hangul characters are dropped unless pass_through_p is True.
    Results are cached in self.hangul_rom (whole string and per syllable).
    """
    if cached_rom := self.hangul_rom.get(s, None):
        return cached_rom
    leads = "g gg n d dd r m b bb s ss - j jj c k t p h".split()
    vowels = "a ae ya yae eo e yeo ye o wa wai oe yo u weo we wi yu eu yi i".split()
    tails = "- g gg gs n nj nh d l lg lm lb ls lt lp lh m b bs s ss ng j c k t p h".split()
    result = ""
    for c in s:
        cp = ord(c)
        if 0xAC00 <= cp <= 0xD7A3:
            # fix: consult the per-syllable cache (it was written but never read
            # for multi-character input)
            if rom := self.hangul_rom.get(c, None):
                result += rom
                continue
            code = cp - 0xAC00
            # 21 vowels * 28 tails per lead consonant; integer arithmetic
            # instead of float division + int()
            lead_index, rest = divmod(code, 28 * 21)
            vowel_index, tail_index = divmod(rest, 28)
            rom = leads[lead_index] + vowels[vowel_index] + tails[tail_index]
            rom = rom.replace('-', '')  # '-' marks an empty lead/tail slot
            self.hangul_rom[c] = rom
            result += rom
        elif pass_through_p:
            result += c
    return result
def char_is_nonspacing_mark(s) -> bool: | |
""" Checks whether a character is a nonspacing mark, e.g. combining accents, points, vowel signs""" | |
return (len(s) == 1) and (ud.category(s) == 'Mn') | |
def char_is_format_char(s) -> bool: | |
""" Checks whether a character is a formatting character, e.g. a zero-with joiner/non-joiner""" | |
return (len(s) == 1) and (ud.category(s) == 'Cf') | |
def char_is_space_separator(s) -> bool: | |
""" Checks whether a character is a space, | |
e.g. ' ', non-breakable space, en space, ideographic (Chinese) space, Ogham space mark | |
but excluding \t, \r, \n""" | |
return (len(s) == 1) and (ud.category(s) == 'Zs') | |
def chr_name(self, char: str) -> str:
    """Return the Unicode name of a character, falling back to names loaded
    from uroman's own data files; '' if no name is known."""
    try:
        return ud.name(char)
    except (ValueError, TypeError):
        pass
    fallback_name = self.dict_str[('name', char)]
    return fallback_name if fallback_name else ''
def num_value(self, s: str) -> int | float | Fraction | None:
    """rom_rules include numeric values beyond UnicodeData.txt, e.g. for Egyptian numerals"""
    # First rule with a non-None 'num' field wins; None if there is none.
    return next((rom_rule['num'] for rom_rule in self.rom_rules[s]
                 if rom_rule['num'] is not None), None)
def rom_rule_value(self, s: str, key: str):
    """Return the first non-None value stored under 'key' among the
    romanization rules for s; None if no rule provides one."""
    for rule in self.rom_rules[s]:
        value = rule.get(key)
        if value is not None:
            return value
    return None
def unicode_float2fraction(self, num: float, precision: float = 0.000001) -> Tuple[int, int] | None:
    """only for common unicode fractions
    Maps a float such as 0.25 back to a (numerator, denominator) pair such as
    (1, 4), searching only the small sets of numerators and denominators that
    occur in Unicode fraction characters. Successful results are cached in
    self.float2fraction. Returns None if no pair matches within 'precision'."""
    # fix: cache variable was misspelled 'chached_value'
    if cached_value := self.float2fraction.get(num, None):
        return cached_value
    for numerator in (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11):
        for denominator in (2, 3, 4, 5, 6, 8, 12, 16, 20, 32, 40, 64, 80, 160, 320):
            if abs(numerator / denominator - num) < precision:
                result = numerator, denominator
                self.float2fraction[num] = result
                return result
    return None
def chr_script_name(self, char: str) -> str:
    """For letters, diacritics, numerals etc."""
    key = ('script', char)
    return self.dict_str[key]
def test_output_of_selected_scripts_and_rom_rules(self):
    """Low level test function that checks and displays romanization information.

    Prints (to stdout) script metadata, romanization rules, script names,
    per-character properties and numeric-edge renderings for a fixed sample
    of characters. For manual inspection only; nothing is returned.
    """
    output = ''
    # Script-level metadata for two sample scripts.
    for s in ("Oriya", "Chinese"):
        d = self.scripts[s.lower()]
        output += f'SCRIPT {s} {d}\n'
    # Romanization rules for a sample of single and multi-character strings.
    for s in ('ƿ', 'β', 'и', 'μπ', '⠹', '亿', 'ちょ', 'и', '𓍧', '正', '分之', 'ऽ', 'ศ', 'ด์'):
        d = self.rom_rules[s]
        output += f'DICT {s} {d}\n'
    for s in ('ƿ', 'β', 'न', 'ु'):
        output += f'SCRIPT-NAME {s} {self.chr_script_name(s)}\n'
    # Per-character properties: name, numeric value, pictogram, tone mark,
    # syllable info, large-power flag; only non-empty ones are printed.
    for s in ('万', '\uF8F7', '\U00013368', '\U0001308B', '\u0E48', '\u0E40'):
        name = self.chr_name(s)
        num = self.dict_num[s]
        pic = self.dict_str[('pic', s)]
        tone_mark = self.dict_str[('tone-mark', s)]
        syllable_info = self.dict_str[('syllable-info', s)]
        is_large_power = self.dict_bool[('is-large-power', s)]
        output += f'PROPS {s}'
        if name:
            output += f' name: {name}'
        if num:
            output += f' num: {num} ({type(num).__name__})'
        if pic:
            output += f' pic: {pic}'
        if tone_mark:
            output += f' tone-mark: {tone_mark}'
        if syllable_info:
            output += f' syllable-info: {syllable_info}'
        if is_large_power:
            output += f' is-large-power: {is_large_power}'
        output += '\n'
    # Numeral handling across scripts (incl. Mayan, Egyptian, Runic and a
    # private-use code point).
    mayan12 = '\U0001D2EC'
    egyptian600 = '𓍧'
    runic90 = '𐍁'
    klingon2 = '\uF8F2'
    for offset, c in enumerate(f'9九万萬百፲፱፻፸¾0²₂AⅫ⑫൵{runic90}{mayan12}{egyptian600}{klingon2}'):
        output += f'NUM-EDGE: {NumEdge(offset, offset+1, c, self)}\n'
    for s in ('\u00bc', '\u0968'):
        output += f'NUM-PROPS: {self.num_props[s]}\n'
    print(output)
def test_romanization(self, **args):
    """A few full cases of romanization testing.

    Romanizes a handful of sample sentences, then sweeps code points
    U+0000..U+F0000 and reports (to stderr) any single-character
    romanization that mixes whitespace and non-whitespace."""
    tests = [('ألاسكا', None), ('यह एक अच्छा अनुवाद है.', 'hin'), ('ちょっとまってください', 'kor'),
             ('Μπανγκαλόρ', 'ell'), ('Зеленський', 'ukr'), ('കേരളം', 'mal')]
    for test in tests:
        snt = test[0]
        test_lcode = test[1] if len(test) >= 2 else None
        rom = self.romanize_string(snt, lcode=test_lcode, **args)
        sys.stderr.write(f'ROM {snt} -> {rom}\n')
    n_alerts = 0
    for codepoint in range(0xF0000 + 1):
        c = chr(codepoint)
        rom = self.romanize_string(c)
        if regex.search(r'\s', rom) and regex.search(r'\S', rom):
            name = self.chr_name(c)
            sys.stderr.write(f'U+{codepoint:04X} {c} {name} {rom}\n')
            n_alerts += 1
    sys.stderr.write(f'{n_alerts} alerts for roms with spaces\n')
def romanize_file(self, input_filename: str | None = None, output_filename: str | None = None,
                  lcode: str | None = None, direct_input: List[str] = None, **args):
    """Script to apply romanization to an entire file. Input and output files needed.
    Language code (lcode) recommended.

    input_filename: input path; None reads from stdin (unless direct_input is given).
    output_filename: output path; None writes to stdout.
    lcode: default language code; a per-line '::lcode xyz' prefix overrides it.
    direct_input: list of input lines, used instead of a file when input_filename is None.
    Further keyword args (e.g. rom_format, max_lines, silent) are passed through
    to romanize_string.
    """
    f_in_to_be_closed, f_out_to_be_closed = False, False
    # Select the input source: direct lines, a named file, or stdin.
    if direct_input and (input_filename is None):
        f_in = direct_input  # list of lines
    elif isinstance(input_filename, str):
        try:
            f_in = open(input_filename)
            f_in_to_be_closed = True
        except OSError:
            sys.stderr.write(f'Error in romanize_file: Cannot open file {input_filename}\n')
            f_in = None
    elif input_filename is None:
        f_in = sys.stdin
    else:
        sys.stderr.write(f"Error in romanize_file: argument 'input_filename' {input_filename} "
                         f"is of wrong type: {type(input_filename)} (should be str)\n")
        f_in = None
    # Select the output target: a named file or stdout.
    if isinstance(output_filename, str):
        try:
            f_out = open(str(output_filename), 'w')
            f_out_to_be_closed = True
        except OSError:
            sys.stderr.write(f'Error in romanize_file: Cannot write to file {output_filename}\n')
            f_out = None
    elif output_filename is None:
        f_out = sys.stdout
    else:
        sys.stderr.write(f"Error in romanize_file: argument 'output_filename' {output_filename} "
                         f"is of wrong type: {type(output_filename)} (should be str)\n")
        f_out = None
    if f_in and f_out:
        max_lines = args.get('max_lines', None)
        progress_dots_output = False
        for line_number, line in enumerate(f_in, 1):
            # A '::lcode xyz' prefix overrides the default language code for this line.
            if m := regex.match(r'(::lcode\s+)([a-z]{3})(\s+)(.*?)\s*$', line):
                lcode_kw, lcode2, space, snt = m.group(1, 2, 3, 4)
                rom_result = self.romanize_string(snt, lcode2 or lcode, **args)
                if args.get('rom_format', None) == RomFormat.STR:
                    lcode_prefix = f"{lcode_kw}{lcode2}{space}"
                    f_out.write(lcode_prefix + rom_result + '\n')
                else:
                    lcode_prefix = f'[0, 0, "", "lcode: {lcode2}"]'  # meta edge with lcode info
                    prefixed_edges = [lcode_prefix] + self.romanize_string(snt, lcode2 or lcode, **args)
                    f_out.write(Edge.json_str(prefixed_edges) + '\n')
            else:
                f_out.write(Edge.json_str(self.romanize_string(line.rstrip(), lcode, **args)) + '\n')
            if not args.get('silent'):
                # Progress indicator: a dot per 100 lines, the line number per 1000.
                if line_number % 100 == 0:
                    if line_number % 1000 == 0:
                        sys.stderr.write(str(line_number))
                    else:
                        sys.stderr.write('.')
                    progress_dots_output = True
                    sys.stderr.flush()
                    gc.collect()
            if max_lines and line_number >= max_lines:
                break
        if progress_dots_output:
            sys.stderr.write('\n')
            sys.stderr.flush()
    if f_in_to_be_closed:
        f_in.close()
    if f_out_to_be_closed:
        f_out.close()
def apply_any_offset_to_cached_rom_result(cached_rom_result: str | List[Edge], offset: int = 0) \ | |
-> str | List[Edge]: | |
if isinstance(cached_rom_result, str): | |
return cached_rom_result | |
elif offset == 0: | |
return cached_rom_result | |
else: | |
return [Edge(edge.start + offset, edge.end + offset, edge.txt, edge.type) for edge in cached_rom_result] | |
def romanize_string_core(self, s: str, lcode: str | None, rom_format: RomFormat, cache_p: bool,
                         offset: int = 0, **args) -> str | List[Edge]:
    """Script to support token-by-token romanization with caching for higher speed.

    Builds a romanization lattice for s and returns, depending on rom_format:
    the best romanization string (STR), the best edge path (EDGES/ALTS), or
    all edges (LATTICE). When cache_p, results are cached under
    (s, lcode, rom_format); cached edge results are re-based by 'offset'.
    """
    if cache_p:
        cached_rom = self.rom_cache.get((s, lcode, rom_format), None)
        if cached_rom is not None:
            return self.apply_any_offset_to_cached_rom_result(cached_rom, offset)
    lat = Lattice(s, uroman=self, lcode=lcode)
    # Populate the lattice: script-specific preparation first, then general
    # romanization, number handling, and single-character fallbacks.
    lat.pick_tibetan_vowel_edge(**args)
    lat.prep_braille(**args)
    lat.add_romanization(**args)
    lat.add_numbers(self, **args)
    lat.add_braille_numbers(**args)
    lat.add_rom_fall_back_singles(**args)
    if rom_format == RomFormat.LATTICE:
        # Full lattice: every edge (incl. superseded sub-edges) plus alternatives.
        all_edges = lat.all_edges(0, len(s))
        lat.add_alternatives(all_edges)
        if cache_p:
            self.rom_cache[(s, lcode, rom_format)] = all_edges
        result = self.apply_any_offset_to_cached_rom_result(all_edges, offset)
    else:
        best_edges = lat.best_rom_edge_path(0, len(s))
        if rom_format in (RomFormat.EDGES, RomFormat.ALTS):
            if rom_format == RomFormat.ALTS:
                lat.add_alternatives(best_edges)
            if cache_p:
                self.rom_cache[(s, lcode, rom_format)] = best_edges
            result = self.apply_any_offset_to_cached_rom_result(best_edges, offset)
        else:
            # STR format: linearize the best path to a plain string.
            rom = lat.edge_path_to_surf(best_edges)
            del lat
            if cache_p:
                self.rom_cache[(s, lcode, rom_format)] = rom
            result = rom
    return result
def romanize_string(self, s: str, lcode: str | None = None, rom_format: RomFormat = RomFormat.STR, **args) \
        -> str | List[Edge]:
    """Main entry point for romanizing a string. Recommended argument: lcode (language code).
    recursive only used for development.
    Method returns a string or a list of edges (with start and end offsets)."""
    lcode = lcode or args.get('lcode', None)
    # print('rom::', s, 'lcode:', lcode, 'print-lattice:', print_lattice_p)
    # with caching (for string format output only for now)
    if cache_p := not args.get('no_caching', False):
        # Split the input at delimiters (a space or an ideographic full stop,
        # with adjacent punctuation) and romanize piece by piece so repeated
        # tokens hit the cache; 'offset' keeps edge positions aligned with
        # the full original string.
        rest, offset = s, 0
        result = '' if rom_format == RomFormat.STR else []
        while m3 := regex.match(r'(.*?)([.,; ]*[ 。][.,; ]*)(.*)$', rest):
            pre, delimiter, rest = m3.group(1, 2, 3)
            result += self.romanize_string_core(pre, lcode, rom_format, cache_p, offset, **args)
            offset += len(pre)
            result += self.romanize_string_core(delimiter, lcode, rom_format, cache_p, offset, **args)
            offset += len(delimiter)
        # Remaining tail after the last delimiter.
        result += self.romanize_string_core(rest, lcode, rom_format, cache_p, offset, **args)
        return result
    else:
        return self.romanize_string_core(s, lcode, rom_format, cache_p, 0, **args)
class Edge:
    """This class defines edges that span part of a sentence with a specific romanization.
    There might be multiple edges for a given span. The edges in turn are part of the
    romanization lattice."""
    def __init__(self, start: int, end: int, s: str, annotation: str | None = None):
        self.start = start       # start position in the original string (inclusive)
        self.end = end           # end position in the original string (exclusive)
        self.txt = s             # romanized text for the span
        self.type = annotation   # annotation such as 'rom', 'rom exp', 'num' (may be None)

    def __str__(self):
        return f'[{self.start}-{self.end}] {self.txt} ({self.type})'

    def __repr__(self):
        return str(self)

    def json(self) -> str:  # start - end - text - annotation
        return json.dumps([self.start, self.end, self.txt, self.type])

    @staticmethod
    def json_str(rom_result: List[Edge] | str) -> str:
        """Serialize a romanization result for output: strings pass through
        unchanged; a list is rendered as a bracketed concatenation of each
        edge's JSON form (pre-serialized string entries are included as-is)."""
        # fix: @staticmethod added — json_str takes no 'self' and is called as
        # Edge.json_str(...).
        if isinstance(rom_result, str):
            return rom_result
        else:
            result = '['
            for edge in rom_result:
                if isinstance(edge, Edge):
                    result += edge.json()
                else:
                    result += str(edge)
            result += ']'
            return result
class NumEdge(Edge):
    """Edge subclass for numbers/numerals. In addition to the span, it carries
    the numeric value and related properties (fraction, positional base,
    script); self.txt holds the rendered number."""
    def __init__(self, start: int, end: int, s: str, uroman: Uroman | None, active: bool = False):
        """For NumEdge, the s argument is in original language (not yet romanized)."""
        # For speed, much of this processing should at some point be cached in data files.
        Edge.__init__(self, start, end, s)
        self.orig_txt, self.txt = s, s
        # numeric value, fractional value, positional base (e.g. 10, 100), base multiplier
        self.value, self.fraction, self.num_base, self.base_multiplier = None, None, None, None
        self.type, self.script, self.is_large_power, self.active = None, None, False, active
        self.n_decimals = None
        self.value_s = None  # precision for 3.14159265358979323846264338327950288419716939937510582097494
        if start+1 == end:
            # Single-character span: initialize all numeric properties from
            # uroman's num_props table (built from NumProps.jsonl).
            # NOTE(review): assumes uroman is not None on this path — confirm callers.
            char = s[0]
            if d := uroman.num_props.get(char):
                self.active = True
                self.value = d.get('value')
                fraction_list = d.get('fraction')
                self.fraction = Fraction(fraction_list[0], fraction_list[1]) if fraction_list else None
                self.num_base = d.get('base')
                self.base_multiplier = d.get('mult')
                self.type = d.get('type')
                self.script = d.get('script')
                self.is_large_power = d.get('is-large-power')
                self.update()

    def update(self,
               value: int | float | None = None,
               value_s: str | None = None,
               fraction: Fraction | None = None,
               n_decimals: int | None = None,
               num_base: int | None = None,
               base_multiplier: int | float | None = None,
               script: str | None = None,
               e_type: str | None = None,
               orig_txt: str | None = None) -> str:
        """Overwrite any subset of the numeric properties (None arguments keep
        the current values), then re-render and return self.txt."""
        self.value = first_non_none(value, self.value)
        self.value_s = first_non_none(value_s, self.value_s)
        self.fraction = first_non_none(fraction, self.fraction)
        self.n_decimals = first_non_none(n_decimals, self.n_decimals)
        self.num_base = first_non_none(num_base, self.num_base)
        self.base_multiplier = first_non_none(base_multiplier, self.base_multiplier)
        self.script = first_non_none(script, self.script)
        self.type = first_non_none(e_type, self.type)
        self.orig_txt = first_non_none(orig_txt, self.orig_txt)
        # Render the value part: an explicit value_s wins, then a fixed-decimal
        # float rendering, then plain str() of the value.
        if self.value_s is not None:
            value_s = self.value_s
        elif self.value is None:
            value_s = ''
        elif isinstance(self.value, float) and (self.n_decimals is not None):
            value_s = first_non_none(self.value_s, f'{self.value:0.{self.n_decimals}f}')
        else:
            value_s = str(self.value)
        fraction_s = '' if self.fraction is None else f'{self.fraction.numerator}/{self.fraction.denominator}'
        delimiter_s = ' ' if value_s and fraction_s else ''
        # Fall back to the original text if nothing numeric could be rendered.
        self.txt = (value_s + delimiter_s + fraction_s) or self.orig_txt
        return self.txt

    def __str__(self):
        # Compact debug rendering; inactive edges are prefixed with ' *'.
        if self.num_base is not None:
            if self.base_multiplier is not None:
                b_clause = f'{self.base_multiplier}*{self.num_base}'
            else:
                b_clause = str(self.num_base)
        else:
            b_clause = None
        return (('' if self.active else ' *')
                + f'[{self.start}-{self.end}] {self.orig_txt} R:{self.txt} T:{self.type}'
                + (' LP' if self.is_large_power else '')
                + (f' B:{b_clause}' if (b_clause is not None) else '')
                + (f' V:{self.value}' if ((self.value is not None) and (str(self.value) != self.txt)) else '')
                + (f' VS:{self.value_s}' if ((self.value_s is not None) and (self.value_s != self.txt)) else '')
                + (f' F:.{self.n_decimals}f' if self.n_decimals else f'')
                + (f' S:{self.script}' if self.script else ''))
class Lattice: | |
"""Lattice for a specific romanization instance. Has edges.""" | |
def __init__(self, s: str, uroman: Uroman, lcode: str = None):
    """Build an (initially empty) romanization lattice over string s."""
    self.s = s              # the original string to be romanized
    self.lcode = lcode      # language code, e.g. 'hin' (may be None)
    # (start, end) -> set of Edges; (pos, 'right')/(pos, 'left') -> neighbor vertices
    self.lattice = defaultdict(set)
    self.max_vertex = len(s)
    self.uroman = uroman    # back-pointer to the Uroman object holding all resources
    self.props = {}         # per-position properties, e.g. ('is-upper', i), ('edge-vowel', i)
    self.simple_top_rom_cache = {}
    self.contains_script = defaultdict(bool)   # script name -> occurs in s?
    self.check_for_scripts()
def check_for_scripts(self):
    """Record, for each script name, whether the input string contains it
    (in self.contains_script); Braille is detected by code-point range."""
    for c in self.s:
        script_name = self.uroman.chr_script_name(c)
        self.contains_script[script_name] = True
    # perf: direct code-point range test (U+2800-U+28FF) instead of an extra
    # regex scan over the string; same result.
    if any('\u2800' <= c <= '\u28FF' for c in self.s):
        self.contains_script['Braille'] = True
def add_edge(self, edge: Edge): | |
self.lattice[(edge.start, edge.end)].add(edge) | |
self.lattice[(edge.start, 'right')].add(edge.end) | |
self.lattice[(edge.end, 'left')].add(edge.start) | |
def __str__(self):
    """Linearize all edges, left to right, as one space-separated string
    (edge order within a span follows set iteration order)."""
    pieces = []
    for start in range(self.max_vertex):
        for end in self.lattice[(start, 'right')]:
            pieces.extend(f'[{start}-{end}] {edge.txt} ({edge.type})'
                          for edge in self.lattice[(start, end)])
    return ' '.join(pieces)
def char_is_braille(c: str) -> bool: | |
return 0x2800 <= ord(c[0]) <= 0x28FF | |
# Help Tibet
def char_is_subjoined_letter(self, c: str) -> bool:
    """True for subjoined (stacked) letters, per the Unicode character name."""
    char_name = self.uroman.chr_name(c)
    return "SUBJOINED LETTER" in char_name
def char_is_regular_letter(self, c: str) -> bool:
    """True for letters that are not subjoined, per the Unicode character name."""
    char_name = self.uroman.chr_name(c)
    return "LETTER" in char_name and "SUBJOINED" not in char_name
def char_is_letter(self, c: str) -> bool:
    """True iff the Unicode character name contains 'LETTER' (incl. subjoined letters)."""
    char_name = self.uroman.chr_name(c)
    return "LETTER" in char_name
def char_is_vowel_sign(self, c: str) -> bool:
    """True iff uroman's data files mark c as a (dependent) vowel sign."""
    key = ('is-vowel-sign', c)
    return self.uroman.dict_bool[key]
def char_is_letter_or_vowel_sign(self, c: str) -> bool:
    """True iff c is a letter or a (dependent) vowel sign."""
    if self.char_is_letter(c):
        return True
    return self.char_is_vowel_sign(c)
def is_at_start_of_word(self, position: int) -> bool:
    """True iff no edge ending at 'position' ends in an alphabetic character,
    i.e. nothing word-like immediately precedes this position.
    The result is cached in self.props under ('preceded_by_alpha', position)."""
    # return not regex.match(r'(?:\pL|\pM)', self.s[position-1:position])
    first_char = self.s[position]
    first_char_is_braille = self.char_is_braille(first_char)
    end = position
    # Cached result? (None means not yet computed.)
    if (preceded_by_alpha := self.props.get(('preceded_by_alpha', end), None)) in (True, False):
        return not preceded_by_alpha
    # Inspect every edge that ends exactly at this position.
    for start in self.lattice[(end, 'left')]:
        for edge in self.lattice[(start, end)]:
            prev_letter = None if edge.txt == '' else edge.txt[-1]
            # For Braille input, a preceding apostrophe also counts as word-internal.
            if len(edge.txt) and (prev_letter.isalpha() or (first_char_is_braille and (prev_letter in ["'"]))):
                self.props[('preceded_by_alpha', position)] = True
                return False
    self.props[('preceded_by_alpha', position)] = False
    return True
def is_at_end_of_word(self, position: int) -> bool:
    """True iff no romanization rule starting at 'position' would contribute
    further letters to the current word.
    The result is cached in self.props under ('followed_by_alpha', position)."""
    if (cached_followed_by_alpha := self.props.get(('followed_by_alpha', position), None)) in (True, False):
        return not cached_followed_by_alpha
    start = position
    # Skip over nukta diacritics (nonspacing marks) before testing rules.
    while (start+1 < self.max_vertex) \
            and self.uroman.char_is_nonspacing_mark(self.s[start]) \
            and ('NUKTA' in self.uroman.chr_name(self.s[start])):
        start += 1
    # Grow candidate strings to the right as long as they remain a prefix of
    # some rule source ('s-prefix'); any applicable rule with a letter in its
    # romanization means the word continues.
    for end in range(start + 1, self.max_vertex + 1):
        s = self.s[start:end]
        if not self.uroman.dict_bool[('s-prefix', s)]:
            break
        for rom_rule in self.uroman.rom_rules[s]:
            rom = rom_rule['t']
            if (not rom_rule['use-only-at-start-of-word']) and regex.search(r'\pL', rom):
                self.props[('followed_by_alpha', position)] = True
                return False
    self.props[('followed_by_alpha', position)] = False
    return True
def is_at_end_of_syllable(self, position: int) -> Tuple[bool, str]:
    """At least initially for Thai
    Decides whether 'position' is a syllable boundary. Returns a tuple
    (decision, reason); the reason string is for diagnostics only."""
    prev_char = self.s[position-2] if position >= 2 else None
    # char = self.s[position-1] if position >= 1 else None
    next_char = self.s[position] if position < self.max_vertex else None
    # Skip over a tone mark when inspecting the following character.
    if self.uroman.dict_str[('tone-mark', next_char)]:
        adj_position = position + 1
        next_char = self.s[adj_position] if adj_position < self.max_vertex else None
        # print('TONE-MARK', position, next_char)
    else:
        adj_position = position
    next_char2 = self.s[adj_position + 1] if adj_position + 1 < self.max_vertex else None
    if prev_char is None:
        return False, 'start-of-string'
    if not regex.search(r'(?:\pL|\pM)$', prev_char):  # start of token
        return False, 'start-of-token'
    # Thai has vowels that are written before, but spoken after, a consonant;
    # such a vowel on the left means the syllable is still open.
    if self.uroman.dict_str[('syllable-info', prev_char)] == 'written-pre-consonant-spoken-post-consonant':
        return False, 'pre-post-vowel-on-left'
    if self.uroman.dict_str[('syllable-info', next_char)] == 'written-pre-consonant-spoken-post-consonant':
        return True, 'pre-post-vowel-on-right'
    if adj_position >= self.max_vertex:  # end of string
        return True, 'end-of-string'
    # if not self.char_is_letter_or_vowel_sign(next_char):  # end of token
    if not regex.match(r'(?:\pL|\pM)', next_char):  # end of token
        return True, 'end-of-token'
    if position > 0:
        left_edge = self.best_left_neighbor_edge(position-1)
        if left_edge and regex.search(r'[bcdfghjklmnpqrstvxz]$', left_edge.txt):
            return False, 'consonant-to-the-left'
    # Tentatively romanize the next one or two characters to see whether a
    # vowel follows; a following consonant closes the syllable.
    next_char_rom = first_non_none(self.simple_top_romanization_candidate_for_span(adj_position,
                                                                                  adj_position + 2,
                                                                                  simple_search=True),
                                   self.simple_top_romanization_candidate_for_span(adj_position,
                                                                                   adj_position + 1,
                                                                                   simple_search=True),
                                   "?")
    if not regex.match(r"[aeiou]", next_char_rom.lower()):  # followed by consonant
        return True, f'not-followed-by-vowel {next_char_rom}'
    if (next_char == '\u0E2D') and (next_char2 is not None):  # THAI CHARACTER O ANG
        next_char2_rom = first_non_none(self.simple_top_romanization_candidate_for_span(adj_position+1,
                                                                                        adj_position+2,
                                                                                        simple_search=True),
                                        "?")
        if regex.match(r"[aeiou]", next_char2_rom.lower()):
            return True, 'o-ang-followed-by-vowel'  # In that context Thai char. "o ang" is considered a consonant
    return False, 'not-at-syllable-end-by-default'
def romanization_by_first_rule(self, s) -> str | None: | |
try: | |
return self.uroman.rom_rules[s][0]['t'] | |
except IndexError: | |
return None | |
def expand_rom_with_special_chars(self, rom: str, start: int, end: int, **args) \
        -> Tuple[str, int, int, str | None]:
    """This method contains a number of special romanization heuristics that typically modify
    an existing or preliminary edge based on context.

    Returns (rom, start, end, annotation): the possibly modified romanization,
    its possibly widened span, and an annotation such as 'rom exp' (expanded)
    or 'rom del' (deleted), or None if nothing was changed.
    """
    orig_start = start
    uroman = self.uroman
    full_string = self.s
    annot = None
    if rom == '':
        return rom, start, end, None
    prev_char = (full_string[start-1] if start >= 1 else '')
    first_char = full_string[start]
    last_char = full_string[end-1]
    next_char = (full_string[end] if end < len(full_string) else '')
    # \u2820 is the Braille character indicating that the next letter is upper case
    if (prev_char == '\u2820') and regex.match(r'[a-z]', rom):
        return rom[0].upper() + rom[1:], start-1, end, 'rom exp'
    # Normalize multi-upper case THessalonike -> Thessalonike, but don't change THESSALONIKE
    if start+1 == end and rom.isupper() and next_char.islower():
        ablation = args.get('ablation', '')  # VERBOSE
        if not ('nocap' in ablation):
            rom = rom.capitalize()
    # Japanese small tsu (and Gurmukhi addak) used as consonant doubler:
    # NOTE(review): the script-name condition below compares prev_char's script
    # with itself and is therefore always True; possibly first_char's script
    # was intended — confirm before changing.
    if (prev_char and prev_char in 'っッ\u0A71') \
            and (uroman.chr_script_name(prev_char) == uroman.chr_script_name(prev_char)) \
            and (m_double_consonant := regex.match(r'(ch|[bcdfghjklmnpqrstwz])', rom)):
        # return m_double_consonant.group(1).replace('ch', 't') + rom, start-1, end, 'rom exp'
        # expansion might additional apply to the right
        if prev_char in 'っッ':  # for Japanese, per Hepburn, use tch
            rom = m_double_consonant.group(1).replace('ch', 't') + rom
        else:
            rom = m_double_consonant.group(1).replace('ch', 'c') + rom
        start = start-1
        first_char = full_string[start]
        prev_char = (full_string[start-1] if start >= 1 else '')
    # Thai
    if uroman.chr_script_name(first_char) == 'Thai':
        # A bare consonant may combine with a written-before/spoken-after
        # vowel on its left plus trailing vowel characters on its right.
        if (start+1 == end) and regex.match(r'[bcdfghjklmnpqrstvwxyz]+$', rom):
            if uroman.dict_str[('syllable-info', prev_char)] == 'written-pre-consonant-spoken-post-consonant':
                for vowel_prefix_len in [1]:
                    if vowel_prefix_len <= start:
                        for vowel_suffix_len in [3, 2, 1]:
                            if end + vowel_suffix_len <= len(full_string):
                                # Pattern with a dash placeholder for the consonant,
                                # looked up in the rule table.
                                pattern = (full_string[start-vowel_prefix_len: start]
                                           + '–'
                                           + full_string[end:end+vowel_suffix_len])
                                if uroman.rom_rules[pattern]:
                                    vowel_rom_rule = uroman.rom_rules[pattern][0]
                                    vowel_rom = vowel_rom_rule['t']
                                    # print(f"  PATTERN {pattern} ({full_string[start:end]}/{rom}) {rom}{vowel_rom}")
                                    return rom + vowel_rom, start-vowel_prefix_len, end+vowel_suffix_len, 'rom exp'
        # Simple case: pre-written vowel directly to the left of a consonant.
        if (uroman.chr_script_name(prev_char) == 'Thai') \
                and (uroman.dict_str[('syllable-info', prev_char)]
                     == 'written-pre-consonant-spoken-post-consonant') \
                and regex.match(r'[bcdfghjklmnpqrstvwxyz]', rom) \
                and (vowel_rom := self.romanization_by_first_rule(prev_char)):
            return rom + vowel_rom, start-1, end, 'rom exp'
        # THAI CHARACTER O ANG
        if (first_char == '\u0E2D') and (end - start == 1):
            prev_script = uroman.chr_script_name(prev_char)
            next_script = uroman.chr_script_name(next_char)
            prev_rom = self.find_rom_edge_path_backwards(0, start, 1, return_str=True)
            next_rom = self.romanization_by_first_rule(next_char)
            # if not recursive:
            #     lc = uroman.romanize_string(full_string[:start], lcode=self.lcode, recursive=True)
            #     rc = uroman.romanize_string(full_string[end:], lcode=self.lcode, recursive=True)
            #     print('PP', start, end, prev_script, next_script, prev_rom, next_rom, ' LC:', lc[-40:],
            #           ' RC:', rc[:40])
            # delete THAI CHARACTER O ANG unless it is surrounded on both sides by a Thai consonant
            if not ((prev_script == 'Thai') and (next_script == 'Thai')
                    and regex.match(r'[bcdfghjklmnpqrstvwxz]+$', prev_rom)
                    and regex.match(r'[bcdfghjklmnpqrstvwxz]+$', next_rom)):
                # if not recursive:
                #     print(f'* DELETE O ANG {first_char} {start}-{end} LC: {lc[-40:]} RC: {rc[:40]}')
                return '', start, end, 'rom del'
    # Coptic: consonant + grace-accent = e + consonant
    if next_char and (next_char == "\u0300") and (uroman.chr_script_name(last_char) == "Coptic")\
            and (not self.simple_top_romanization_candidate_for_span(orig_start, end+1)):
        rom = 'e' + rom
        end = end+1
        last_char = full_string[end - 1]
        next_char = (full_string[end] if end < len(full_string) else '')
        annot = 'rom exp'
    # Japanese small y: ki + small ya = kya etc.
    if (next_char and next_char in 'ゃゅょャュョ') \
            and (uroman.chr_script_name(last_char) == uroman.chr_script_name(next_char)) \
            and regex.search(r'([bcdfghjklmnpqrstvwxyz]i$)', rom) \
            and (y_rom := self.romanization_by_first_rule(next_char)) \
            and (not self.simple_top_romanization_candidate_for_span(orig_start, end+1)) \
            and (not self.simple_top_romanization_candidate_for_span(start, end+1)):
        rom = rom[:-1] + y_rom
        end = end+1
        last_char = full_string[end - 1]
        next_char = (full_string[end] if end < len(full_string) else '')
        annot = 'rom exp'
    # Japanese vowel lengthener (U+30FC)
    last_rom_char = last_chr(rom)
    if (next_char == 'ー') \
            and (uroman.chr_script_name(last_char) in ('Hiragana', 'Katakana')) \
            and (last_rom_char in 'aeiou'):
        return rom + last_rom_char, start, end+1, 'rom exp'
    # Virama (in Indian languages)
    if self.uroman.dict_bool[('is-virama', next_char)]:
        return rom, start, end + 1, "rom exp"
    # Trim a leading/trailing space that would be redundant in context.
    if rom.startswith(' ') and ((start == 0) or (prev_char == ' ')):
        rom = rom[1:]
    # NOTE(review): 'end == len(full_string)+1' looks off by one (an edge end
    # should not exceed len(full_string)) — confirm intended condition.
    if rom.endswith(' ') and ((end == len(full_string)+1) or (next_char == ' ')):
        rom = rom[:-1]
    return rom, start, end, annot
def prep_braille(self, **_args) -> None:
    """Mark positions made upper case by Braille capitalization indicators.
    Two consecutive U+2820 cells switch all-caps mode on for the following
    word; a Braille space (U+2800) switches it off. Marked positions are
    recorded in self.props under ('is-upper', i)."""
    if not self.contains_script['Braille']:
        return
    dots6 = '\u2820'  # characters in following word are upper case
    all_caps = False
    for i, c in enumerate(self.s):
        if i >= 1 and self.s[i-1] == dots6 and c == dots6:
            all_caps = True
        elif all_caps:
            if c in '\u2800':  # Braille space
                all_caps = False
            else:
                self.props[('is-upper', i)] = True
def pick_tibetan_vowel_edge(self, **args) -> None:
    """For each Tibetan syllable, decide which letter position carries the
    (possibly implicit) vowel, recording decisions in self.props under
    ('edge-vowel', i) (True/False) and ('edge-delete', i)."""
    if not self.contains_script['Tibetan']:
        return None
    verbose = bool(args.get('verbose'))
    s = self.s
    uroman = self.uroman
    tibetan_syllable = []
    tibetan_letter_positions = []
    # Collect maximal runs of Tibetan letters/vowel signs; each run is
    # treated as one syllable.
    for start in range(self.max_vertex):
        c = s[start]
        if (uroman.chr_script_name(c) == 'Tibetan') and self.char_is_letter_or_vowel_sign(c):
            tibetan_letter_positions.append(start)
        else:
            if tibetan_letter_positions:
                tibetan_syllable.append(tibetan_letter_positions)
                tibetan_letter_positions = []
    if tibetan_letter_positions:
        tibetan_syllable.append(tibetan_letter_positions)
    for tibetan_letter_positions in tibetan_syllable:
        vowel_pos = None
        orig_txt = ''
        roms = []
        subjoined_letter_positions = []
        first_letter_position = tibetan_letter_positions[0]
        # Pass 1: classify each letter; an explicit vowel (sign or vowel
        # romanization) fixes the vowel position directly.
        for i in tibetan_letter_positions:
            c = s[i]
            orig_txt += c
            rom = first_non_none(self.simple_top_romanization_candidate_for_span(i, i+1), "?")
            self.props[('edge-vowel', i)] = None
            if self.char_is_vowel_sign(c) or (rom and regex.match(r"[aeiou]+$", rom)):
                vowel_pos = i
                self.props[('edge-vowel', i)] = True
                # delete any syllable initial ' before vowel
                if roms == ["'"]:
                    self.props[('edge-delete', i-1)] = True
            elif self.char_is_subjoined_letter(c):
                subjoined_letter_positions.append(i)
                if i > first_letter_position:
                    if c == "\u0FB0":
                        vowel_pos = i-1
                        self.props[('edge-vowel', i-1)] = True
                    else:
                        self.props[('edge-vowel', i-1)] = False
                # Strip the inherent 'a' from the consonant's romanization.
                rom = regex.sub(r'([bcdfghjklmnpqrstvwxyz].*)a$', r'\1', rom)
            elif c == "\u0F60":  # Tibetan letter -a (')
                self.props[('edge-vowel', i)] = False
                if i > first_letter_position:
                    vowel_pos = i-1
                    self.props[('edge-vowel', i-1)] = True
                    if i == tibetan_letter_positions[-1]:
                        self.props[('edge-delete', i)] = True
                if roms and not (roms[-1] in "aeiou"):
                    rom = "a'"
                else:
                    rom = "'"
            else:
                rom = regex.sub(r'([bcdfghjklmnpqrstvwxyz].*)a$', r'\1', rom)
            roms.append(rom)
        if vowel_pos is not None:
            # An explicit vowel was found: all undecided positions are non-vowels.
            for i in tibetan_letter_positions:
                if self.props.get(('edge-vowel', i)) is None:
                    self.props[('edge-vowel', i)] = False
        else:
            # No explicit vowel: score every candidate position for an
            # implicit 'a' and pick the cheapest.
            best_cost, best_vowel_pos, best_pre, best_post = math.inf, None, None, None
            n_letters = len(tibetan_letter_positions)
            for i in tibetan_letter_positions:
                rel_pos = i - first_letter_position
                pre, post = ''.join(roms[:rel_pos+1]), ''.join(roms[rel_pos+1:])
                if self.props.get(('edge-vowel', i)) is False:
                    cost = 20
                    if cost < best_cost:
                        best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post
                elif n_letters == 1:
                    cost = 0
                    if cost < best_cost:
                        best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post
                elif n_letters == 2:
                    # NOTE(review): compares the absolute position i with 0;
                    # rel_pos may have been intended — confirm.
                    cost = 0 if i == 0 else 0.1
                    if cost < best_cost:
                        best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post
                else:
                    # Whitelists of plausible syllable-final and syllable-initial
                    # consonant clusters (romanized).
                    # NOTE(review): 'el' in the prefix alternation below looks
                    # like a typo for 'rl' — confirm against Tibetan cluster data.
                    good_suffix = regex.match(r"(?:|[bcdfghjklmnpqrstvwxz]|bh|bs|ch|cs|dd|ddh|"
                                              r"dh|dz|dzh|gh|gr|gs|kh|khs|kss|n|nn|nt|ms|ng|ngs|ns|ph|"
                                              r"rm|sh|ss|th|ts|tsh|tt|tth|zh|zhs)'?$", post)
                    good_prefix = regex.match(r"'?(?:.|bd|br|brg|brgy|bs|bsh|bst|bt|bts|by|bz|bzh|"
                                              r"ch|db|dby|dk|dm|dp|dpy|dr|"
                                              r"gl|gn|gr|gs|gt|gy|gzh|kh|khr|khy|kr|ky|ld|lh|lt|mkh|mny|mth|mtsh|"
                                              r"ny|ph|phr|phy|rgy|rk|el|rn|rny|rt|rts|"
                                              r"sk|skr|sky|sl|sm|sn|sny|sp|spy|sr|st|th|ts|tsh)$", pre)
                    subjoined_suffix = all([x in subjoined_letter_positions
                                            for x in tibetan_letter_positions[rel_pos+2:]])
                    # print('GOOD', good_suffix, good_prefix, subjoined_suffix, f'{pre}a{post}',
                    #       subjoined_letter_positions, tibetan_letter_positions[rel_pos+2:])
                    if good_suffix and good_prefix:
                        cost = len(pre) * 0.1
                    elif good_suffix:
                        cost = len(pre)
                    elif subjoined_suffix and good_prefix:
                        cost = len(pre) * 0.3
                    elif subjoined_suffix:
                        cost = len(pre) * 0.5
                    else:
                        cost = math.inf
                    if cost < best_cost:
                        best_cost, best_vowel_pos, best_pre, best_post = cost, i, pre, post
            if best_vowel_pos is not None:
                for i in tibetan_letter_positions:
                    if self.props.get(('edge-vowel', i)) is None:
                        value = (i == best_vowel_pos)
                        self.props[('edge-vowel', i)] = value
            if verbose:
                best_cost = best_cost if isinstance(best_cost, int) else round(best_cost, 2)
                sys.stderr.write(f'Tib. best cost: "{best_pre}a{best_post}" o:{orig_txt} c:{round(best_cost, 2)}'
                                 f' p:{best_vowel_pos} {tibetan_letter_positions}\n')
def add_default_abugida_vowel(self, rom: str, start: int, end: int, annotation: str = '') -> str:
    """Adds an abugida vowel (e.g. "a") where needed. Important for many languages in South Asia.

    Given a consonant(-cluster) romanization rom for the span [start, end) of self.s,
    decide whether the script's inherent (default) vowel must be appended, based on the
    surrounding characters (vowel signs, viramas, subjoined letters, word boundaries).
    Returns the possibly modified romanization; on any unexpected error the input rom
    is returned unchanged (deliberate best-effort behavior).
    """
    uroman = self.uroman
    s = self.s
    try:
        first_s_char = s[start]
        last_s_char = s[end-1]
        script_name = uroman.chr_script_name(first_s_char)
        script = self.uroman.scripts[script_name.lower()]
        # Only abugida scripts declare default vowels; others need no change.
        if not (abugida_default_vowels := script['abugida-default-vowels']):
            return rom
        # Cache maps (script, rom) -> (base consonant rom, base+vowel rom, normalized rom).
        key = (script, rom)
        if key in uroman.abugida_cache:
            base_rom, base_rom_plus_vowel, mod_rom = uroman.abugida_cache[key]
            rom = mod_rom
        else:
            vowels_regex1 = '|'.join(abugida_default_vowels)  # e.g. 'a' or 'a|o'
            vowels_regex2 = '|'.join(map(lambda x: x + '+', abugida_default_vowels))  # e.g. 'a+' or 'a+|o+'
            # Case 1: rom already ends in (repeated) default vowel after a y-cluster.
            if m := regex.match(fr'([cfghkmnqrstxy]?y)({vowels_regex2})-?$', rom):
                base_rom = m.group(1)
                base_rom_plus_vowel = base_rom + m.group(2)
            # Case 2: rom is consonant(s) + a single default vowel (optionally trailing '-').
            elif m := regex.match(fr'([bcdfghjklmnpqrstvwxyz]+)({vowels_regex1})-?$', rom):
                base_rom = m.group(1)
                base_rom_plus_vowel = base_rom + m.group(2)
                if rom.endswith('-') and (start+1 == end) and rom[0].isalpha():
                    rom = rom[:-1]
            # Case 3: rom is taken as the bare base; synthesize base+default-vowel.
            else:
                base_rom = rom
                base_rom_plus_vowel = base_rom + abugida_default_vowels[0]
                # Only pure consonant strings (or Tibetan apostrophe) qualify as a base.
                if (not regex.match(r"[bcdfghjklmnpqrstvwxyz]+$", base_rom)
                        and (not ((script_name == 'Tibetan') and (base_rom == "'")))):
                    base_rom, base_rom_plus_vowel = None, None
            uroman.abugida_cache[key] = (base_rom, base_rom_plus_vowel, rom)
        if base_rom is None:
            return rom
        # Syllable-tail romanizations (annotation 'rom tail') never take the vowel.
        if 'tail' in annotation:
            return rom
        prev_s_char = s[start-1] if start >= 1 else ''
        next_s_char = s[end] if len(s) > end else ''
        next2_s_char = s[end+1] if len(s) > end+1 else ''
        # Tibetan vowel placement was precomputed into self.props ('edge-vowel'/'edge-delete').
        if script_name == 'Tibetan':
            if self.props.get(('edge-delete', start)):
                return ''
            elif self.props.get(('edge-vowel', start)):
                return base_rom_plus_vowel
            else:
                return base_rom
        if (next_s_char and ((base_rom in "bcdfghklmnpqrstvwz") or (base_rom in ["ng"]))
                and (next_s_char in "យ")):  # Khmer yo
            return base_rom
        # An explicit vowel sign, medial-consonant sign, or subjoined letter
        # on the next character suppresses the inherent vowel.
        if self.uroman.dict_bool[('is-vowel-sign', next_s_char)]:
            return base_rom
        if self.uroman.dict_bool[('is-medial-consonant-sign', next_s_char)]:
            return base_rom
        if self.char_is_subjoined_letter(next_s_char):
            return base_rom
        if self.uroman.char_is_nonspacing_mark(next_s_char) \
                and self.uroman.dict_bool[('is-vowel-sign', next2_s_char)]:
            return base_rom
        # A virama (vowel killer) after the consonant suppresses the vowel ...
        if self.uroman.dict_bool[('is-virama', next_s_char)]:
            return base_rom
        if self.uroman.char_is_nonspacing_mark(next_s_char) \
                and self.uroman.dict_bool[('is-virama', next2_s_char)]:
            return base_rom
        # ... while a virama *before* it means this consonant carries the vowel.
        if self.uroman.dict_bool[('is-virama', prev_s_char)]:
            return base_rom_plus_vowel
        if self.is_at_start_of_word(start) and not regex.search('r[aeiou]', rom):
            return base_rom_plus_vowel
        # delete many final schwas from most Devanagari languages (except: Sanskrit)
        if self.is_at_end_of_word(end):
            if (script_name in ("Devanagari",)) and (self.lcode not in ('san',)):  # Sanskrit
                return rom
            else:
                return base_rom_plus_vowel
        if uroman.chr_script_name(prev_s_char) != script_name:
            return base_rom_plus_vowel
        if 'VOCALIC' in self.uroman.chr_name(last_s_char):
            return base_rom
        if uroman.chr_script_name(next_s_char) == script_name:
            return base_rom_plus_vowel
    except Exception:
        # Best-effort: index/key errors on unusual input leave rom unchanged.
        return rom
    else:
        pass
    # print('ABUGIDA', rom, start, script_name, script, abugida_default_vowels, prev_s_char, next_s_char)
    return rom
def cand_is_valid(self, rom_rule: RomRule, start: int, end: int, rom: str) -> bool:
    """Decide whether rom_rule may be applied to the span [start, end) with
    romanization rom.

    A rule is rejected when rom is missing, when one of its positional
    restrictions (start/end/whole word) is violated, or when the rule is
    limited to languages that do not include the current one.
    """
    if rom is None:
        return False
    # Positional restrictions; each word-boundary test only runs when
    # the rule actually declares the corresponding restriction.
    if rom_rule['dont-use-at-start-of-word']:
        if self.is_at_start_of_word(start):
            return False
    if rom_rule['use-only-at-start-of-word']:
        if not self.is_at_start_of_word(start):
            return False
    if rom_rule['dont-use-at-end-of-word']:
        if self.is_at_end_of_word(end):
            return False
    if rom_rule['use-only-at-end-of-word']:
        if not self.is_at_end_of_word(end):
            return False
    if rom_rule['use-only-for-whole-word']:
        if not (self.is_at_start_of_word(start) and self.is_at_end_of_word(end)):
            return False
    # Language restriction, if any.
    lcodes = rom_rule['lcodes']
    if lcodes and self.lcode not in lcodes:
        return False
    return True
# @profile | |
def simple_sorted_romanization_candidates_for_span(self, start, end) -> List[str]:
    """Return all valid romanizations for the span [start, end),
    most restricted (i.e. most specific) rules first."""
    span_txt = self.s[start:end]
    # Fast pre-check: the span must be a known romanization-rule prefix.
    if not self.uroman.dict_bool[('s-prefix', span_txt)]:
        return []
    scored = [(rule['n-restr'] or 0, rule['t'])
              for rule in self.uroman.rom_rules[span_txt]
              if self.cand_is_valid(rule, start, end, rule['t'])]
    return [txt for _, txt in sorted(scored, reverse=True)]
def simple_top_romanization_candidate_for_span(self, start, end, simple_search: bool = False) -> str | None:
    """Return the single best romanization for the span [start, end), or None.

    Among all valid rules the one with the most restrictions wins (first such
    rule on a tie). Unless simple_search is set, the result may be overridden
    by the rule's end-of-syllable variant and is cached per span.
    """
    if (start < 0) or (end > self.max_vertex):
        return None
    key = (start, end)
    cached = self.simple_top_rom_cache.get(key)
    if cached is not None:
        return cached
    top_rule, top_n_restr = None, None
    for rule in self.uroman.rom_rules[self.s[start:end]]:
        if not self.cand_is_valid(rule, start, end, rule['t']):
            continue
        n_restr = rule['n-restr'] or 0
        # Strict '>' keeps the first rule on ties.
        if (top_n_restr is None) or (n_restr > top_n_restr):
            top_rule, top_n_restr = rule, n_restr
    top_cand = top_rule['t'] if top_rule else None
    if simple_search:
        # Plain lookup: no end-of-syllable adjustment, no caching.
        return top_cand
    if top_rule:
        eos_variant = top_rule['t-at-end-of-syllable']
        if eos_variant is not None:
            at_eos, _rationale = self.is_at_end_of_syllable(end)
            if at_eos:
                top_cand = eos_variant
    self.simple_top_rom_cache[key] = top_cand
    return top_cand
def decomp_rom(self, char_position: int) -> str | None:
    """Romanize a character through its Unicode decomposition, if suitable.
    Input: decomposable character such as ﻼ or ½
    Output: la or 1/2"""
    s = self.s
    c = s[char_position]
    decomposition = ud.decomposition(c)
    if not decomposition:
        return None
    format_elems, unparsed_elems, mapped = [], [], ''
    for elem in decomposition.split():
        if elem.startswith("<"):
            format_elems.append(elem)  # formatting tag such as <fraction>
            continue
        try:
            mapped += chr(int(elem, 16))
        except ValueError:
            unparsed_elems.append(elem)
    rom = None
    # Only decompositions with a usable formatting tag and fully parsed
    # code points are romanized (e.g. <fraction>, but not <super>/<sub>).
    if (mapped and (not unparsed_elems) and format_elems
            and (format_elems[0] not in ('<super>', '<sub>', '<noBreak>', '<compat>'))):
        rom = self.uroman.romanize_string(mapped, self.lcode)
    # make sure to add a space for 23½ -> 23 1/2
    if rom and ud.numeric(c, None):
        rom = rom.replace('⁄', '/')
        if char_position >= 1 and ud.numeric(s[char_position-1], None):
            rom = ' ' + rom
        if (char_position+1 < len(s)) and ud.numeric(s[char_position+1], None):
            rom += ' '
    return rom
def add_romanization(self, **args):
    """Adds a romanization edge to the romanization lattice.

    For every span [start, end) whose surface string is a known rule prefix,
    the best simple romanization candidate is added (after abugida-vowel and
    special-character post-processing). Single Hangul characters and Unicode
    decompositions get additional edges.
    """
    # Bug fix: the original passed recursive=args.get('recursive', False)
    # *and* **args to expand_rom_with_special_chars, which raises
    # "TypeError: got multiple values for keyword argument 'recursive'"
    # whenever the caller actually supplies 'recursive'. Pop it once instead.
    recursive = args.pop('recursive', False)
    for start in range(self.max_vertex):
        for end in range(start+1, self.max_vertex+1):
            # Stop extending the span once it can no longer be a rule prefix.
            if not self.uroman.dict_bool[('s-prefix', self.s[start:end])]:
                break
            if (rom := self.simple_top_romanization_candidate_for_span(start, end)) is not None:
                # Braille letters carry no case; restore recorded upper case.
                if self.contains_script['Braille'] and (start+1 == end):
                    if self.props.get(('is-upper', start)):
                        rom = rom.upper()
                edge_annotation = 'rom'
                # A leading '+' marks a syllable-tail romanization (e.g. '+ng').
                if regex.match(r'\+(m|ng|n|h|r)', rom):
                    rom, edge_annotation = rom[1:], 'rom tail'
                rom = self.add_default_abugida_vowel(rom, start, end, annotation=edge_annotation)
                rom, start2, end2, exp_edge_annotation \
                    = self.expand_rom_with_special_chars(rom, start, end, annotation=edge_annotation,
                                                         recursive=recursive, **args)
                edge_annotation = exp_edge_annotation or edge_annotation
                self.add_edge(Edge(start2, end2, rom, edge_annotation))
        if start < len(self.s):
            char = self.s[start]
            cp = ord(char)
            # Korean Hangul characters
            if 0xAC00 <= cp <= 0xD7A3:
                if rom := self.uroman.unicode_hangul_romanization(char):
                    self.add_edge(Edge(start, start+1, rom, 'rom'))
            # character decomposition
            if rom_decomp := self.decomp_rom(start):
                self.add_edge(Edge(start, start + 1, rom_decomp, 'rom decomp'))
def update_edge_list(edges, new_edge, old_edges) -> List[NumEdge]: | |
new_edge_not_yet_added = True | |
result = [] | |
for edge in edges: | |
if edge in old_edges: | |
edge.active = False | |
if new_edge_not_yet_added: | |
result.append(new_edge) | |
new_edge_not_yet_added = False | |
else: | |
result.append(edge) | |
if new_edge_not_yet_added: | |
result.append(new_edge) | |
return result | |
@staticmethod
def edge_is_digit(edge: Edge | None) -> bool:
    """True iff edge is a single-character NumEdge whose value is a single
    decimal digit (0-9).

    Bug fix: decorated as @staticmethod because callers invoke it as
    self.edge_is_digit(edge); without the decorator the bound call would
    pass self as an extra argument and raise a TypeError.
    """
    return (isinstance(edge, NumEdge)
            and (edge.value is not None)
            and isinstance(edge.value, int)
            and (edge.type == 'digit')
            and (0 <= edge.value <= 9)
            and (edge.end - edge.start == 1))
@staticmethod
def is_gap_null_edge(edge: Edge) -> bool:
    """True iff edge is a NumEdge for one of the zero characters '零'/'〇'
    (used as a gap marker when combining number blocks).

    Bug fix: decorated as @staticmethod because callers invoke it as
    self.is_gap_null_edge(edge); without the decorator the bound call would
    pass self as an extra argument and raise a TypeError.
    """
    return isinstance(edge, NumEdge) and (edge.orig_txt in ('零', '〇'))
def braille_digit(char: str) -> str | None: | |
position = '\u281A\u2801\u2803\u2809\u2819\u2811\u280B\u281B\u2813\u280A'.find(char) # Braille 0-9 | |
return str(position) if position >= 0 else None | |
def add_braille_number(self, start: int, end: int, txt: str, **_args) -> None:
    """Add a NumEdge of type 'number' for a Braille number spanning [start, end)."""
    number_edge = NumEdge(start, end, txt, self.uroman)
    number_edge.type = 'number'
    self.add_edge(number_edge)
def add_braille_numbers(self, **_args):
    """Scan a Braille string for number sequences (introduced by the Braille
    number mark U+283C) and add one number edge per sequence found."""
    if not self.contains_script['Braille']:
        return
    text = self.s
    digits, num_start = '', None
    for pos, char in enumerate(text):
        if char == '\u283C':  # number mark
            if num_start is None:
                num_start = pos
        elif num_start is None:
            continue  # not inside a number
        elif digit := self.braille_digit(char):
            digits += digit
        elif char == '\u2832':  # period
            digits += '.'
        elif char == '\u2802':  # comma
            digits += ','
        else:
            # Non-numeric cell ends the current number.
            self.add_braille_number(num_start, pos, digits)
            digits, num_start = '', None
    if num_start is not None:
        self.add_braille_number(num_start, len(text), digits)
def add_numbers(self, uroman, **args):
    """Adds a numerical romanization edge to the romanization lattice, currently just for digits.
    To be significantly expanded to cover complex Chinese, Egyptian, Amharic numbers.

    Stages (each may merge previously built NumEdges into larger ones):
      seed: one NumEdge per character with known numeric properties
      D1:   digit sequences, incl. a decimal point (1234, 12.5)
      G1:   single digit * power unit (2*100=200, 3*10=30)
      G2:   sums of such blocks within a power block (200+30+4=234)
      G3:   block * large power (234*1000=234000)
      G4:   sums of large-power blocks (234000+567=234567)
      F1:   spacing/separator cushioning; then deactivation of implausible
            single-character readings; finally plain digit fallback edges.
    """
    verbose = bool(args.get('verbose'))
    s = self.s
    num_edges = []
    # Seed: one NumEdge per character that has numeric properties.
    for start in range(len(s)):
        char = s[start]
        if uroman.num_props[char]:
            new_edge = NumEdge(start, start + 1, char, uroman)
            num_edges.append(new_edge)
            if verbose:
                print('NumEdge', new_edge)
            self.add_edge(new_edge)
    # D1 sequence of digits 1234
    for edge in num_edges:
        if self.edge_is_digit(edge) and edge.active:  # and (edge.value != 0):
            n_decimal_points = 0
            n_decimals = None  # number of digits seen after the decimal point
            new_value_s = str(edge.value)
            sub_edges = [edge]
            prev_edge = edge
            while True:
                right_edge = self.best_right_neighbor_edge(prev_edge.end)
                if self.edge_is_digit(right_edge):
                    # Extend the digit sequence by one digit.
                    sub_edges.append(right_edge)
                    new_value_s += str(right_edge.value)
                    if n_decimals is not None:
                        n_decimals += 1
                    prev_edge = right_edge
                elif ((prev_edge.end < len(s)) and (s[prev_edge.end] == '.') and (n_decimal_points == 0)
                        and (right_edge2 := self.best_right_neighbor_edge(prev_edge.end + 1))
                        and self.edge_is_digit(right_edge2)):
                    # A single '.' followed by a digit: treat as decimal point.
                    if right_edge is None:
                        right_edge = Edge(prev_edge.end, prev_edge.end+1, s[prev_edge.end],
                                          'decimal period')
                        self.add_edge(right_edge)
                    sub_edges.append(right_edge)
                    sub_edges.append(right_edge2)
                    new_value_s += '.' + str(right_edge2.value)
                    n_decimal_points += 1
                    n_decimals = 1
                    prev_edge = right_edge2
                else:
                    break
            if len(sub_edges) >= 2:
                new_value = float(new_value_s) if '.' in new_value_s else int(new_value_s)
                new_edge = NumEdge(sub_edges[0].start, sub_edges[-1].end, str(new_value), uroman, active=True)
                new_edge.update(value=new_value, value_s=new_value_s, n_decimals=n_decimals, num_base=1,
                                e_type='D1', script=sub_edges[-1].script)
                self.add_edge(new_edge)
                num_edges = self.update_edge_list(num_edges, new_edge, sub_edges)
                if verbose:
                    print(new_edge.type, new_edge)
    # G1 combine (*) "single digits" 2*100=200, 3*10= 30
    for edge in num_edges:
        if (isinstance(edge, NumEdge) and edge.active and (edge.num_base == 1)
                and isinstance(edge.value, int) and (edge.value >= 1)):
            right_edge = self.best_right_neighbor_edge(edge.end, skip_num_edge=False)
            # Right neighbor must be a small power unit (10, 100, ...), not a large power.
            if (right_edge
                    and isinstance(right_edge, NumEdge)
                    and right_edge.active
                    and isinstance(right_edge.value, int)
                    and (right_edge.num_base > 1)
                    and (not right_edge.is_large_power)):
                new_value = edge.value * right_edge.value
                new_edge = NumEdge(edge.start, right_edge.end, str(new_value), uroman, active=True)
                new_edge.update(value=new_value, num_base=right_edge.num_base, e_type='G1',
                                orig_txt=edge.orig_txt + right_edge.orig_txt,
                                script=right_edge.script)
                self.add_edge(new_edge)
                num_edges = self.update_edge_list(num_edges, new_edge, [edge, right_edge])
                if verbose:
                    print(new_edge.type, new_edge)
    # G2 combine (+) G1 "single digits" 200+30+4=234 (within larger blocks of 1000, 1000000)
    for edge in num_edges:
        if isinstance(edge, NumEdge) and edge.active and isinstance(edge.value, int) and not edge.is_large_power:
            sub_edges = [edge]
            prev_edge = edge
            prev_non_edge = edge  # None if (edge.orig_txt in '零') else prev_edge
            # Extend rightward while values strictly decrease in magnitude
            # (or a zero gap marker bridges the sequence).
            while (prev_edge
                    and (right_edge := self.best_right_neighbor_edge(prev_edge.end, skip_num_edge=False))
                    and isinstance(right_edge, NumEdge)
                    and right_edge.active
                    and isinstance(right_edge.value, int)
                    and (not right_edge.is_large_power)
                    and (self.is_gap_null_edge(prev_non_edge)
                         or ((prev_non_edge.num_base > right_edge.value)
                             and (prev_non_edge.num_base > right_edge.num_base)))):
                sub_edges.append(right_edge)
                prev_edge = right_edge
                if not self.is_gap_null_edge(right_edge):
                    prev_non_edge = right_edge  # last non-gap edge drives the magnitude check
            if len(sub_edges) >= 2:
                new_value = sum([e.value for e in sub_edges])
                new_edge = NumEdge(sub_edges[0].start, sub_edges[-1].end, str(new_value), uroman, active=True)
                new_edge.update(value=new_value, num_base=sub_edges[-1].num_base, e_type='G2',
                                orig_txt=''.join([e.orig_txt for e in sub_edges]),
                                script=sub_edges[-1].script)
                self.add_edge(new_edge)
                num_edges = self.update_edge_list(num_edges, new_edge, sub_edges)
                new_edge.type = 'G2'
                if verbose:
                    print(new_edge.type, new_edge)
    # G3 combine (*) G2 blocks with large powers, e.g. 234*1000 = 234000
    for edge in num_edges:
        if (isinstance(edge, NumEdge) and edge.active and (not edge.is_large_power)
                and (isinstance(edge.value, int) or isinstance(edge.value, float))):
            right_edge = self.best_right_neighbor_edge(edge.end, skip_num_edge=False)
            if (right_edge
                    and isinstance(right_edge, NumEdge)
                    and right_edge.active
                    and isinstance(right_edge.value, int)
                    and (right_edge.num_base > 1)
                    and right_edge.is_large_power):
                # round(...) guards against float artifacts (e.g. 2.5 * 1000).
                new_value = round(edge.value * right_edge.value, 5)
                if isinstance(new_value, float) and new_value.is_integer():
                    new_value = int(new_value)
                new_edge = NumEdge(edge.start, right_edge.end, str(new_value), uroman, active=True)
                new_edge.update(value=new_value, num_base=right_edge.num_base, e_type='G3',
                                orig_txt=edge.orig_txt + right_edge.orig_txt,
                                script=right_edge.script)
                self.add_edge(new_edge)
                num_edges = self.update_edge_list(num_edges, new_edge, [edge, right_edge])
                if verbose:
                    print(new_edge.type, new_edge)
    # G4 combine (+) G3 blocks 234000+567=234567
    for edge in num_edges:
        if isinstance(edge, NumEdge) and edge.active and isinstance(edge.value, int):
            sub_edges = [edge]
            while ((prev_edge := sub_edges[-1])
                    and (right_edge := self.best_right_neighbor_edge(prev_edge.end, skip_num_edge=False))
                    and isinstance(right_edge, NumEdge)
                    and right_edge.active
                    and isinstance(right_edge.value, int)
                    and (prev_edge.num_base > right_edge.value)
                    and (prev_edge.num_base > right_edge.num_base)):
                # CJK "digit tag" pattern: a bare trailing digit after a large
                # power block implies the next lower power (e.g. 1000 then 5
                # reads as 5*100); rewrite the right edge in place.
                if ((prev_edge.script == 'CJK')
                        and (prev_edge.num_base >= 1000)
                        and ('tag' not in prev_edge.type)
                        and regex.match('10+$', str(prev_edge.num_base))
                        and (1 <= right_edge.value <= 9)
                        and (right_edge.start + 1 == right_edge.end)):
                    new_num_base = prev_edge.num_base // 10
                    new_value = new_num_base * right_edge.value
                    # print('DIGIT TAG', prev_edge, right_edge, new_value)
                    right_edge.value = new_value
                    right_edge.num_base = new_num_base
                    right_edge.type = 'G4tag'
                sub_edges.append(right_edge)
            if len(sub_edges) >= 2:
                new_value = sum([e.value for e in sub_edges])
                new_edge = NumEdge(sub_edges[0].start, sub_edges[-1].end, str(new_value), uroman, active=True)
                new_edge.update(value=new_value, num_base=sub_edges[-1].num_base, e_type='G4',
                                orig_txt=''.join([e.orig_txt for e in sub_edges]),
                                script=sub_edges[-1].script)
                self.add_edge(new_edge)
                num_edges = self.update_edge_list(num_edges, new_edge, sub_edges)
                if verbose:
                    print(new_edge.type, new_edge)
    # F1
    for edge in num_edges:
        # cushion fractions with spaces as needed: e.g. 23½ -> 23 1/2 or 十一五 -> 11 5
        if isinstance(edge, NumEdge) and regex.match(r'\d', edge.txt):
            left_edge = self.best_left_neighbor_edge(edge.start)
            if left_edge and regex.search(r'\d$', left_edge.txt):
                if edge.fraction:
                    sep = ' '
                else:
                    sep = '·'  # middle dot separates adjacent non-fraction numbers
                edge.txt = sep + edge.txt
    # Deactivate implausible readings: a single character with a value > 1000,
    # and certain CJK numeral variants (financial/alternate forms).
    for edge in num_edges:
        if (isinstance(edge, NumEdge) and edge.active and (edge.value is not None)
                and (((edge.value > 1000) and (edge.start + 1 == edge.end))
                     or (edge.orig_txt in '兩參参伍陆陸什')
                     or (edge.orig_txt in ('京兆', )))):
            edge.active = False
    if verbose:  # or (num_edges and any([e.type in ['G1', 'G2', 'G3', 'G4'] for e in num_edges])):
        if num_edges:
            print('actives:')
            for num_edge in num_edges:
                print(num_edge)
    # Fallback: plain 'num' edges for digit characters not already covered
    # by a NumEdge; other numeric characters are only counted in the stats.
    for start in range(len(s)):
        start_char = s[start]
        if (best_edge := self.best_edge_in_span(start, start+1)) and isinstance(best_edge, NumEdge):
            continue
        if (num := ud_numeric(start_char)) is not None:
            name = self.uroman.chr_name(start_char)
            if ("DIGIT" in name) and isinstance(num, int) and (0 <= num <= 9):
                # if start_char not in '0123456789': print('DIGIT', s[start], num, name)
                self.add_edge(Edge(start, start + 1, str(num), 'num'))
            else:
                uroman.stats[('*NUM', start_char, num)] += 1
def add_rom_fall_back_singles(self, **_args):
    """For characters in the original string not covered by romanizations and numbers,
    add a fallback edge based on type, romanization of single char, or original char."""
    for pos in range(self.max_vertex):
        if self.lattice[(pos, pos+1)]:
            continue  # span already has an edge
        char = self.s[pos]
        rom, annotation = char, 'orig'
        if self.uroman.char_is_nonspacing_mark(char):
            rom, annotation = '', 'Mn'
        elif self.uroman.char_is_format_char(char):  # e.g. zero-width non-joiner, zero-width joiner
            rom, annotation = '', 'Cf'
        elif ud.category(char) == 'Co':  # private-use character
            rom, annotation = '', 'Co'
        elif char == ' ':
            annotation = 'orig'
        # elif self.uroman.char_is_space_separator(rom):
        #     rom, edge_annotation = ' ', 'Zs'
        elif (single_rom := self.simple_top_romanization_candidate_for_span(pos, pos+1)) is not None:
            rom = single_rom
            # Strip the syllable-tail marker '+' if present.
            if regex.match(r'\+(m|ng|n|h|r)', rom):
                rom = rom[1:]
            annotation = 'rom single'
        # else the original values still hold: rom, annotation = char, 'orig'
        self.add_edge(Edge(pos, pos+1, rom, annotation))
@staticmethod
def add_new_edge(old_edges: List[Edge], start: int, end: int, new_rom: str, new_type: str, position: int | None,
                 old_edge_dict: dict) \
        -> None:
    """Add a new (alternative) edge to old_edges unless an equivalent edge
    (same span and romanization) is already present.

    The edge is inserted right after index position (appended when position
    is None); old_edge_dict is updated so duplicates are not added later.

    Bug fix: decorated as @staticmethod because callers invoke it as
    self.add_new_edge(old_edges, start, end, rom, type, position, dict)
    with exactly these seven arguments; without the decorator the bound
    call would pass self as an eighth argument and raise a TypeError.
    """
    if (start, end, new_rom) not in old_edge_dict:
        new_edge = Edge(start, end, new_rom, new_type)
        if position is None:
            old_edges.append(new_edge)
        else:
            old_edges.insert(position + 1, new_edge)
        old_edge_dict[(start, end, new_rom)] = new_edge
        # print(f'    ALT  {start}-{end} {new_rom}')
def add_alternatives(self, old_edges: List[Edge]) -> None:
    """For each non-alternative edge in old_edges, add alternative romanization
    edges ('rom-alt*') licensed by the matching rules: explicit alternatives
    (t-alts), the end-of-syllable variant, and the base form for an edge that
    already carries the end-of-syllable variant. New edges are inserted right
    after the edge they are alternatives of."""
    old_edge_dict = {}
    for old_edge in old_edges:
        old_edge_dict[(old_edge.start, old_edge.end, old_edge.txt)] = old_edge
    # Note: old_edges may grow while we iterate; enumerate continues into the
    # inserted elements, which are skipped by the 'rom-alt' test below.
    for position, old_edge in enumerate(old_edges):
        if old_edge.type.startswith('rom-alt'):
            continue  # an alternative added in this very loop
        start, end = old_edge.start, old_edge.end
        orig_s = self.s[start:end]
        old_rom = old_edge.txt
        for rom_rule in self.uroman.rom_rules[orig_s]:
            rom_t = rom_rule['t']
            if self.cand_is_valid(rom_rule, start, end, rom_t):
                rom_alts = rom_rule['t-alts']
                rom_eosyl = rom_rule['t-at-end-of-syllable']
                if (rom_t == old_rom) and rom_alts:
                    for rom_alt in rom_alts:
                        self.add_new_edge(old_edges, start, end, rom_alt, 'rom-alt', position,
                                          old_edge_dict)
                if (rom_t == old_rom) and rom_eosyl:
                    # Bug fix: this branch previously added rom_t, which is always
                    # a duplicate of the existing edge (old_edge_dict was seeded
                    # with (start, end, old_rom)), making the branch a no-op.
                    # The intended alternative is the end-of-syllable variant.
                    self.add_new_edge(old_edges, start, end, rom_eosyl, 'rom-alt2', position, old_edge_dict)
                if rom_eosyl == old_rom:
                    self.add_new_edge(old_edges, start, end, rom_t, 'rom-alt3', position, old_edge_dict)
def all_edges(self, start: int, end: int) -> List[Edge]:
    """Return all lattice edges fully contained in the span [start, end).

    Bug fix: the loop previously executed 'break' on the first right-neighbor
    vertex beyond end; since the vertices are iterated in *descending* order,
    that discarded every in-span edge whenever any longer out-of-span edge
    existed at the same start vertex. Out-of-span vertices are now skipped.
    """
    result = []
    for start2 in range(start, end):
        for end2 in sorted(list(self.lattice[(start2, 'right')]), reverse=True):
            if end2 <= end:
                result.extend(self.lattice[(start2, end2)])
    return result
def best_edge_in_span(self, start: int, end: int, skip_num_edge: bool = False) -> Edge | None:
    """Pick the preferred edge for the span [start, end): an active NumEdge wins
    outright (unless skip_num_edge); otherwise a rom/num edge is preferred over
    a decomposition edge, which is preferred over any other edge."""
    plan_b = None  # first rom/num edge
    plan_c = None  # first 'rom decomp' edge
    plan_d = None  # first edge of any other type
    for candidate in self.lattice[(start, end)]:
        if isinstance(candidate, NumEdge):
            if skip_num_edge:
                continue
            if candidate.active:
                return candidate  # plan A: active number edge wins outright
            # an inactive NumEdge still gets classified below
        if candidate.type.startswith('rom decomp'):
            if plan_c is None:
                plan_c = candidate
        elif regex.match(r'(?:rom|num)', candidate.type):
            if plan_b is None:
                plan_b = candidate
        elif plan_d is None:
            plan_d = candidate
    return plan_b or plan_c or plan_d
def best_right_neighbor_edge(self, start: int, skip_num_edge: bool = False) -> Edge | None:
    """Return the best edge leaving vertex start, preferring the longest span."""
    for end in sorted(list(self.lattice[(start, 'right')]), reverse=True):
        edge = self.best_edge_in_span(start, end, skip_num_edge=skip_num_edge)
        if edge:
            return edge
    return None
def best_left_neighbor_edge(self, end: int, skip_num_edge: bool = False) -> Edge | None:
    """Return the best edge arriving at vertex end, preferring the longest span
    (smallest start vertex first)."""
    for start in sorted(list(self.lattice[(end, 'left')])):
        edge = self.best_edge_in_span(start, end, skip_num_edge=skip_num_edge)
        if edge:
            return edge
    return None
def best_rom_edge_path(self, start: int, end: int, skip_num_edge: bool = False) -> List[Edge]:
    """Finds the best romanization edge path through the romanization lattice, including
    non-romanized pieces such as ASCII and non-ASCII punctuation."""
    path = []
    pos = start
    while pos < end:
        edge = self.best_right_neighbor_edge(pos, skip_num_edge=skip_num_edge)
        if edge:
            path.append(edge)
            pos = edge.end
        else:
            pos += 1  # should not happen: no edge leaves this vertex
    return path
def find_rom_edge_path_backwards(self, start: int, end: int, min_char: int | None = None,
                                 return_str: bool = False, skip_num_edge: bool = False) -> List[Edge] | str:
    """Finds a partial best path on the left from a start position to provide left contexts for
    romanization rules. Can return a string or a list of edges. Is typically used for a short context,
    as specified by min_char."""
    result_edges = []  # edges collected right-to-left
    rom = ''           # accumulated romanization of the collected edges
    end2 = end         # current right boundary; moves leftward
    while start < end2:
        old_end2 = end2
        if new_edge := self.best_left_neighbor_edge(end2, skip_num_edge=skip_num_edge):
            result_edges = [new_edge] + result_edges
            rom = new_edge.txt + rom
            end2 = new_edge.start
            if min_char and len(rom) >= min_char:
                break  # enough left context collected
        # NOTE(review): this guard looks intended to force progress when no edge
        # was found (end2 unchanged), but 'old_end2 >= end2' is also true after a
        # successful step (end2 only ever decreases), so end2 is decremented
        # unconditionally, skipping the vertex a consumed edge ends at --
        # verify whether 'end2 >= old_end2' was intended.
        if old_end2 >= end2:
            end2 -= 1
    if return_str:
        return rom
    else:
        return result_edges
def edge_path_to_surf(edges) -> str:
    """Concatenate the romanized text of the edges along a path.

    Idiom/performance: uses str.join instead of repeated '+=' (which is
    quadratic in the worst case); behavior is unchanged.
    """
    return ''.join(edge.txt for edge in edges)
# @timer | |
def main() -> None:
    """This function provides a user interface, either using argparse for a command line interface,
    or providing direct function calls.
    First, a uroman object will have to created, loading uroman data (directory must be provided,
    listed as default). This only needs to be done once.
    After that you can romanize from file to file, or just romanize a string."""
    # Compute data_dir based on the location of this executable script.
    # Layout assumed: <root>/src/uroman.py with resources in <root>/data.
    src_dir = os.path.dirname(os.path.realpath(__file__))
    root_dir = os.path.dirname(src_dir)
    data_dir = os.path.join(root_dir, "data")
    # print(src_dir, root_dir, data)
    parser = argparse.ArgumentParser()
    # Positional args: strings to romanize directly from the command line.
    parser.add_argument('direct_input', nargs='*', type=str)
    parser.add_argument('--data_dir', type=Path, default=data_dir, help='uroman resource dir')
    parser.add_argument('-i', '--input_filename', type=str, help='default: sys.stdin')
    parser.add_argument('-o', '--output_filename', type=str, help='default: sys.stdout')
    parser.add_argument('-l', '--lcode', type=str, default=None,
                        help='ISO 639-3 language code, e.g. eng')
    # parser.add_argument('-f', '--rom_format', type=RomFormat, default=RomFormat.STR, help:'alt: RomFormat.EDGES')
    parser.add_argument('-f', '--rom_format', type=RomFormat, default=RomFormat.STR,
                        choices=list(RomFormat), help="Output format of romanization. 'edges' provides offsets")
    # The remaining arguments are mostly for development and test
    parser.add_argument('--max_lines', type=int, default=None, help='limit uroman to first n lines')
    parser.add_argument('--load_log', action='count', default=0, help='report load stats')
    parser.add_argument('--test', action='count', default=0, help='perform/display a few tests')
    parser.add_argument('-v', '--verbose', action='count', default=0)
    parser.add_argument('--rebuild_ud_props', action='count', default=0,
                        help='rebuild UnicodeDataProps files (for development mode only)')
    parser.add_argument('--rebuild_num_props', action='count', default=0,
                        help='rebuild NumProps file (for development mode only)')
    parser.add_argument('--no_caching', action='count', default=0, help='for development mode: speed')
    parser.add_argument('--silent', action='count', default=0, help='suppress ... progress')
    parser.add_argument('-a', '--ablation', type=str, default='', help='for development mode: nocap')
    parser.add_argument('--stats', action='count', default=0, help='for development mode: numbers')
    parser.add_argument('--ignore_args', action='count', default=0, help='for usage illustration only')
    # PROFILE_FLAG ("--profile") also gates the conditional cProfile import at file top.
    parser.add_argument(PROFILE_FLAG, type=argparse.FileType('w', encoding='utf-8', errors='ignore'),
                        default=None, metavar='PROFILE-FILENAME', help='(optional output for performance analysis)')
    args = parser.parse_args()
    # copy selected (minor) args from argparse.Namespace to dict
    # (passed as **kwargs to romanize_string/romanize_file below)
    args_dict = {'rom_format': args.rom_format, 'load_log': args.load_log, 'test': args.test, 'stats': args.stats,
                 'no_caching': args.no_caching, 'max_lines': args.max_lines, 'verbose': args.verbose,
                 'rebuild_ud_props': args.rebuild_ud_props, 'rebuild_num_props': args.rebuild_num_props,
                 'ablation': args.ablation, 'silent': args.silent}
    pr = None
    if args.profile:
        # Development-only: enable gc debugging and start a cProfile session;
        # results are written to the --profile file at the end of main().
        gc.enable()
        gc.set_debug(gc.DEBUG_STATS)
        gc.set_debug(gc.DEBUG_LEAK)
        pr = cProfile.Profile()
        pr.enable()
    '''Sample calls:
uroman.py --help
uroman.py -i ../test/multi-script.txt -o ../test/multi-script-out2.txt
uroman.py < ../test/multi-script.txt  > ../test/multi-script-out2.txt
uroman.py Игорь
uroman.py Игорь --lcode ukr
uroman.py ألاسكا 서울 Καλιφόρνια
uroman.py ちょっとまってください -f edges
uroman.py "महात्मा गांधी" -f lattice
uroman.py สวัสดี --load_log
uroman.py --test
uroman.py --ignore_args
uroman.py Բարեւ -o ../test/tmp-out.txt -f edges
# In double input cases such as in the line below,
# the input-file's romanization is sent to stdout, while the direct-input romanization is sent to stderr
uroman.py ⴰⵣⵓⵍ -i ../test/multi-script.txt > ../test/multi-script-out2.txt
'''
    if args.ignore_args:
        # minimal calls (usage illustration; ignores most other flags)
        uroman = Uroman(args.data_dir)
        s, s2, s3, s4 = 'Игорь', 'ちょっとまってください', 'kanne', 'महात्मा गांधी'
        print(s, uroman.romanize_string(s))
        print(s, uroman.romanize_string(s, lcode='ukr'))
        print(s2, Edge.json_str(uroman.romanize_string(s2, rom_format=RomFormat.EDGES)))
        print(s3, Edge.json_str(uroman.romanize_string(s3, rom_format=RomFormat.EDGES)))
        print(s4, Edge.json_str(uroman.romanize_string(s4, rom_format=RomFormat.LATTICE)))
        # Note that ../test/multi-script.txt has several lines starting with ::lcode eng etc.
        # This allows users to select specific language codes to specific lines, overwriting the overall --lcodes
        uroman.romanize_file(input_filename='../test/multi-script.txt',
                             output_filename='../test/multi-script-out3.txt')
    else:
        # build a Uroman object (once for many applications and different scripts and languages)
        uroman = Uroman(args.data_dir, load_log=args.load_log, rebuild_ud_props=args.rebuild_ud_props,
                        rebuild_num_props=args.rebuild_num_props)
        # File romanization runs when a file is named explicitly, or when there is
        # no direct input / test / rebuild work at all (then stdin -> stdout).
        romanize_file_p = (args.input_filename or args.output_filename
                           or not (args.direct_input or args.test or args.ignore_args
                                   or args.rebuild_ud_props or args.rebuild_num_props))
        # Romanize any positional arguments, interpreted as strings to be romanized.
        for s in args.direct_input:
            result = uroman.romanize_string(s.rstrip(), lcode=args.lcode, **args_dict)
            result_json = Edge.json_str(result)
            if romanize_file_p:
                # input from both file/stdin (to file/stdout) and direct-input (to stderr)
                if args.input_filename:
                    sys.stderr.write(result_json + '\n')
                # input from direct-input (but not from file/stdin) to stdout
                # else pass
            # no file/stdin or file/stdout, so we write romanization of direct-input to stdout
            else:
                print(result_json)
        # If provided, apply romanization to an entire file.
        if romanize_file_p:
            uroman.romanize_file(args.input_filename, args.output_filename, lcode=args.lcode,
                                 direct_input=args.direct_input, **args_dict)
        if args.test:
            uroman.test_output_of_selected_scripts_and_rom_rules()
            uroman.test_romanization()
        if uroman.stats and args.stats:
            # Report only the first 100 entries to keep stderr output bounded.
            stats100 = {k: uroman.stats[k] for k in list(dict(uroman.stats))[:100]}
            sys.stderr.write(f'Stats: {stats100} ...\n')
    if args.profile:
        # Stop profiling, dump stats (sorted by time) to the --profile file,
        # and print gc statistics collected under DEBUG_STATS/DEBUG_LEAK.
        if pr:
            pr.disable()
            ps = pstats.Stats(pr, stream=args.profile).sort_stats(pstats.SortKey.TIME)
            ps.print_stats()
        print(gc.get_stats())
# Standard script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()