phonemize-text / app.py
cdleong's picture
formatting
743f5fb
raw
history blame
4.55 kB
import streamlit as st
import epitran
import langcodes
from langcodes import LanguageTagError
from pathlib import Path
from operator import itemgetter
from collections import defaultdict
# TODO: reverse transliterate?
@st.cache
def get_lang_description_from_mapping_name(string_to_check, add_original_code=True, add_iso_url=False):
    """Build a human-readable description for an Epitran mapping name.

    Args:
        string_to_check: Mapping name such as "swa-Latn". It is passed to
            get_langcode_lang_from_mapping_name to resolve a language object.
        add_original_code: When true, the returned description is prefixed
            with "<string_to_check>: ".
        add_iso_url: When true, the "language" entry is rendered as a
            Markdown link to its iso639-3.sil.org code page.

    Returns:
        "Generic Latin Script text" for the "generic-Latn" mapping, a
        comma-separated "key: value" description for a resolved language
        (plus a note for each recognised mapping suffix), or None when no
        language could be resolved.
    """
    if string_to_check == "generic-Latn":
        return "Generic Latin Script text"

    lang = get_langcode_lang_from_mapping_name(string_to_check)
    if not lang:
        # Nothing to describe. Return early rather than falling through to the
        # string concatenations below, which would raise TypeError on None.
        return None

    items = []
    for key, value in lang.describe().items():
        if key == "language" and add_iso_url:
            iso_code = lang.to_alpha3()
            value = f"[{value}](https://iso639-3.sil.org/code/{iso_code})"
        items.append(f"{key}: {value}")
    description = ", ".join(items)

    # Suffix markers that may appear in a mapping name, with the note each adds.
    # Every note starts with a space and is parenthesised so it reads cleanly
    # when appended to the description.
    notes = {
        "-red": " (reduced)",
        "-suf": " (Based on data with suffixes attached)",
        "-nosuf": " (Based on data with suffixes removed)",
        "-np": " (process naively, assuming a phonemic orthography)",
    }
    description += "".join(
        note for marker, note in notes.items() if marker in string_to_check
    )

    if add_original_code:
        description = f"{string_to_check}: {description}"
    return description
@st.cache
def get_langcode_lang_from_mapping_name(string_to_check):
    """Resolve the language/script prefix of a mapping name via langcodes.

    Only the first two hyphen-separated parts of ``string_to_check`` are used;
    any further parts are dropped before the lookup.

    Returns:
        None when ``string_to_check`` is shorter than two characters,
        otherwise whatever ``langcodes.get`` returns for the shortened tag.
    """
    if len(string_to_check) < 2:
        return None

    # first two are ISO 639-3 language, and ISO 15924 script
    language_and_script = "-".join(string_to_check.split("-")[:2])
    return langcodes.get(language_and_script)
@st.cache
def get_valid_epitran_mappings_list():
    """List the Epitran mapping names offered by this app.

    Names are the file stems found under ``data/map`` inside the installed
    ``epitran`` package, followed by the special-case "cmn-Hans" entry, with
    the known-problematic mappings filtered out.

    Returns:
        A list of mapping-name strings in discovery order.
    """
    # Mappings excluded from the list; see
    # https://github.com/dmort27/epitran/issues/98
    excluded = {
        'generic-Latn',
        'tur-Latn-bab',
        'ood-Latn-sax',
        'vie-Latn-so',
        'vie-Latn-ce',
        'vie-Latn-no',
        'kaz-Cyrl-bab',
    }

    map_dir = Path(epitran.__path__[0]) / "data" / "map"
    candidates = [entry.stem for entry in map_dir.glob("*.*")]
    candidates.append("cmn-Hans")  # special case

    return [name for name in candidates if name not in excluded]
def get_epitran(selected_mapping):
    """Create an Epitran transliterator for ``selected_mapping``.

    For the "cmn-Hans" mapping an info message is shown and
    ``epitran.download.cedict()`` is called before the transliterator is
    constructed.

    Returns:
        The ``epitran.Epitran`` instance built from ``selected_mapping``.
    """
    needs_dictionary = selected_mapping == "cmn-Hans"
    if needs_dictionary:
        st.info("Chinese requires a special dictionary. Downloading now")
        epitran.download.cedict()

    return epitran.Epitran(selected_mapping)
if __name__ == "__main__":
    # Page header and short introduction.
    st.write("# Phonemize your text with [Epitran](https://github.com/dmort27/epitran)!")
    st.write("Epitran is a library and tool for transliterating orthographic text as IPA (International Phonetic Alphabet), by Mortensen, David R. and Dalmia, Siddharth and Littell, Patrick.")

    valid_epitran_mappings = get_valid_epitran_mappings_list()
    st.write(f"It supports converting many writing systems to IPA symbols, including approximately {len(valid_epitran_mappings)} languages/scripts, listed below:")

    # Preselect Swahili; fall back to the first entry if that mapping is not
    # in the list instead of crashing the page with a ValueError.
    try:
        index_of_desired_default = valid_epitran_mappings.index("swa-Latn")
    except ValueError:
        index_of_desired_default = 0

    selected_mapping = st.selectbox(
        "Select input language/script:",
        valid_epitran_mappings,
        index=index_of_desired_default,
        format_func=get_lang_description_from_mapping_name,
    )

    description = get_lang_description_from_mapping_name(selected_mapping, add_iso_url=True)
    st.write(f"Selected input language/script: {description}")

    st.info("attempting to instantiate epitran transliterator for your language/script")
    epi = get_epitran(str(selected_mapping))

    # Example text shown in the input box; mappings without a specific example
    # get a generic prompt.
    examples = defaultdict(lambda: 'Try typing some words in the language you chose, and they will be transliterated.')
    examples['cmn-Hans'] = '太初有道,道与神同在,道就是神。'
    examples['swa-Latn'] = 'Mwanzoni Kabla ulimwengu haujaumbwa alikuwepo Neno Huyo Neno alikuwa pamoja na Mungu, na Neno alikuwa Mungu.'

    input_text = st.text_area(label="Whatever you type here will be transliterated!", value=examples[selected_mapping])

    st.info(f"transliterating `{input_text}`\n\tusing {selected_mapping}...")
    transliteration = epi.transliterate(input_text)

    # Show the input and its transliteration side by side.
    output = {
        "original": input_text,
        "transliteration": transliteration,
    }
    st.write(output)