# phonemize-text / app.py
# cdleong's picture
# Call the right function name -_-
# dbbbbd3
# raw
# history blame
# 4.17 kB
import streamlit as st
import epitran
import langcodes
from langcodes import LanguageTagError
from pathlib import Path
from operator import itemgetter
from collections import defaultdict
# TODO: reverse transliterate?
@st.cache
def get_lang_description_from_mapping_name(string_to_check, add_iso_url=False):
    """Build a human-readable description for an Epitran mapping name.

    Args:
        string_to_check: Mapping name such as "swa-Latn", optionally with a
            third hyphen-separated component (e.g. "red").
        add_iso_url: When true, the language name is rendered as a Markdown
            link to its page on iso639-3.sil.org.

    Returns:
        "Generic Latin Script text" for the "generic-Latn" mapping;
        otherwise a comma-separated list of "key: value" pairs taken from
        ``lang.describe()``, with " (reduced)" appended when the mapping
        name ends in "-red"; or None when no language could be resolved.
    """
    if string_to_check == "generic-Latn":
        return "Generic Latin Script text"

    lang = get_langcode_lang_from_mapping_name(string_to_check)
    if not lang:
        return None

    items = []
    for key, value in lang.describe().items():
        if key == "language" and add_iso_url:
            iso_code = lang.to_alpha3()
            value = f"[{value}](https://iso639-3.sil.org/code/{iso_code})"
        items.append(f"{key}: {value}")
    description = ", ".join(items)

    # The suffix must be derived from the mapping name here: this function
    # has no `substrings` local of its own, so split the name directly.
    if string_to_check.split("-")[-1] == "red":
        description += " (reduced)"
    return description
@st.cache
def get_langcode_lang_from_mapping_name(string_to_check):
    """Resolve an Epitran mapping name to a ``langcodes`` language object.

    Only the first two hyphen-separated components of the name are used;
    any further component (such as a variant suffix) is ignored.

    Args:
        string_to_check: Mapping name, e.g. "swa-Latn" or "tur-Latn-bab".

    Returns:
        The object returned by ``langcodes.get`` for the "lang-Script" tag,
        or None when the name is shorter than two characters or langcodes
        rejects the tag as malformed.
    """
    if len(string_to_check) < 2:
        return None

    # first two are ISO 639-3 language, and ISO 15924 script
    iso_lang_and_iso_script = string_to_check.split("-")[:2]
    tag = "-".join(iso_lang_and_iso_script)
    try:
        return langcodes.get(tag)
    except LanguageTagError:
        # A name that is not a parseable language tag has no description;
        # callers already treat a falsy result as "unknown language".
        return None
@st.cache
def get_valid_epitran_mappings_list():
    """List the Epitran mapping names offered to the user.

    The names are the stems of the files found in the installed epitran
    package's ``data/map`` directory, minus a fixed set of mappings known
    to be problematic, followed by the special-cased "cmn-Hans".

    Returns:
        A list of mapping-name strings.
    """
    # https://github.com/dmort27/epitran/issues/98
    known_broken = [
        'generic-Latn',
        'tur-Latn-bab',
        'ood-Latn-sax',
        'vie-Latn-so',
        'vie-Latn-ce',
        'vie-Latn-no',
        'kaz-Cyrl-bab',
    ]

    map_dir = Path(epitran.__path__[0]) / "data" / "map"
    usable = []
    for entry in map_dir.glob("*.*"):
        name = entry.stem
        if name not in known_broken:
            usable.append(name)

    # special case: not discovered from the map directory, added by hand
    usable.append("cmn-Hans")
    return usable
def get_epitran(selected_mapping):
    """Create an ``epitran.Epitran`` transliterator for a mapping name.

    For "cmn-Hans" an info message is shown and
    ``epitran.download.cedict()`` is called before the transliterator is
    constructed.

    Args:
        selected_mapping: Mapping name passed to ``epitran.Epitran``.

    Returns:
        The constructed ``epitran.Epitran`` instance.
    """
    needs_cedict = selected_mapping == "cmn-Hans"
    if needs_cedict:
        st.info("Chinese requires a special dictionary. Downloading now")
        epitran.download.cedict()
    return epitran.Epitran(selected_mapping)
if __name__ == "__main__":
    # Streamlit page: pick a language/script mapping, type some text, and
    # show its Epitran transliteration.
    st.write("# Phonemize your text with [Epitran](https://github.com/dmort27/epitran)!")
    st.write("Epitran is a library and tool for transliterating orthographic text as IPA (International Phonetic Alphabet), by Mortensen, David R. and Dalmia, Siddharth and Littell, Patrick.")

    valid_epitran_mappings = get_valid_epitran_mappings_list()
    st.write(f"It supports converting many writing systems to IPA symbols, including approximately {len(valid_epitran_mappings)} languages/scripts, listed below:")

    # Preselect Swahili when it is available; otherwise fall back to the
    # first entry instead of failing with ValueError.
    try:
        index_of_desired_default = valid_epitran_mappings.index("swa-Latn")
    except ValueError:
        index_of_desired_default = 0

    selected_mapping = st.selectbox(
        "Select input language/script:",
        valid_epitran_mappings,
        index=index_of_desired_default,
        format_func=get_lang_description_from_mapping_name,
    )

    description = get_lang_description_from_mapping_name(selected_mapping, add_iso_url=True)
    st.write(f"Selected input language/script: {description}")

    st.info("attempting to instantiate epitran transliterator for your language/script")
    epi = get_epitran(str(selected_mapping))

    # Example text shown in the input box; mappings without a dedicated
    # example get the generic prompt from the defaultdict factory.
    examples = defaultdict(lambda: 'Try typing some words in the language you chose, and they will be transliterated.')
    examples['cmn-Hans'] = '太初有道，道与神同在，道就是神。'
    examples['swa-Latn'] = 'Mwanzoni Kabla ulimwengu haujaumbwa alikuwepo Neno Huyo Neno alikuwa pamoja na Mungu, na Neno alikuwa Mungu.'

    input_text = st.text_area(label="Whatever you type here will be transliterated!", value=examples[selected_mapping])

    st.info(f"transliterating `{input_text}`\n\tusing {selected_mapping}...")
    transliteration = epi.transliterate(input_text)

    output = {
        "original": input_text,
        "transliteration": transliteration,
    }
    st.write(output)