"""Streamlit demo app: transliterate user text to IPA with Epitran."""

from operator import itemgetter
from pathlib import Path

import epitran
import langcodes
import streamlit as st
from langcodes import LanguageTagError

# TODO: reverse transliterate?

# Mapping names that are present in epitran's data directory but are left out
# of the selector. See https://github.com/dmort27/epitran/issues/98
_PROBLEM_MAPPINGS = frozenset(
    {
        "generic-Latn",
        "tur-Latn-bab",
        "ood-Latn-sax",
        "vie-Latn-so",
        "vie-Latn-ce",
        "vie-Latn-no",
        "kaz-Cyrl-bab",
    }
)


def get_lang_description_from_mapping_name(string_to_check):
    """Return a human-readable description for an Epitran mapping name.

    The name is parsed as a language tag with ``langcodes``. Trailing subtags
    that ``langcodes`` rejects (e.g. the ``red`` in ``amh-Ethi-red``) are
    stripped one at a time and the shortened tag is retried.

    Args:
        string_to_check: Mapping name such as ``"swa-Latn"``.

    Returns:
        A string of ``"key: value"`` pairs joined by ``", "`` taken from
        ``Language.describe()``, with ``" (reduced)"`` appended when a
        trailing ``red`` subtag had to be stripped; ``"Generic Latin Script"``
        for ``"generic-Latn"``; or ``None`` when the name is shorter than two
        characters or cannot be parsed.
    """
    if string_to_check == "generic-Latn":
        return "Generic Latin Script"
    if len(string_to_check) < 2:
        return None

    try:
        description = None
        lang = langcodes.get(string_to_check)
        if lang:
            description = ", ".join(
                f"{key}: {value}" for key, value in lang.describe().items()
            )
        return description
    except LanguageTagError as e:
        message = str(e)
        if "out of place" in message or "must be followed by something" in message:
            # Examples of the errors handled here:
            # LanguageTagError: This extlang subtag, 'red', is out of place. Expected territory, variant, extension, or end of string.
            # LanguageTagError: This script subtag, 'east', is out of place. Expected territory, variant, extension, or end of string.
            # LanguageTagError: The subtag 'p' must be followed by something
            #
            # Drop the last hyphen-separated subtag and retry. The removed
            # subtag is kept so it can be inspected afterwards; checking the
            # shortened tag's last element would look at the wrong subtag.
            remaining, _, removed = string_to_check.rpartition("-")
            desc = get_lang_description_from_mapping_name(remaining)
            if removed == "red" and desc is not None:
                desc = desc + " (reduced)"
            return desc

        print("*****")
        print(e)
        return None


def get_valid_epitran_mappings_list():
    """Return the selectable Epitran mapping names.

    Names are the file stems found in epitran's ``data/map`` directory, minus
    the entries in ``_PROBLEM_MAPPINGS``. The result is de-duplicated and
    sorted so the selector order does not depend on filesystem listing order.

    Returns:
        A sorted list of mapping-name strings.
    """
    map_path = Path(epitran.__path__[0]) / "data" / "map"
    stems = {map_file.stem for map_file in map_path.glob("*.*")}
    return sorted(stems - _PROBLEM_MAPPINGS)


def main():
    """Render the Streamlit page and transliterate the entered text."""
    st.write("# Phonemize your text with [Epitran](https://github.com/dmort27/epitran)!")
    st.write(
        "Epitran is a library and tool for transliterating orthographic text as IPA "
        "(International Phonetic Alphabet), by Mortensen, David R. and Dalmia, Siddharth and Littell, Patrick."
    )
    st.write("It supports converting many writing systems to IPA symbols, including the following languages/scripts:")

    valid_epitran_mappings = get_valid_epitran_mappings_list()
    if not valid_epitran_mappings:
        # Without any mapping there is nothing to select or instantiate.
        st.error("No Epitran mappings were found in the installed epitran package.")
        return
    st.write(valid_epitran_mappings)

    selected_mapping = st.selectbox("Select input language/script:", valid_epitran_mappings)
    description = get_lang_description_from_mapping_name(selected_mapping)
    # Fall back to the raw mapping name rather than showing "None".
    st.write(f"Selected input language/script: {description or selected_mapping}")

    # iso_lang_code = st.text_input(
    #     label="Three-letter ISO-639-3 (https://iso639-3.sil.org/) language code",
    #     value="swa"
    # )
    # st.write(f"iso code is {iso_lang_code}")
    #
    # iso_script_code = st.text_input(
    #     label="ISO 15924 (https://unicode.org/iso15924/iso15924-codes.html) script code, e.g. 'Latn' for Latin script, 'Hans' for Chinese script, etc.",
    #     value="Latn"
    # )
    # st.write(f'iso code is {iso_script_code}')

    input_text = st.text_area(
        label="Whatever you type here will be transliterated!",
        value="Gari langu linaloangama limejaa na mikunga",
    )

    # combined_code = "-".join([iso_lang_code, iso_script_code])
    # st.write(f"Combined code: {combined_code}")

    st.info("attempting to instantiate epitran transliterator for your language/script")
    epi = epitran.Epitran(selected_mapping)

    st.info(f"transliterating `{input_text}`\n\tusing {selected_mapping}...")
    transliteration = epi.transliterate(input_text)
    st.success(transliteration)


if __name__ == "__main__":
    main()