File size: 3,156 Bytes
a013c5c
 
c95a8ea
 
db6d318
0db47ae
145304e
 
c95a8ea
da9bcd4
c95a8ea
 
 
 
bc0875e
905833b
 
12dfc24
905833b
c95a8ea
24c5e5a
 
 
 
 
 
 
 
 
 
 
 
905833b
24c5e5a
 
 
 
c95a8ea
 
 
7c012d1
 
c95a8ea
49de2b2
 
0db47ae
 
 
 
 
 
 
49de2b2
f6ae8b8
0db47ae
0e45975
c95a8ea
 
a013c5c
c95a8ea
1651e6e
b33e08e
619b882
817d838
c95a8ea
ed67811
817d838
 
 
b33e08e
da9bcd4
49de2b2
 
c95a8ea
383b08c
c95a8ea
 
49de2b2
 
 
 
 
 
 
 
62383b9
44f705d
c95a8ea
 
49de2b2
2bd2657
b33e08e
44f705d
2bd2657
 
c95a8ea
2bd2657
44f705d
a013c5c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import streamlit as st
import epitran
import langcodes
from langcodes import LanguageTagError
from pathlib import Path
from operator import itemgetter
# TODO: reverse transliterate? 


def get_lang_description_from_mapping_name(string_to_check):
  """Build a human-readable description for an Epitran mapping name.

  Mapping names have the form ``<language>-<script>[-<variant>]``. Only the
  first two parts (ISO 639-3 language and ISO 15924 script) are passed to
  ``langcodes``; the variant is inspected separately.

  Args:
    string_to_check: Mapping name, e.g. ``"swa-Latn"`` or ``"ben-Beng-red"``.

  Returns:
    A comma-separated ``key: value`` description built from
    ``langcodes.Language.describe()``, with the language name rendered as a
    Markdown link to its ISO 639-3 page. ``" (reduced)"`` is appended when
    the name has a trailing ``red`` variant part. Returns
    ``"Generic Latin Script"`` for ``"generic-Latn"`` and ``None`` when the
    name is shorter than two characters or cannot be parsed as a language tag.
  """
  if string_to_check == "generic-Latn":
    return "Generic Latin Script"

  if len(string_to_check) < 2:
    return None

  parts = string_to_check.split("-")
  # Inspect the variant BEFORE truncating: once only the first two parts are
  # kept, the last part can no longer be a third-position "red".
  is_reduced = len(parts) > 2 and parts[-1] == "red"
  # first two are ISO 639-3 language, and ISO 15924 script
  tag = "-".join(parts[:2])

  try:
    lang = langcodes.get(tag)
  except langcodes.LanguageTagError:
    # Not a parseable language tag; report "no description" instead of crashing.
    return None
  if not lang:
    return None

  items = []
  for key, value in lang.describe().items():
    if key == "language":
      try:
        iso_code = lang.to_alpha3()
      except LookupError:
        # No three-letter code is known; keep the plain name without a link.
        iso_code = None
      if iso_code:
        value = f"[{value}](https://iso639-3.sil.org/code/{iso_code})"
    items.append(f"{key}: {value}")

  description = ", ".join(items)
  if is_reduced:
    description = description + " (reduced)"
  return description


def get_valid_epitran_mappings_list():
  """List the Epitran mapping names offered to the user.

  Collects the stem of every ``*.*`` file in the installed ``epitran``
  package's ``data/map`` directory, adds ``"cmn-Hans"``, and drops the
  mappings named in the exclusion set below.

  Returns:
    A list of mapping-name strings, in directory-listing order with
    ``"cmn-Hans"`` last.
  """
  # https://github.com/dmort27/epitran/issues/98
  excluded = {
    "generic-Latn",
    "tur-Latn-bab",
    "ood-Latn-sax",
    "vie-Latn-so",
    "vie-Latn-ce",
    "vie-Latn-no",
    "kaz-Cyrl-bab",
  }

  map_dir = Path(epitran.__path__[0]).joinpath("data", "map")
  candidates = [entry.stem for entry in map_dir.glob("*.*")]
  candidates.append("cmn-Hans")

  return [name for name in candidates if name not in excluded]


if __name__ == "__main__":
  # Streamlit page: pick an Epitran mapping, type text, see its IPA transliteration.

  st.write("# Phonemize your text with [Epitran](https://github.com/dmort27/epitran)!")

  st.write("Epitran is a library and tool for transliterating orthographic text as IPA (International Phonetic Alphabet), by Mortensen, David R.  and Dalmia, Siddharth and Littell, Patrick.") 

  valid_epitran_mappings = get_valid_epitran_mappings_list()

  # Preselect Swahili when it is available; fall back to the first entry
  # rather than raising ValueError if the mapping is missing.
  default_mapping = "swa-Latn"
  if default_mapping in valid_epitran_mappings:
    index_of_swa_latn = valid_epitran_mappings.index(default_mapping)
  else:
    index_of_swa_latn = 0

  st.write(f"It supports converting many writing sytems to IPA symbols, including approximately {len(valid_epitran_mappings)} languages/scripts, listed below:")

  selected_mapping = st.selectbox("Select input language/script:", valid_epitran_mappings, index=index_of_swa_latn)

  description = get_lang_description_from_mapping_name(selected_mapping)
  st.write(f"Selected input language/script: {description}")

  # Comparison, not assignment: `=` here was a SyntaxError that kept the
  # whole script from running.
  if selected_mapping == "cmn-Hans":
    st.info("Chinese requires a special dictionary. Downloading now")
    epitran.download.cedict()

  st.info("attempting to instantiate epitran transliterator for your language/script")
  epi = epitran.Epitran(selected_mapping)

  input_text = st.text_area(label="Whatever you type here will be transliterated!", value="Gari langu linaloangama limejaa na mikunga")

  st.info(f"transliterating `{input_text}`\n\tusing {selected_mapping}...")  
  transliteration = epi.transliterate(input_text)

  st.success(transliteration)