ORI-Muchim commited on
Commit
770bfff
·
1 Parent(s): 35cc217

Upload 5 files

Browse files
Files changed (5) hide show
  1. text/LICENSE +19 -0
  2. text/__init__.py +56 -0
  3. text/cleaners.py +128 -0
  4. text/korean.py +210 -0
  5. text/symbols.py +10 -0
text/LICENSE ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2017 Keith Ito
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
text/__init__.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ from https://github.com/keithito/tacotron """
2
+ from text import cleaners
3
+ from text.symbols import symbols
4
+
5
+
6
# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}  # symbol -> integer id
_id_to_symbol = {i: s for i, s in enumerate(symbols)}  # integer id -> symbol
9
+
10
+
11
def text_to_sequence(text, cleaner_names):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

    Args:
      text: string to convert to a sequence
      cleaner_names: names of the cleaner functions to run the text through
    Returns:
      List of integers corresponding to the symbols in the text
    '''
    clean_text = _clean_text(text, cleaner_names)
    # Symbols not present in the vocabulary are silently dropped, matching
    # cleaned_text_to_sequence below.
    return [_symbol_to_id[symbol] for symbol in clean_text if symbol in _symbol_to_id]
28
+
29
+
30
def cleaned_text_to_sequence(cleaned_text):
    '''Converts a string of already-cleaned text to a sequence of symbol IDs.

    Args:
      cleaned_text: cleaned string to convert to a sequence
    Returns:
      List of integers corresponding to the symbols in the text
    '''
    # Membership test on the dict directly (not .keys()); unknown symbols are dropped.
    return [_symbol_to_id[symbol] for symbol in cleaned_text if symbol in _symbol_to_id]
39
+
40
+
41
def sequence_to_text(sequence):
    '''Converts a sequence of IDs back to a string'''
    # Raises KeyError on an id outside the vocabulary, like the lookup loop it replaces.
    return ''.join(_id_to_symbol[symbol_id] for symbol_id in sequence)
48
+
49
+
50
def _clean_text(text, cleaner_names):
    '''Run *text* through each named cleaner function from text.cleaners, in order.

    Raises:
      Exception: if a name in cleaner_names is not defined in text.cleaners.
    '''
    for name in cleaner_names:
        # default=None so a missing cleaner reaches our error below instead of
        # raising AttributeError inside getattr (the original check was unreachable).
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
text/cleaners.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
3
+ from text.korean import latin_to_hangul, number_to_hangul, divide_hangul, korean_to_lazy_ipa, korean_to_ipa
4
+ from g2pk2 import G2p
5
+ from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
6
+ #from text.sanskrit import devanagari_to_ipa
7
+ from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
8
+ #from text.thai import num_to_thai, latin_to_thai
9
+ #from text.shanghainese import shanghainese_to_ipa
10
+ #from text.cantonese import cantonese_to_ipa
11
+ #from text.ngu_dialect import ngu_dialect_to_ipa
12
+
13
+
14
def japanese_cleaners(text):
    '''Pipeline for Japanese text: romaji with accent, trailing period ensured.'''
    romaji = japanese_to_romaji_with_accent(text)
    # Close the utterance with a period when it ends in a bare letter.
    return re.sub(r'([A-Za-z])$', r'\1.', romaji)
18
+
19
+
20
def japanese_cleaners2(text):
    '''Like japanese_cleaners, but with the affricate ligature and ellipsis symbols.'''
    # Restored literals: the source file had these mis-encoded (UTF-8 read as cp874).
    return japanese_cleaners(text).replace('ts', 'ʦ').replace('...', '…')
22
+
23
+
24
def korean_cleaners(text):
    '''Pipeline for Korean text'''
    text = latin_to_hangul(text)
    # G2p() loads model data and is expensive; the original rebuilt it on every
    # call. Cache one instance on the function itself (lazy singleton).
    g2p = getattr(korean_cleaners, '_g2p', None)
    if g2p is None:
        g2p = G2p()
        korean_cleaners._g2p = g2p
    text = g2p(text)
    text = divide_hangul(text)
    # Append a period when the text ends in a bare compatibility jamo.
    text = re.sub(r'([\u3131-\u3163])$', r'\1.', text)
    return text
32
+
33
+
34
def chinese_cleaners(text):
    '''Pipeline for Chinese text'''
    text = number_to_chinese(text)
    text = chinese_to_bopomofo(text)
    text = latin_to_bopomofo(text)
    # Restored literals (source was mojibake): if the text ends in a bare tone
    # mark, close it with an ideographic full stop.
    text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text)
    return text
41
+
42
+
43
def zh_ja_mixture_cleaners(text):
    '''Pipeline for mixed text tagged [ZH]...[ZH] (Chinese) and [JA]...[JA] (Japanese).

    Restored literals: the source file's IPA characters were mojibake.
    '''
    text = re.sub(r'\[ZH\](.*?)\[ZH\]',
                  lambda x: chinese_to_romaji(x.group(1))+' ', text)
    text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent(
        x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')+' ', text)
    text = re.sub(r'\s+$', '', text)
    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
    return text
51
+
52
+
53
def sanskrit_cleaners(text):
    '''Pipeline for Sanskrit (Devanagari) text.

    Normalizes the double danda to a single danda, expands the Om ligature,
    and ensures the text ends with a danda. Restored literals: the source
    file's Devanagari characters were mojibake.
    '''
    text = text.replace('॥', '।').replace('ॐ', 'ओम्')
    text = re.sub(r'([^।])$', r'\1।', text)
    return text
57
+
58
+
59
def cjks_cleaners(text):
    '''Pipeline for mixed [ZH]/[JA]/[KO]/[EN]-tagged text, producing lazy IPA.'''
    text = re.sub(r'\[ZH\](.*?)\[ZH\]',
                  lambda x: chinese_to_lazy_ipa(x.group(1))+' ', text)
    text = re.sub(r'\[JA\](.*?)\[JA\]',
                  lambda x: japanese_to_ipa(x.group(1))+' ', text)
    text = re.sub(r'\[KO\](.*?)\[KO\]',
                  lambda x: korean_to_lazy_ipa(x.group(1))+' ', text)
    # NOTE(review): the original substituted [SA] spans via devanagari_to_ipa,
    # but that import is commented out at the top of this file, so any [SA]
    # input raised NameError. The Sanskrit branch is disabled until
    # `from text.sanskrit import devanagari_to_ipa` is restored.
    text = re.sub(r'\[EN\](.*?)\[EN\]',
                  lambda x: english_to_lazy_ipa(x.group(1))+' ', text)
    text = re.sub(r'\s+$', '', text)
    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
    return text
73
+
74
def cjke_cleaners(text):
    '''Pipeline for mixed [ZH]/[JA]/[KO]/[EN]-tagged text, normalized toward a shared IPA set.

    Restored literals: the source file's IPA characters were mojibake.
    '''
    text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace(
        'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')+' ', text)
    text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace(
        'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')+' ', text)
    text = re.sub(r'\[KO\](.*?)\[KO\]',
                  lambda x: korean_to_ipa(x.group(1))+' ', text)
    text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace(
        'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')+' ', text)
    text = re.sub(r'\s+$', '', text)
    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
    return text
86
+
87
def cjke_cleaners2(text):
    '''Pipeline for mixed [ZH]/[JA]/[KO]/[EN]-tagged text using full IPA converters.

    Restored literal: the ellipsis in the final regex was mojibake in the source.
    '''
    text = re.sub(r'\[ZH\](.*?)\[ZH\]',
                  lambda x: chinese_to_ipa(x.group(1))+' ', text)
    text = re.sub(r'\[JA\](.*?)\[JA\]',
                  lambda x: japanese_to_ipa2(x.group(1))+' ', text)
    text = re.sub(r'\[KO\](.*?)\[KO\]',
                  lambda x: korean_to_ipa(x.group(1))+' ', text)
    text = re.sub(r'\[EN\](.*?)\[EN\]',
                  lambda x: english_to_ipa2(x.group(1))+' ', text)
    text = re.sub(r'\s+$', '', text)
    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
    return text
99
+
100
+
101
def thai_cleaners(text):
    '''Pipeline for Thai text.

    NOTE(review): num_to_thai and latin_to_thai come from text.thai, whose
    import is commented out at the top of this file — calling this cleaner
    currently raises NameError until that import is restored.
    '''
    text = num_to_thai(text)
    text = latin_to_thai(text)
    return text
105
+
106
+
107
def shanghainese_cleaners(text):
    '''Pipeline for Shanghainese text (IPA output with trailing punctuation).

    Restored literal: the ellipsis in the regex was mojibake in the source.
    NOTE(review): shanghainese_to_ipa comes from text.shanghainese, whose
    import is commented out above — restore it before using this cleaner.
    '''
    text = shanghainese_to_ipa(text)
    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
    return text
111
+
112
+
113
def chinese_dialect_cleaners(text):
    '''Pipeline for mixed [ZH]/[JA]/[SH]/[GD]/dialect/[EN]-tagged text.

    Restored literals: the tone letters and IPA characters were mojibake in
    the source. NOTE(review): shanghainese_to_ipa, cantonese_to_ipa and
    ngu_dialect_to_ipa come from modules whose imports are commented out at
    the top of this file; those branches raise NameError until restored.
    '''
    text = re.sub(r'\[ZH\](.*?)\[ZH\]',
                  lambda x: chinese_to_ipa2(x.group(1))+' ', text)
    text = re.sub(r'\[JA\](.*?)\[JA\]',
                  lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text)
    text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5',
                  '˧˧˦').replace('6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace('ᴀ', 'ɐ').replace('ᴇ', 'e')+' ', text)
    text = re.sub(r'\[GD\](.*?)\[GD\]',
                  lambda x: cantonese_to_ipa(x.group(1))+' ', text)
    text = re.sub(r'\[EN\](.*?)\[EN\]',
                  lambda x: english_to_lazy_ipa2(x.group(1))+' ', text)
    text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group(
        1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ')+' ', text)
    text = re.sub(r'\s+$', '', text)
    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
    return text
text/korean.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from jamo import h2j, j2hcj
3
+ import ko_pron
4
+
5
+
6
# This is a list of Korean classifiers preceded by pure Korean numerals.
# Restored literals: all hangul/IPA strings in this module were mojibake
# (UTF-8 misdecoded as cp874) in the source.
_korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'

# List of (hangul, hangul divided) pairs:
# Compound jamo are split into the basic jamo that make them up, so the
# downstream symbol set only needs the simple consonants and vowels.
_hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [
    ('ㄳ', 'ㄱㅅ'),
    ('ㄵ', 'ㄴㅈ'),
    ('ㄶ', 'ㄴㅎ'),
    ('ㄺ', 'ㄹㄱ'),
    ('ㄻ', 'ㄹㅁ'),
    ('ㄼ', 'ㄹㅂ'),
    ('ㄽ', 'ㄹㅅ'),
    ('ㄾ', 'ㄹㅌ'),
    ('ㄿ', 'ㄹㅍ'),
    ('ㅀ', 'ㄹㅎ'),
    ('ㅄ', 'ㅂㅅ'),
    ('ㅘ', 'ㅗㅏ'),
    ('ㅙ', 'ㅗㅐ'),
    ('ㅚ', 'ㅗㅣ'),
    ('ㅝ', 'ㅜㅓ'),
    ('ㅞ', 'ㅜㅔ'),
    ('ㅟ', 'ㅜㅣ'),
    ('ㅢ', 'ㅡㅣ'),
    ('ㅑ', 'ㅣㅏ'),
    ('ㅒ', 'ㅣㅐ'),
    ('ㅕ', 'ㅣㅓ'),
    ('ㅖ', 'ㅣㅔ'),
    ('ㅛ', 'ㅣㅗ'),
    ('ㅠ', 'ㅣㅜ')
]]

# List of (Latin alphabet, hangul) pairs:
# Each Latin letter is replaced by the hangul spelling of its letter name.
_latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
    ('a', '에이'),
    ('b', '비'),
    ('c', '시'),
    ('d', '디'),
    ('e', '이'),
    ('f', '에프'),
    ('g', '지'),
    ('h', '에이치'),
    ('i', '아이'),
    ('j', '제이'),
    ('k', '케이'),
    ('l', '엘'),
    ('m', '엠'),
    ('n', '엔'),
    ('o', '오'),
    ('p', '피'),
    ('q', '큐'),
    ('r', '아르'),
    ('s', '에스'),
    ('t', '티'),
    ('u', '유'),
    ('v', '브이'),
    ('w', '더블유'),
    ('x', '엑스'),
    ('y', '와이'),
    ('z', '제트')
]]

# List of (ipa, lazy ipa) pairs:
# Simplifies ko_pron's full IPA output to the reduced ("lazy") symbol set.
_ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
    ('t͡ɕ', 'ʧ'),
    ('d͡ʑ', 'ʥ'),
    ('ɲ', 'n^'),
    ('ɕ', 'ʃ'),
    ('ʷ', 'w'),
    ('ɭ', 'l`'),
    ('ʎ', 'ɾ'),
    ('ɣ', 'ŋ'),
    ('ɰ', 'ɯ'),
    ('ʝ', 'j'),
    ('ʌ', 'ə'),
    ('ɡ', 'g'),
    ('\u031a', '#'),   # combining not-released mark
    ('\u0348', '='),   # combining double vertical line below
    ('\u031e', ''),    # combining down tack below (dropped)
    ('\u0320', ''),    # combining minus sign below (dropped)
    ('\u0339', '')     # combining right half ring below (dropped)
]]
87
+
88
+
89
def latin_to_hangul(text):
    '''Replace each Latin letter with the hangul spelling of its letter name.'''
    for pattern, hangul_name in _latin_to_hangul:
        text = pattern.sub(hangul_name, text)
    return text
93
+
94
+
95
def divide_hangul(text):
    '''Decompose syllables into compatibility jamo, then split compound jamo.'''
    decomposed = j2hcj(h2j(text))
    for pattern, split_jamo in _hangul_divided:
        decomposed = pattern.sub(split_jamo, decomposed)
    return decomposed
100
+
101
+
102
def hangul_number(num, sino=True):
    '''Spell out a digit string in hangul.

    Reference https://github.com/Kyubyong/g2pK
    Restored literals: the hangul in this function was mojibake in the source.

    Args:
      num: string of digits, optionally with thousands commas (e.g. '1,024')
      sino: if True use Sino-Korean numerals (일, 이, ...); otherwise use
        native Korean numerals (한, 두, ...) for the last two digits.
    Returns:
      The number spelled out in hangul.
    '''
    num = re.sub(',', '', num)

    if num == '0':
        return '영'
    if not sino and num == '20':
        return '스무'  # irregular native form used before a classifier

    digits = '123456789'
    names = '일이삼사오육칠팔구'
    digit2name = {d: n for d, n in zip(digits, names)}

    modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉'
    decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'
    digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())}
    digit2dec = {d: dec for d, dec in zip(digits, decimals.split())}

    spelledout = []
    for i, digit in enumerate(num):
        i = len(num) - i - 1  # i is now the place value (0 = ones digit)
        if sino:
            if i == 0:
                name = digit2name.get(digit, '')
            elif i == 1:
                name = digit2name.get(digit, '') + '십'
                name = name.replace('일십', '십')
        else:
            if i == 0:
                name = digit2mod.get(digit, '')
            elif i == 1:
                name = digit2dec.get(digit, '')
        if digit == '0':
            if i % 4 == 0:
                # Group boundary (ones/만/억/조): emit nothing only when the
                # whole preceding group was zeros.
                last_three = spelledout[-min(3, len(spelledout)):]
                if ''.join(last_three) == '':
                    spelledout.append('')
                    continue
            else:
                spelledout.append('')
                continue
        # Place-value markers; '일' is elided before 십/백/천/만 in the low
        # places only, matching the reference implementation.
        if i == 2:
            name = digit2name.get(digit, '') + '백'
            name = name.replace('일백', '백')
        elif i == 3:
            name = digit2name.get(digit, '') + '천'
            name = name.replace('일천', '천')
        elif i == 4:
            name = digit2name.get(digit, '') + '만'
            name = name.replace('일만', '만')
        elif i == 5:
            name = digit2name.get(digit, '') + '십'
            name = name.replace('일십', '십')
        elif i == 6:
            name = digit2name.get(digit, '') + '백'
            name = name.replace('일백', '백')
        elif i == 7:
            name = digit2name.get(digit, '') + '천'
            name = name.replace('일천', '천')
        elif i == 8:
            name = digit2name.get(digit, '') + '억'
        elif i == 9:
            name = digit2name.get(digit, '') + '십'
        elif i == 10:
            name = digit2name.get(digit, '') + '백'
        elif i == 11:
            name = digit2name.get(digit, '') + '천'
        elif i == 12:
            name = digit2name.get(digit, '') + '조'
        elif i == 13:
            name = digit2name.get(digit, '') + '십'
        elif i == 14:
            name = digit2name.get(digit, '') + '백'
        elif i == 15:
            name = digit2name.get(digit, '') + '천'
        spelledout.append(name)
    return ''.join(elem for elem in spelledout)
179
+
180
+
181
def number_to_hangul(text):
    '''Spell out every number in *text* in hangul.

    Numbers followed by a known native-Korean classifier are read with native
    numerals; others with Sino-Korean numerals. Remaining bare digits are read
    digit by digit. Reference https://github.com/Kyubyong/g2pK
    '''
    # \ud7af covers the whole hangul-syllables block (U+AC00–U+D7A3); the
    # original upper bound \ud71f cut it short, missing syllables such as 흥,
    # and disagreed with the range used in korean_to_lazy_ipa below.
    tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud7af]+)', text))
    for token in tokens:
        num, classifier = token
        if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers:
            spelledout = hangul_number(num, sino=False)
        else:
            spelledout = hangul_number(num, sino=True)
        text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}')
    # digit by digit for remaining digits
    digits = '0123456789'
    names = '영일이삼사오육칠팔구'
    for d, n in zip(digits, names):
        text = text.replace(d, n)
    return text
197
+
198
+
199
def korean_to_lazy_ipa(text):
    '''Convert Korean text to a simplified ("lazy") IPA transcription.'''
    text = latin_to_hangul(text)
    text = number_to_hangul(text)
    # ko_pron can return several pronunciations joined by '] ~ ['; keep the first.
    text = re.sub('[\uac00-\ud7af]+',
                  lambda x: ko_pron.romanise(x.group(0), 'ipa').split('] ~ [')[0],
                  text)
    for pattern, replacement in _ipa_to_lazy_ipa:
        text = pattern.sub(replacement, text)
    return text
206
+
207
+
208
def korean_to_ipa(text):
    '''Convert Korean text to IPA, expanding the lazy affricate symbols.

    Restored literals: the affricate characters were mojibake in the source.
    '''
    text = korean_to_lazy_ipa(text)
    return text.replace('ʧ', 'tʃ').replace('ʥ', 'dʑ')
text/symbols.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# korean_cleaners
# Restored literals: the punctuation ellipsis and the jamo were mojibake in
# the source (UTF-8 misdecoded as cp874).
_pad = '_'
_punctuation = ',.!?…~'
# Basic consonants, tense consonants, then vowels; the trailing space is a symbol.
_letters = 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '


# Export all symbols:
symbols = [_pad] + list(_punctuation) + list(_letters)
# Special symbol ids
SPACE_ID = symbols.index(' ')