Spaces:
Runtime error
Runtime error
Commit
ยท
770bfff
1
Parent(s):
35cc217
Upload 5 files
Browse files- text/LICENSE +19 -0
- text/__init__.py +56 -0
- text/cleaners.py +128 -0
- text/korean.py +210 -0
- text/symbols.py +10 -0
text/LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Copyright (c) 2017 Keith Ito
|
2 |
+
|
3 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4 |
+
of this software and associated documentation files (the "Software"), to deal
|
5 |
+
in the Software without restriction, including without limitation the rights
|
6 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7 |
+
copies of the Software, and to permit persons to whom the Software is
|
8 |
+
furnished to do so, subject to the following conditions:
|
9 |
+
|
10 |
+
The above copyright notice and this permission notice shall be included in
|
11 |
+
all copies or substantial portions of the Software.
|
12 |
+
|
13 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19 |
+
THE SOFTWARE.
|
text/__init__.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
""" from https://github.com/keithito/tacotron """
|
2 |
+
from text import cleaners
|
3 |
+
from text.symbols import symbols
|
4 |
+
|
5 |
+
|
6 |
+
# Lookup tables between each symbol and its integer ID (its index in `symbols`):
_id_to_symbol = dict(enumerate(symbols))
_symbol_to_id = {sym: idx for idx, sym in _id_to_symbol.items()}
|
9 |
+
|
10 |
+
|
11 |
+
def text_to_sequence(text, cleaner_names):
    '''Cleans a string and converts it to a list of symbol IDs.

    The text is first passed through every cleaner in `cleaner_names`; each
    character of the result that is a known symbol is mapped to its ID, and
    characters that are not known symbols are dropped.

      Args:
        text: string to convert to a sequence
        cleaner_names: names of the cleaner functions to run the text through
      Returns:
        List of integers corresponding to the symbols in the text
    '''
    cleaned = _clean_text(text, cleaner_names)
    return [_symbol_to_id[ch] for ch in cleaned if ch in _symbol_to_id]
|
28 |
+
|
29 |
+
|
30 |
+
def cleaned_text_to_sequence(cleaned_text):
    '''Converts already-cleaned text to a list of symbol IDs.

    No cleaners are run here. Characters that are not known symbols are dropped.

      Args:
        cleaned_text: string to convert to a sequence
      Returns:
        List of integers corresponding to the symbols in the text
    '''
    sequence = []
    for ch in cleaned_text:
        if ch in _symbol_to_id:
            sequence.append(_symbol_to_id[ch])
    return sequence
|
39 |
+
|
40 |
+
|
41 |
+
def sequence_to_text(sequence):
    '''Converts a sequence of symbol IDs back to a string.

    Raises KeyError if an ID is not present in the ID-to-symbol table.
    '''
    return ''.join(_id_to_symbol[symbol_id] for symbol_id in sequence)
|
48 |
+
|
49 |
+
|
50 |
+
def _clean_text(text, cleaner_names):
    '''Runs `text` through each named cleaner function, in order.

      Args:
        text: string to clean
        cleaner_names: names of functions defined in the `cleaners` module
      Returns:
        The text after every cleaner has been applied
      Raises:
        AttributeError: if a name does not refer to a cleaner. The message is
          'Unknown cleaner: <name>'.
    '''
    for name in cleaner_names:
        # Pass a default so an unknown name reaches the explicit check below
        # instead of failing inside getattr with a generic message.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            # AttributeError is what an unknown name raised before this check
            # became reachable, and it is still an Exception, so existing
            # handlers of either kind keep working.
            raise AttributeError('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
|
text/cleaners.py
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
|
3 |
+
from text.korean import latin_to_hangul, number_to_hangul, divide_hangul, korean_to_lazy_ipa, korean_to_ipa
|
4 |
+
from g2pk2 import G2p
|
5 |
+
from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
|
6 |
+
#from text.sanskrit import devanagari_to_ipa
|
7 |
+
from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
|
8 |
+
#from text.thai import num_to_thai, latin_to_thai
|
9 |
+
#from text.shanghainese import shanghainese_to_ipa
|
10 |
+
#from text.cantonese import cantonese_to_ipa
|
11 |
+
#from text.ngu_dialect import ngu_dialect_to_ipa
|
12 |
+
|
13 |
+
|
14 |
+
def japanese_cleaners(text):
    '''Pipeline for Japanese text.

    Converts the text with `japanese_to_romaji_with_accent`, then appends a
    period if the result ends with an ASCII letter.
    '''
    romaji = japanese_to_romaji_with_accent(text)
    return re.sub(r'([A-Za-z])$', r'\1.', romaji)
|
18 |
+
|
19 |
+
|
20 |
+
def japanese_cleaners2(text):
    '''Variant of `japanese_cleaners` with two extra substitutions.

    After the base pipeline, every 'ts' becomes the single ligature 'ʦ' and
    every three-dot run '...' becomes the single ellipsis character '…'.
    '''
    cleaned = japanese_cleaners(text)
    return cleaned.replace('ts', 'ʦ').replace('...', '…')
|
22 |
+
|
23 |
+
|
24 |
+
def korean_cleaners(text):
    '''Pipeline for Korean text.

    Steps, in order: `latin_to_hangul`, a `G2p` pass, `divide_hangul`, then a
    period is appended if the result ends with a character in U+3131..U+3163.
    '''
    hangul = latin_to_hangul(text)
    # NOTE(review): a fresh G2p object is built on every call.
    pronounced = G2p()(hangul)
    jamo = divide_hangul(pronounced)
    return re.sub(r'([\u3131-\u3163])$', r'\1.', jamo)
|
32 |
+
|
33 |
+
|
34 |
+
def chinese_cleaners(text):
    '''Pipeline for Chinese text.

    Steps, in order: `number_to_chinese`, `chinese_to_bopomofo`,
    `latin_to_bopomofo`. If the result ends with one of the tone marks
    ˉ ˊ ˇ ˋ ˙, an ideographic full stop '。' is appended.
    '''
    text = number_to_chinese(text)
    text = chinese_to_bopomofo(text)
    text = latin_to_bopomofo(text)
    return re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text)
|
41 |
+
|
42 |
+
|
43 |
+
def zh_ja_mixture_cleaners(text):
    '''Pipeline for text mixing Chinese and Japanese spans.

    Spans wrapped as [ZH]...[ZH] go through `chinese_to_romaji`; spans wrapped
    as [JA]...[JA] go through `japanese_to_romaji_with_accent` followed by the
    substitutions 'ts' -> 'ʦ', 'u' -> 'ɯ' and '...' -> '…'. Each converted span
    is followed by one space. Trailing whitespace is then removed and a period
    is appended unless the text already ends with one of . , ! ? - … ~
    '''
    def _japanese(match):
        romaji = japanese_to_romaji_with_accent(match.group(1))
        return romaji.replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')+' '

    text = re.sub(r'\[ZH\](.*?)\[ZH\]',
                  lambda x: chinese_to_romaji(x.group(1))+' ', text)
    text = re.sub(r'\[JA\](.*?)\[JA\]', _japanese, text)
    text = re.sub(r'\s+$', '', text)
    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
    return text
|
51 |
+
|
52 |
+
|
53 |
+
def sanskrit_cleaners(text):
    '''Pipeline for Sanskrit (Devanagari) text.

    Replaces the double danda '॥' with a single danda '।' and the sign 'ॐ'
    with the spelled-out 'ओम्', then appends a danda if the text does not
    already end with one.
    '''
    text = text.replace('॥', '।').replace('ॐ', 'ओम्')
    return re.sub(r'([^।])$', r'\1।', text)
|
57 |
+
|
58 |
+
|
59 |
+
def cjks_cleaners(text):
    '''Pipeline for text mixing Chinese, Japanese, Korean, Sanskrit and English.

    Each span wrapped in a pair of identical language tags ([ZH], [JA], [KO],
    [SA], [EN]) is replaced by the output of that language's converter followed
    by one space. Trailing whitespace is then removed and a period is appended
    unless the text already ends with one of . , ! ? - … ~
    '''
    def _sanskrit(match):
        # Imported on first use: the module-level import of text.sanskrit is
        # commented out in this file, so the bare name would raise NameError.
        from text.sanskrit import devanagari_to_ipa
        return devanagari_to_ipa(match.group(1))+' '

    text = re.sub(r'\[ZH\](.*?)\[ZH\]',
                  lambda x: chinese_to_lazy_ipa(x.group(1))+' ', text)
    text = re.sub(r'\[JA\](.*?)\[JA\]',
                  lambda x: japanese_to_ipa(x.group(1))+' ', text)
    text = re.sub(r'\[KO\](.*?)\[KO\]',
                  lambda x: korean_to_lazy_ipa(x.group(1))+' ', text)
    text = re.sub(r'\[SA\](.*?)\[SA\]', _sanskrit, text)
    text = re.sub(r'\[EN\](.*?)\[EN\]',
                  lambda x: english_to_lazy_ipa(x.group(1))+' ', text)
    text = re.sub(r'\s+$', '', text)
    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
    return text
|
73 |
+
|
74 |
+
def cjke_cleaners(text):
    '''Pipeline for text mixing Chinese, Japanese, Korean and English.

    Each span wrapped in a pair of identical language tags ([ZH], [JA], [KO],
    [EN]) is converted to IPA by that language's converter, some symbols are
    then rewritten (see the inline substitutions), and one space is appended.
    Trailing whitespace is then removed and a period is appended unless the
    text already ends with one of . , ! ? - … ~
    '''
    def _chinese(match):
        ipa = chinese_to_lazy_ipa(match.group(1))
        return ipa.replace('ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')+' '

    def _japanese(match):
        ipa = japanese_to_ipa(match.group(1))
        return ipa.replace('ʧ', 'tʃ').replace('ʦ', 'ts').replace(
            'ɥan', 'ɥæn').replace('ʥ', 'dz')+' '

    def _english(match):
        ipa = english_to_ipa2(match.group(1))
        return ipa.replace('ɑ', 'a').replace('ɔ', 'o').replace(
            'ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')+' '

    text = re.sub(r'\[ZH\](.*?)\[ZH\]', _chinese, text)
    text = re.sub(r'\[JA\](.*?)\[JA\]', _japanese, text)
    text = re.sub(r'\[KO\](.*?)\[KO\]',
                  lambda x: korean_to_ipa(x.group(1))+' ', text)
    text = re.sub(r'\[EN\](.*?)\[EN\]', _english, text)
    text = re.sub(r'\s+$', '', text)
    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
    return text
|
86 |
+
|
87 |
+
def cjke_cleaners2(text):
    '''Pipeline for text mixing Chinese, Japanese, Korean and English.

    Each span wrapped in a pair of identical language tags is replaced by the
    output of `chinese_to_ipa` ([ZH]), `japanese_to_ipa2` ([JA]),
    `korean_to_ipa` ([KO]) or `english_to_ipa2` ([EN]) followed by one space.
    Trailing whitespace is then removed and a period is appended unless the
    text already ends with one of . , ! ? - … ~
    '''
    text = re.sub(r'\[ZH\](.*?)\[ZH\]',
                  lambda x: chinese_to_ipa(x.group(1))+' ', text)
    text = re.sub(r'\[JA\](.*?)\[JA\]',
                  lambda x: japanese_to_ipa2(x.group(1))+' ', text)
    text = re.sub(r'\[KO\](.*?)\[KO\]',
                  lambda x: korean_to_ipa(x.group(1))+' ', text)
    text = re.sub(r'\[EN\](.*?)\[EN\]',
                  lambda x: english_to_ipa2(x.group(1))+' ', text)
    text = re.sub(r'\s+$', '', text)
    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
    return text
|
99 |
+
|
100 |
+
|
101 |
+
def thai_cleaners(text):
    '''Pipeline for Thai text: `num_to_thai` followed by `latin_to_thai`.

    Raises ImportError if the `text.thai` module is not available.
    '''
    # Imported on first use: the module-level import of text.thai is commented
    # out in this file, so the bare names would raise NameError.
    from text.thai import num_to_thai, latin_to_thai
    return latin_to_thai(num_to_thai(text))
|
105 |
+
|
106 |
+
|
107 |
+
def shanghainese_cleaners(text):
    '''Pipeline for Shanghainese text.

    Converts the text with `shanghainese_to_ipa`, then appends a period unless
    the result already ends with one of . , ! ? - … ~

    Raises ImportError if the `text.shanghainese` module is not available.
    '''
    # Imported on first use: the module-level import of text.shanghainese is
    # commented out in this file, so the bare name would raise NameError.
    from text.shanghainese import shanghainese_to_ipa
    ipa = shanghainese_to_ipa(text)
    return re.sub(r'([^\.,!\?\-…~])$', r'\1.', ipa)
|
111 |
+
|
112 |
+
|
113 |
+
def chinese_dialect_cleaners(text):
    '''Pipeline for text mixing Mandarin, Japanese, Chinese dialects and English.

    Spans wrapped in a pair of identical tags are converted in this order:
    [ZH], [JA], [SH], [GD], [EN], and finally any remaining two-capital-letter
    tag, which is passed to `ngu_dialect_to_ipa` together with the tag name.
    Each converted span is followed by one space. Trailing whitespace is then
    removed and a period is appended unless the text already ends with one of
    . , ! ? - … ~
    '''
    # The three helpers below import their converter on first use: the
    # module-level imports of those modules are commented out in this file, so
    # the bare names would raise NameError. Importing inside the callback keeps
    # the function usable for input that contains none of these tags.
    def _shanghainese(match):
        from text.shanghainese import shanghainese_to_ipa
        ipa = shanghainese_to_ipa(match.group(1))
        # Replace tone digits with tone-letter sequences and small capitals
        # with plain IPA vowels.
        return ipa.replace('1', '˥˧').replace('5', '˧˧˦').replace(
            '6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace(
            'ᴀ', 'ɐ').replace('ᴇ', 'e')+' '

    def _cantonese(match):
        from text.cantonese import cantonese_to_ipa
        return cantonese_to_ipa(match.group(1))+' '

    def _ngu_dialect(match):
        from text.ngu_dialect import ngu_dialect_to_ipa
        ipa = ngu_dialect_to_ipa(match.group(2), match.group(1))
        return ipa.replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace(
            'ʦ', 'ts').replace('ʨ', 'tɕ')+' '

    text = re.sub(r'\[ZH\](.*?)\[ZH\]',
                  lambda x: chinese_to_ipa2(x.group(1))+' ', text)
    text = re.sub(r'\[JA\](.*?)\[JA\]',
                  lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text)
    text = re.sub(r'\[SH\](.*?)\[SH\]', _shanghainese, text)
    text = re.sub(r'\[GD\](.*?)\[GD\]', _cantonese, text)
    text = re.sub(r'\[EN\](.*?)\[EN\]',
                  lambda x: english_to_lazy_ipa2(x.group(1))+' ', text)
    text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', _ngu_dialect, text)
    text = re.sub(r'\s+$', '', text)
    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
    return text
|
text/korean.py
ADDED
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from jamo import h2j, j2hcj
|
3 |
+
import ko_pron
|
4 |
+
|
5 |
+
|
6 |
+
# This is a list of Korean classifiers preceded by pure Korean numerals.
_korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'

# List of (hangul, hangul divided) pairs: each compound jamo (consonant
# cluster or diphthong) is rewritten as the two simple jamo it is made of.
_hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [
    ('ㄳ', 'ㄱㅅ'),
    ('ㄵ', 'ㄴㅈ'),
    ('ㄶ', 'ㄴㅎ'),
    ('ㄺ', 'ㄹㄱ'),
    ('ㄻ', 'ㄹㅁ'),
    ('ㄼ', 'ㄹㅂ'),
    ('ㄽ', 'ㄹㅅ'),
    ('ㄾ', 'ㄹㅌ'),
    ('ㄿ', 'ㄹㅍ'),
    ('ㅀ', 'ㄹㅎ'),
    ('ㅄ', 'ㅂㅅ'),
    ('ㅘ', 'ㅗㅏ'),
    ('ㅙ', 'ㅗㅐ'),
    ('ㅚ', 'ㅗㅣ'),
    ('ㅝ', 'ㅜㅓ'),
    ('ㅞ', 'ㅜㅔ'),
    ('ㅟ', 'ㅜㅣ'),
    ('ㅢ', 'ㅡㅣ'),
    ('ㅑ', 'ㅣㅏ'),
    ('ㅒ', 'ㅣㅐ'),
    ('ㅕ', 'ㅣㅓ'),
    ('ㅖ', 'ㅣㅔ'),
    ('ㅛ', 'ㅣㅗ'),
    ('ㅠ', 'ㅣㅜ')
]]

# List of (Latin alphabet, hangul) pairs: each letter, matched
# case-insensitively, is replaced by the Hangul spelling of its name.
_latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
    ('a', '에이'),
    ('b', '비'),
    ('c', '시'),
    ('d', '디'),
    ('e', '이'),
    ('f', '에프'),
    ('g', '지'),
    ('h', '에이치'),
    ('i', '아이'),
    ('j', '제이'),
    ('k', '케이'),
    ('l', '엘'),
    ('m', '엠'),
    ('n', '엔'),
    ('o', '오'),
    ('p', '피'),
    ('q', '큐'),
    ('r', '아르'),
    ('s', '에스'),
    ('t', '티'),
    ('u', '유'),
    ('v', '브이'),
    ('w', '더블유'),
    ('x', '엑스'),
    ('y', '와이'),
    ('z', '제트')
]]

# List of (ipa, lazy ipa) pairs. The last five entries act on combining
# diacritics: two are rewritten to ASCII markers and three are removed.
_ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
    ('t͡ɕ','ʧ'),
    ('d͡ʑ','ʥ'),
    ('ɲ','n^'),
    ('ɕ','ʃ'),
    ('ʷ','w'),
    ('ɭ','l`'),
    ('ʎ','ɾ'),
    ('ɣ','ŋ'),
    ('ɰ','ɯ'),
    ('ʝ','j'),
    ('ʌ','ə'),
    ('ɡ','g'),
    ('\u031a','#'),
    ('\u0348','='),
    ('\u031e',''),
    ('\u0320',''),
    ('\u0339','')
]]
|
87 |
+
|
88 |
+
|
89 |
+
def latin_to_hangul(text):
    '''Applies every (pattern, replacement) pair of `_latin_to_hangul` to `text`, in table order.'''
    for pattern, hangul in _latin_to_hangul:
        text = pattern.sub(hangul, text)
    return text
|
93 |
+
|
94 |
+
|
95 |
+
def divide_hangul(text):
    '''Passes `text` through `h2j` then `j2hcj`, then applies every
    (pattern, replacement) pair of `_hangul_divided`, in table order.
    '''
    jamo = h2j(text)
    text = j2hcj(jamo)
    for pattern, divided in _hangul_divided:
        text = pattern.sub(divided, text)
    return text
|
100 |
+
|
101 |
+
|
102 |
+
def hangul_number(num, sino=True):
    '''Spells out a string of decimal digits in Hangul.

    Reference https://github.com/Kyubyong/g2pK

      Args:
        num: string of digits; commas are removed first (e.g. '1,234')
        sino: selects the numerals used for the ones and tens places. True
          uses the Sino-Korean names (일, 이, ... / 십); False uses the native
          Korean modifier forms (한, 두, ...) and tens (열, 스물, ...). Places
          from the hundreds upward always use the Sino-Korean names.
      Returns:
        The Hangul spelling of the number.

    NOTE: only decimal places 0..15 (up to 16 digits) have a spelling here. A
    non-zero digit in a higher place fails with UnboundLocalError.
    '''
    num = re.sub(',', '', num)

    if num == '0':
        return '영'
    if not sino and num == '20':
        return '스무'

    digits = '123456789'
    digit2name = dict(zip(digits, '일이삼사오육칠팔구'))
    digit2mod = dict(zip(digits, '한 두 세 네 다섯 여섯 일곱 여덟 아홉'.split()))
    digit2dec = dict(zip(digits, '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'.split()))
    # Unit suffix for decimal places 2..15 (index = place - 2):
    # 백 천 | 만 십 백 천 | 억 십 백 천 | 조 십 백 천
    units = '백천만십백천억십백천조십백천'

    spelledout = []
    for pos, digit in enumerate(num):
        place = len(num) - pos - 1  # 0 = ones, 1 = tens, ...
        if place == 0:
            name = (digit2name if sino else digit2mod).get(digit, '')
        elif place == 1:
            if sino:
                # A leading "one" is dropped: 10 is 십, not 일십.
                name = (digit2name.get(digit, '') + '십').replace('일십', '십')
            else:
                name = digit2dec.get(digit, '')
        if digit == '0':
            # A zero is silent, except at a four-digit group boundary where
            # the group unit is still emitted if any of the three previously
            # spelled places was non-empty.
            if place % 4 != 0 or ''.join(spelledout[-3:]) == '':
                spelledout.append('')
                continue
        if 2 <= place <= 15:
            unit = units[place - 2]
            name = digit2name.get(digit, '') + unit
            if place <= 7:
                # A leading "one" is dropped only for places 2..7
                # (백, 천, 만, 십, 백, 천); from 억 upward it is kept.
                name = name.replace('일' + unit, unit)
        spelledout.append(name)
    return ''.join(spelledout)
|
179 |
+
|
180 |
+
|
181 |
+
def number_to_hangul(text):
    '''Replaces the digits in `text` with Hangul.

    Reference https://github.com/Kyubyong/g2pK

    A run of digits (commas allowed) immediately followed by Hangul syllables
    is spelled out as a whole number with `hangul_number`. Native Korean
    numerals are used when the first two characters, or the first character,
    of the following Hangul occur in `_korean_classifiers`; Sino-Korean
    numerals are used otherwise. Any digit left over is replaced one by one.

      Args:
        text: string that may contain digits
      Returns:
        The text with every ASCII digit replaced by Hangul
    '''
    tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text))
    for num, classifier in tokens:
        # Substring tests against the space-separated classifier string.
        use_native = (classifier[:2] in _korean_classifiers
                      or classifier[0] in _korean_classifiers)
        spelledout = hangul_number(num, sino=not use_native)
        text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}')
    # digit by digit for remaining digits
    for d, n in zip('0123456789', '영일이삼사오육칠팔구'):
        text = text.replace(d, n)
    return text
|
197 |
+
|
198 |
+
|
199 |
+
def korean_to_lazy_ipa(text):
    '''Converts Korean text to the "lazy IPA" notation of `_ipa_to_lazy_ipa`.

    Steps, in order: `latin_to_hangul`, `number_to_hangul`, romanisation of
    every run of Hangul syllables with `ko_pron.romanise(..., 'ipa')`, then
    every (pattern, replacement) pair of `_ipa_to_lazy_ipa` in table order.
    '''
    def _romanise(match):
        # Keep only the part before the first '] ~ [' separator.
        return ko_pron.romanise(match.group(0), 'ipa').split('] ~ [')[0]

    hangul = number_to_hangul(latin_to_hangul(text))
    ipa = re.sub('[\uac00-\ud7af]+', _romanise, hangul)
    for pattern, lazy in _ipa_to_lazy_ipa:
        ipa = pattern.sub(lazy, ipa)
    return ipa
|
206 |
+
|
207 |
+
|
208 |
+
def korean_to_ipa(text):
    '''Converts Korean text with `korean_to_lazy_ipa`, then expands the two
    affricate ligatures: 'ʧ' becomes 'tʃ' and 'ʥ' becomes 'dʑ'.
    '''
    lazy_ipa = korean_to_lazy_ipa(text)
    return lazy_ipa.replace('ʧ','tʃ').replace('ʥ','dʑ')
|
text/symbols.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# korean_cleaners
_pad = '_'
_punctuation = ',.!?…~'
# Hangul compatibility jamo, followed by a trailing space. The space must be
# part of the symbol set because SPACE_ID below looks it up.
_letters = 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '


# Export all symbols:
symbols = [_pad] + list(_punctuation) + list(_letters)
# Special symbol ids
SPACE_ID = symbols.index(' ')
|