Spaces:
Running
Running
File size: 2,008 Bytes
67c46fd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
# Referenced from https://github.com/hccho2/Tacotron-Wavenet-Vocoder-Korean
import re
class KoreanCleaner:
    """Rewrite ASCII digits and Latin letters in text as Hangul readings.

    Every ASCII digit ``0``-``9`` is replaced by its Korean reading and
    every ASCII letter (upper or lower case) by the Korean name of that
    letter. Replacement is strictly per character: ``"12"`` becomes the
    readings of ``1`` and ``2`` in sequence, not the reading of "twelve".
    All other characters pass through unchanged.
    """

    # NOTE(review): the Hangul values in both tables were restored from
    # mojibake (UTF-8 bytes decoded as ISO-8859-7 with control bytes
    # dropped). The garbled form was lossy -- e.g. "1" and "3" shared one
    # value, as did "A", "I" and "Y" -- so distinct inputs collapsed to the
    # same output. Each restored value was checked byte-wise against the
    # garbled one; confirm against the upstream repository if in doubt.
    _NUMBER_TO_KOR = {
        "0": "영",
        "1": "일",
        "2": "이",
        "3": "삼",
        "4": "사",
        "5": "오",
        "6": "육",
        "7": "칠",
        "8": "팔",
        "9": "구",
    }

    _UPPER_ALPHABET_TO_KOR = {
        "A": "에이",
        "B": "비",
        "C": "씨",
        "D": "디",
        "E": "이",
        "F": "에프",
        "G": "지",
        "H": "에이치",
        "I": "아이",
        "J": "제이",
        "K": "케이",
        "L": "엘",
        "M": "엠",
        "N": "엔",
        "O": "오",
        "P": "피",
        "Q": "큐",
        "R": "알",
        "S": "에스",
        "T": "티",
        "U": "유",
        "V": "브이",
        "W": "더블유",
        "X": "엑스",
        "Y": "와이",
        "Z": "지",
    }

    # Translation tables are built once at class creation instead of
    # rebuilding the dicts on every call.
    _NUMBER_TABLE = str.maketrans(_NUMBER_TO_KOR)

    # Lower-case ASCII letters share the reading of their upper-case form,
    # which is equivalent to upper-casing [a-z] first and then mapping A-Z.
    _ALPHABET_TABLE = str.maketrans(
        {
            **_UPPER_ALPHABET_TO_KOR,
            **{
                letter.lower(): reading
                for letter, reading in _UPPER_ALPHABET_TO_KOR.items()
            },
        }
    )

    @classmethod
    def _normalize_numbers(cls, text: str) -> str:
        """Return *text* with each ASCII digit replaced by its Korean reading.

        Digits are handled one at a time; non-digit characters are kept.
        """
        return text.translate(cls._NUMBER_TABLE)

    @classmethod
    def _normalize_english_text(cls, text: str) -> str:
        """Return *text* with each ASCII letter replaced by its Korean name.

        Only ``a``-``z`` and ``A``-``Z`` are affected; accented or other
        non-ASCII letters are left untouched.
        """
        return text.translate(cls._ALPHABET_TABLE)

    @classmethod
    def normalize_text(cls, text: str) -> str:
        """Strip *text*, then spell out its digits and ASCII letters in Hangul.

        Args:
            text: Input string to normalize.

        Returns:
            The stripped string with ASCII digits and letters replaced by
            their Korean readings.
        """
        # stage 0 : text strip
        text = text.strip()
        # stage 1 : normalize numbers
        text = cls._normalize_numbers(text)
        # stage 2 : normalize english text
        text = cls._normalize_english_text(text)
        return text
|