# Step-Audio / funasr_detach / tokenizer / korean_cleaner.py
# (file-viewer residue: martin · initial · 67c46fd · raw · history · blame · 2.01 kB)
# Referenced from https://github.com/hccho2/Tacotron-Wavenet-Vocoder-Korean
import re
class KoreanCleaner:
    """Spell out ASCII digits and Latin letters in Hangul.

    All methods are classmethods; the class holds no instance state.
    Digits are read one at a time ("12" becomes "일이", not "십이") and
    Latin letters are replaced by their Korean letter names. Every other
    character is passed through unchanged.
    """

    # Reading used for each ASCII digit.
    _NUMBER_TO_KOR = {
        "0": "영",
        "1": "일",
        "2": "이",
        "3": "삼",
        "4": "사",
        "5": "오",
        "6": "육",
        "7": "칠",
        "8": "팔",
        "9": "구",
    }

    # Korean letter name for each uppercase ASCII letter.
    # NOTE: "Z" shares the reading "지" with "G"; kept as in the original table.
    _UPPER_ALPHABET_TO_KOR = {
        "A": "에이",
        "B": "비",
        "C": "씨",
        "D": "디",
        "E": "이",
        "F": "에프",
        "G": "지",
        "H": "에이치",
        "I": "아이",
        "J": "제이",
        "K": "케이",
        "L": "엘",
        "M": "엠",
        "N": "엔",
        "O": "오",
        "P": "피",
        "Q": "큐",
        "R": "알",
        "S": "에스",
        "T": "티",
        "U": "유",
        "V": "브이",
        "W": "더블유",
        "X": "엑스",
        "Y": "와이",
        "Z": "지",
    }

    # Translation tables are built once instead of on every call.
    _NUMBER_TABLE = str.maketrans(_NUMBER_TO_KOR)
    _ALPHABET_TABLE = str.maketrans(_UPPER_ALPHABET_TO_KOR)
    # Lowercase ASCII letters get the same reading as their uppercase form.
    # Only a-z is folded, so non-ASCII lowercase letters are left untouched.
    _ALPHABET_TABLE.update(
        (ord(letter.lower()), reading)
        for letter, reading in _UPPER_ALPHABET_TO_KOR.items()
    )

    @classmethod
    def _normalize_numbers(cls, text: str) -> str:
        """Replace every ASCII digit in *text* with its Hangul reading.

        Args:
            text: Input string.

        Returns:
            A copy of *text* where each of "0"-"9" is replaced character by
            character; all other characters are unchanged.
        """
        return text.translate(cls._NUMBER_TABLE)

    @classmethod
    def _normalize_english_text(cls, text: str) -> str:
        """Replace every ASCII letter in *text* with its Korean letter name.

        Upper- and lowercase ASCII letters map to the same reading.

        Args:
            text: Input string.

        Returns:
            A copy of *text* where each of "A"-"Z" / "a"-"z" is replaced by
            its Hangul spelling; all other characters are unchanged.
        """
        return text.translate(cls._ALPHABET_TABLE)

    @classmethod
    def normalize_text(cls, text: str) -> str:
        """Strip *text* and spell out its digits and ASCII letters in Hangul.

        Args:
            text: Input string.

        Returns:
            The normalized string.
        """
        # stage 0 : text strip
        text = text.strip()
        # stage 1 : normalize numbers
        text = cls._normalize_numbers(text)
        # stage 2 : normalize english text
        text = cls._normalize_english_text(text)
        return text