Spaces:
Running
Running
# Referenced from https://github.com/hccho2/Tacotron-Wavenet-Vocoder-Korean
import re | |
class KoreanCleaner:
    """Rewrite ASCII digits and Latin letters as their Korean (Hangul) readings.

    The cleaner works strictly character by character:

    * each digit ``0``-``9`` becomes its Sino-Korean reading, so ``"123"``
      turns into ``"일이삼"`` (no place-value words are produced);
    * each ASCII letter, upper or lower case, becomes the Korean name of
      that letter, so ``"tts"`` turns into ``"티티에스"``.

    Every other character is passed through unchanged.  All methods are
    classmethods and can be called on the class or on an instance.
    """

    # NOTE: the values below are Hangul literals; this file must stay UTF-8.
    # Digit -> Sino-Korean reading of that single digit.
    _NUMBER_TO_KOR = {
        "0": "영",
        "1": "일",
        "2": "이",
        "3": "삼",
        "4": "사",
        "5": "오",
        "6": "육",
        "7": "칠",
        "8": "팔",
        "9": "구",
    }

    # Upper-case ASCII letter -> Korean name of the letter.
    _UPPER_ALPHABET_TO_KOR = {
        "A": "에이",
        "B": "비",
        "C": "씨",
        "D": "디",
        "E": "이",
        "F": "에프",
        "G": "지",
        "H": "에이치",
        "I": "아이",
        "J": "제이",
        "K": "케이",
        "L": "엘",
        "M": "엠",
        "N": "엔",
        "O": "오",
        "P": "피",
        "Q": "큐",
        "R": "알",
        "S": "에스",
        "T": "티",
        "U": "유",
        "V": "브이",
        "W": "더블유",
        "X": "엑스",
        "Y": "와이",
        "Z": "지",
    }

    # Translation tables are built once at class creation so each call is a
    # single pass over the text instead of rebuilding a dict per invocation.
    _NUMBER_TABLE = str.maketrans(_NUMBER_TO_KOR)
    # Lower-case ASCII letters share the reading of their upper-case form,
    # which is equivalent to upper-casing every [a-z] run before the lookup.
    _ALPHABET_TABLE = str.maketrans(
        {
            letter: reading
            for upper, reading in _UPPER_ALPHABET_TO_KOR.items()
            for letter in (upper, upper.lower())
        }
    )

    @classmethod
    def _normalize_numbers(cls, text):
        """Return *text* with every ASCII digit replaced by its Korean reading.

        Digits are converted one at a time; all other characters are kept.
        """
        return text.translate(cls._NUMBER_TABLE)

    @classmethod
    def _normalize_english_text(cls, text):
        """Return *text* with every ASCII letter replaced by its Korean name.

        Upper- and lower-case letters map to the same reading; characters
        outside ``A-Z`` / ``a-z`` are kept as they are.
        """
        return text.translate(cls._ALPHABET_TABLE)

    @classmethod
    def normalize_text(cls, text):
        """Strip *text*, then spell out its digits and ASCII letters in Korean.

        Args:
            text: The input string.

        Returns:
            The stripped string with digits and ASCII letters replaced by
            their Korean readings.
        """
        # stage 0 : text strip
        text = text.strip()
        # stage 1 : normalize numbers
        text = cls._normalize_numbers(text)
        # stage 2 : normalize english text
        text = cls._normalize_english_text(text)
        return text