File size: 2,008 Bytes
67c46fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# Referenced from https://github.com/hccho2/Tacotron-Wavenet-Vocoder-Korean

import re


class KoreanCleaner:
    """Spell out ASCII digits and ASCII Latin letters in Hangul.

    All methods are classmethods; the class only serves as a namespace.
    Replacement is strictly character by character: every digit is read
    on its own (``"12"`` becomes ``"일이"``) and every letter is replaced by
    the Korean name of that letter (``"ok"`` becomes ``"오케이"``).
    Characters that are neither ASCII digits nor ASCII letters are left
    untouched.
    """

    # Korean reading of each ASCII digit.
    _NUMBER_TO_KOR = {
        "0": "영",
        "1": "일",
        "2": "이",
        "3": "삼",
        "4": "사",
        "5": "오",
        "6": "육",
        "7": "칠",
        "8": "팔",
        "9": "구",
    }

    # Korean name of each Latin letter, keyed by its upper-case form.
    # "G" and "Z" intentionally share the same reading, as in the table
    # this code was derived from.
    _UPPER_ALPHABET_TO_KOR = {
        "A": "에이",
        "B": "비",
        "C": "씨",
        "D": "디",
        "E": "이",
        "F": "에프",
        "G": "지",
        "H": "에이치",
        "I": "아이",
        "J": "제이",
        "K": "케이",
        "L": "엘",
        "M": "엠",
        "N": "엔",
        "O": "오",
        "P": "피",
        "Q": "큐",
        "R": "알",
        "S": "에스",
        "T": "티",
        "U": "유",
        "V": "브이",
        "W": "더블유",
        "X": "엑스",
        "Y": "와이",
        "Z": "지",
    }

    # Translation tables are built once at class creation instead of on
    # every call.
    _NUMBER_TABLE = str.maketrans(_NUMBER_TO_KOR)
    # Only ASCII a-z are treated as lower-case variants; other cased
    # characters (e.g. "ß", "é") must pass through unchanged, so the
    # lower-case keys are added explicitly rather than calling str.upper()
    # on the whole text.
    _ALPHABET_TABLE = str.maketrans(
        {
            **_UPPER_ALPHABET_TO_KOR,
            **{
                letter.lower(): reading
                for letter, reading in _UPPER_ALPHABET_TO_KOR.items()
            },
        }
    )

    @classmethod
    def _normalize_numbers(cls, text):
        """Return *text* with every ASCII digit replaced by its Korean reading.

        Digits are converted one at a time; no positional (tens, hundreds)
        reading is attempted.
        """
        return text.translate(cls._NUMBER_TABLE)

    @classmethod
    def _normalize_english_text(cls, text):
        """Return *text* with every ASCII letter replaced by its Korean name.

        Upper- and lower-case ASCII letters map to the same reading. All
        other characters are returned unchanged.
        """
        return text.translate(cls._ALPHABET_TABLE)

    @classmethod
    def normalize_text(cls, text):
        """Strip *text*, then spell out its digits and ASCII letters in Hangul.

        Args:
            text: The string to normalize.

        Returns:
            The normalized string.
        """
        # stage 0 : strip surrounding whitespace
        text = text.strip()

        # stage 1 : normalize numbers
        text = cls._normalize_numbers(text)

        # stage 2 : normalize english text
        text = cls._normalize_english_text(text)
        return text