File size: 7,112 Bytes
9d61c9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import unittest

import numpy as np

from training.preprocess.normalize_text import NormalizeText


# Unit tests for the NormalizeText text preprocessor
class TestTextPreprocess(unittest.TestCase):
    """Exercise ``NormalizeText``: byte encoding, character clean-up and full normalization.

    Related input/expected pairs are grouped into tables and run under
    ``subTest`` so that one failing pair does not hide the pairs after it.
    """

    def setUp(self) -> None:
        """Create a fresh normalizer for every test."""
        # Seed NumPy's global RNG before each test. Nothing in this class draws
        # random numbers itself; presumably kept for reproducibility of the
        # code under test -- TODO confirm it is still needed.
        np.random.seed(0)
        self.normalizer = NormalizeText()

    def test_byte_encode(self) -> None:
        """``byte_encode`` should return the UTF-8 byte values of the word as a list."""
        cases = [
            # A simple ASCII word
            ("hello", [104, 101, 108, 108, 111]),
            # A word containing a non-ASCII character (two bytes for "é")
            ("héllo", [104, 195, 169, 108, 108, 111]),
            # An empty string
            ("", []),
        ]
        for word, expected_output in cases:
            with self.subTest(word=word):
                # assertEqual (rather than assertTrue(a == b)) reports both
                # values when the comparison fails.
                self.assertEqual(self.normalizer.byte_encode(word), expected_output)

    def test_normalize_chars(self) -> None:
        """``normalize_chars`` should map typographic characters and collapse repeats."""
        cases = [
            # Basic character normalization
            ("It’s a beautiful day…", "It's a beautiful day."),
            # Multiple dots
            ("Hello..... world!!!!", "Hello. world!"),
            # Multiple exclamation marks
            ("Wow!!!!! This is amazing?????", "Wow! This is amazing?"),
            # Multiple question marks
            ("What????? I don't understand!????", "What? I don't understand!?"),
            # Typographic quotes
            ("He said, “I don’t know…”", "He said, 'I don't know.'"),
            # Multiple dashes
            ("This is a long--sentence", "This is a long-sentence"),
            # Mixed characters
            (
                "It’s a beautiful day… What????? I don't understand!!!!!",
                "It's a beautiful day. What? I don't understand!",
            ),
        ]
        for input_text, expected_output in cases:
            with self.subTest(input_text=input_text):
                self.assertEqual(
                    self.normalizer.normalize_chars(input_text),
                    expected_output,
                )

    def test_normalize(self) -> None:
        """Calling the normalizer should fully normalize text (punctuation, numbers, abbreviations, money, time)."""
        cases = [
            (
                "basic text normalization",
                r"""It’s a beautiful day… Hello..... World!!!! Wow!!!!! This is amazing????? He said, “I don’t know…”. It’s a beautiful day… What????? I don't understand!!!!!""",
                r"""It's a beautiful day. Hello. World! Wow! This is amazing? He said, 'I don't know.'. It's a beautiful day. What? I don't understand!""",
            ),
            (
                "multiple dots",
                "Hello..... World!!!!",
                "Hello. World!",
            ),
            (
                "numbers",
                "1234567890",
                "one two three four five six seven eight nine zero",
            ),
            (
                "complicated case",
                "Mr. Smith paid $111 in U.S.A. on Dec. 17th. We paid $123 for this desk.",
                r"""mister Smith paid one hundred and eleven dollars in USA on december seventeenth. We paid one hundred and twenty three dollars for this desk.""",
            ),
            (
                "complicated case 2",
                "St. Patrick’s Day, spend $123 for this desk.",
                r"""Saint Patrick's Day, spend one hundred and twenty three dollars for this desk.""",
            ),
            (
                # Regression check ("Dunky bug"): quoted, already spelled-out
                # words must pass through unchanged.
                "check Dunky bug",
                "For example it normalizes 'medic' into 'm e d i c' or 'yeah' into 'y e a h'.",
                r"""For example it normalizes 'medic' into 'm e d i c' or 'yeah' into 'y e a h'.""",
            ),
            (
                "time, currency, line-break",
                "The alarm went off at 10:00a.m. \nI received $123. It's 12:30pm. I paid $123.45 for this desk.",
                r"""The alarm went off at ten AM. I received one hundred and twenty three dollars. It's twelve thirty PM. I paid one hundred and twenty three dollars forty five cents for this desk.""",
            ),
        ]
        for label, input_text, expected_output in cases:
            with self.subTest(case=label):
                self.assertEqual(self.normalizer(input_text), expected_output)

    def test_normalize2(self) -> None:
        """Multi-line quoted text: each line break is expected to become '. ' and curly quotes become plain apostrophes."""
        # The indentation inside the raw string is part of the input on purpose.
        input_text = r"""The Wizard of Oz: “Lions? And Tigers? And Bears?”
        Toy Story: “Buzz, you’re flying!”
        As the snake shook its head, a deafening shout behind Harry made both of them jump. ‘DUDLEY! MR DURSLEY! COME AND LOOK AT THIS SNAKE! YOU WON’T BELIEVE WHAT IT’S DOING!’."""

        expected_output = r"The Wizard of Oz: 'Lions? And Tigers? And Bears?'. Toy Story: 'Buzz, you're flying!'. As the snake shook its head, a deafening shout behind Harry made both of them jump. 'DUDLEY! MR DURSLEY! COME AND LOOK AT THIS SNAKE! YOU WON'T BELIEVE WHAT IT'S DOING!'."

        self.assertEqual(self.normalizer(input_text), expected_output)

    # NOTE: "normilize" is a typo for "normalize"; the name is kept so that any
    # existing selection of this test by id keeps working.
    def test_normilize_numbers(self) -> None:
        """A bare four-digit number is expected to be verbalized as two pairs."""
        input_text = "1234"
        result = self.normalizer(input_text)
        expected_output = "twelve thirty four"
        self.assertEqual(result, expected_output)

    def test_punctuation(self) -> None:
        """Dashes, double quotes and repeated punctuation should normalize to the same output."""
        expected_output = r"""Hello, World! How are you?. Victor - why did you do that?. As the old saying goes, 'The early bird catches the worm.' (Some people say that the early bird gets the worm.)"""

        # Single punctuation marks, em dash
        input_text = r"""Hello, World! How are you?
        Victor — why did you do that?
        As the old saying goes, "The early bird catches the worm." (Some people say that the early bird gets the worm.)"""

        # Double punctuation, en dash -- must give the same result as above
        input_text2 = r"""Hello, World!!!! How are you????
        Victor – why did you do that?
        As the old saying goes, "The early bird catches the worm." (Some people say that the early bird gets the worm.)"""

        for label, text in (("single", input_text), ("double", input_text2)):
            with self.subTest(punctuation=label):
                self.assertEqual(self.normalizer(text), expected_output)


# Run every test in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()