import unittest

import numpy as np

from training.preprocess.normalize_text import NormalizeText


class TestTextPreprocess(unittest.TestCase):
    """Unit tests for ``NormalizeText``.

    Covers ``byte_encode``, ``normalize_chars`` and calling the normalizer
    instance directly on whole texts. Independent cases inside a test method
    run under ``subTest`` so that one failing case does not hide the others.
    """

    def setUp(self):
        # Seed NumPy's global RNG and build a fresh normalizer before each test.
        np.random.seed(0)
        self.normalizer = NormalizeText()

    def test_byte_encode(self):
        # Each entry is (word, expected list of integer byte values).
        cases = [
            # Simple ASCII word
            ("hello", [104, 101, 108, 108, 111]),
            # Word containing a non-ASCII character ("é" yields 195, 169)
            ("héllo", [104, 195, 169, 108, 108, 111]),
            # Empty string
            ("", []),
        ]
        for word, expected_output in cases:
            with self.subTest(word=word):
                # assertEqual (rather than assertTrue(a == b)) shows both
                # values when the comparison fails.
                self.assertEqual(self.normalizer.byte_encode(word), expected_output)

    def test_normalize_chars(self):
        # Each entry is (input_text, expected_output).
        cases = [
            # Test case 1: basic character normalization
            ("It’s a beautiful day…", "It's a beautiful day."),
            # Test case 2: multiple dots
            ("Hello..... world!!!!", "Hello. world!"),
            # Test case 3: multiple exclamation marks
            ("Wow!!!!! This is amazing?????", "Wow! This is amazing?"),
            # Test case 4: multiple question marks
            ("What????? I don't understand!????", "What? I don't understand!?"),
            # Test case 5: typographic quotes
            ("He said, “I don’t know…”", "He said, 'I don't know.'"),
            # Test case 6: multiple dashes
            ("This is a long--sentence", "This is a long-sentence"),
            # Test case 7: mixed characters
            (
                "It’s a beautiful day… What????? I don't understand!!!!!",
                "It's a beautiful day. What? I don't understand!",
            ),
        ]
        for input_text, expected_output in cases:
            with self.subTest(input_text=input_text):
                self.assertEqual(
                    self.normalizer.normalize_chars(input_text),
                    expected_output,
                )

    def test_normalize(self):
        # Each entry is (input_text, expected_output) for the callable normalizer.
        cases = [
            # Test case 1: basic text normalization
            (
                r"""It’s a beautiful day… Hello..... World!!!! Wow!!!!! This is amazing????? He said, “I don’t know…”. It’s a beautiful day… What????? I don't understand!!!!!""",
                r"""It's a beautiful day. Hello. World! Wow! This is amazing? He said, 'I don't know.'. It's a beautiful day. What? I don't understand!""",
            ),
            # Test case 2: multiple dots
            ("Hello..... World!!!!", "Hello. World!"),
            # Test case 3: numbers
            (
                "1234567890",
                "one two three four five six seven eight nine zero",
            ),
            # Test case 4: complicated case
            # NOTE(review): the line break inside the expected literal is kept
            # exactly as found in the file; confirm it is intentional.
            (
                "Mr. Smith paid $111 in U.S.A. on Dec. 17th. We paid $123 for this desk.",
                r"""mister Smith paid one hundred and eleven dollars in USA on december seventeenth. 
We paid one hundred and twenty three dollars for this desk.""",
            ),
            # Test case 5: complicated case 2
            (
                "St. Patrick’s Day, spend $123 for this desk.",
                r"""Saint Patrick's Day, spend one hundred and twenty three dollars for this desk.""",
            ),
            # Test case 6: check Dunky bug
            (
                "For example it normalizes 'medic' into 'm e d i c' or 'yeah' into 'y e a h'.",
                r"""For example it normalizes 'medic' into 'm e d i c' or 'yeah' into 'y e a h'.""",
            ),
            # Test case 7: time, currency, line-break
            (
                "The alarm went off at 10:00a.m. \nI received $123. It's 12:30pm. I paid $123.45 for this desk.",
                r"""The alarm went off at ten AM. I received one hundred and twenty three dollars. It's twelve thirty PM. I paid one hundred and twenty three dollars forty five cents for this desk.""",
            ),
        ]
        for input_text, expected_output in cases:
            with self.subTest(input_text=input_text):
                self.assertEqual(self.normalizer(input_text), expected_output)

    def test_normalize2(self):
        # Quoted dialogue with typographic quotes and apostrophes.
        input_text = r"""The Wizard of Oz: “Lions? And Tigers? And Bears?” Toy Story: “Buzz, you’re flying!” As the snake shook its head, a deafening shout behind Harry made both of them jump. ‘DUDLEY! MR DURSLEY! COME AND LOOK AT THIS SNAKE! YOU WON’T BELIEVE WHAT IT’S DOING!’."""
        expected_output = r"The Wizard of Oz: 'Lions? And Tigers? And Bears?'. Toy Story: 'Buzz, you're flying!'. As the snake shook its head, a deafening shout behind Harry made both of them jump. 'DUDLEY! MR DURSLEY! COME AND LOOK AT THIS SNAKE! YOU WON'T BELIEVE WHAT IT'S DOING!'."

        self.assertEqual(self.normalizer(input_text), expected_output)

    def test_normilize_numbers(self):
        # Method name (including its spelling) is kept so that existing
        # name-based test selection keeps working.
        input_text = "1234"
        expected_output = "twelve thirty four"

        self.assertEqual(self.normalizer(input_text), expected_output)

    def test_punctuation(self):
        # Both inputs below must normalize to the same expected text.
        expected_output = r"""Hello, World! How are you?. Victor - why did you do that?. As the old saying goes, 'The early bird catches the worm.' (Some people say that the early bird gets the worm.)"""

        # Em dash, single punctuation marks.
        # NOTE(review): the line break inside this literal is kept exactly as
        # found in the file; confirm it is intentional.
        input_text = r"""Hello, World! How are you? 
Victor — why did you do that? As the old saying goes, "The early bird catches the worm." (Some people say that the early bird gets the worm.)"""

        # Double punctuation: en dash, repeated "!" and "?".
        input_text2 = r"""Hello, World!!!! How are you???? Victor – why did you do that? As the old saying goes, "The early bird catches the worm." (Some people say that the early bird gets the worm.)"""

        for text in (input_text, input_text2):
            with self.subTest(input_text=text):
                self.assertEqual(self.normalizer(text), expected_output)


if __name__ == "__main__":
    unittest.main()