Spaces:
Sleeping
Sleeping
File size: 7,112 Bytes
9d61c9b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import unittest
import numpy as np
from training.preprocess.normalize_text import NormalizeText
# Unit tests for the NormalizeText text normalizer
class TestTextPreprocess(unittest.TestCase):
    """Unit tests for ``NormalizeText``.

    The tests cover three entry points that are exercised below:
    ``byte_encode``, ``normalize_chars`` and calling the normalizer
    instance directly on a piece of text.
    """

    def setUp(self):
        """Seed NumPy's global RNG and create a fresh normalizer per test."""
        # NOTE(review): nothing visible here consumes random numbers; the seed
        # is kept so the fixture behaves exactly as before.
        np.random.seed(0)
        self.normalizer = NormalizeText()

    def test_byte_encode(self):
        """``byte_encode`` returns the expected list of byte values."""
        cases = [
            # Plain ASCII word: one value per character.
            ("hello", [104, 101, 108, 108, 111]),
            # Non-ASCII character: "é" is expected as its two UTF-8 bytes
            # (195, 169).
            ("héllo", [104, 195, 169, 108, 108, 111]),
            # Empty string yields an empty list.
            ("", []),
        ]
        for word, expected_output in cases:
            with self.subTest(word=word):
                # assertEqual (rather than assertTrue(a == b)) prints both
                # values when the comparison fails.
                self.assertEqual(
                    self.normalizer.byte_encode(word),
                    expected_output,
                )

    def test_normalize_chars(self):
        """``normalize_chars`` maps typographic characters and collapses
        repeated punctuation as pinned by the cases below."""
        cases = [
            (
                "curly apostrophe and ellipsis",
                "It’s a beautiful day…",
                "It's a beautiful day.",
            ),
            (
                "repeated dots and exclamation marks",
                "Hello..... world!!!!",
                "Hello. world!",
            ),
            (
                "repeated exclamation marks then question marks",
                "Wow!!!!! This is amazing?????",
                "Wow! This is amazing?",
            ),
            (
                "repeated question marks, including after '!'",
                "What????? I don't understand!????",
                "What? I don't understand!?",
            ),
            (
                "curly double quotes",
                "He said, “I don’t know…”",
                "He said, 'I don't know.'",
            ),
            (
                "double dash",
                "This is a long--sentence",
                "This is a long-sentence",
            ),
            (
                "mixed characters",
                "It’s a beautiful day… What????? I don't understand!!!!!",
                "It's a beautiful day. What? I don't understand!",
            ),
        ]
        for label, input_text, expected_output in cases:
            with self.subTest(case=label):
                self.assertEqual(
                    self.normalizer.normalize_chars(input_text),
                    expected_output,
                )

    def test_normalize(self):
        """Calling the normalizer on text produces the expected spoken form."""
        cases = [
            (
                "basic text normalization",
                r"""It’s a beautiful day… Hello..... World!!!! Wow!!!!! This is amazing????? He said, “I don’t know…”. It’s a beautiful day… What????? I don't understand!!!!!""",
                r"""It's a beautiful day. Hello. World! Wow! This is amazing? He said, 'I don't know.'. It's a beautiful day. What? I don't understand!""",
            ),
            (
                "repeated dots and exclamation marks",
                "Hello..... World!!!!",
                "Hello. World!",
            ),
            (
                "digit string",
                "1234567890",
                "one two three four five six seven eight nine zero",
            ),
            (
                "abbreviations, currency and ordinal date",
                "Mr. Smith paid $111 in U.S.A. on Dec. 17th. We paid $123 for this desk.",
                r"""mister Smith paid one hundred and eleven dollars in USA on december seventeenth. We paid one hundred and twenty three dollars for this desk.""",
            ),
            (
                "'St.' abbreviation and currency",
                "St. Patrick’s Day, spend $123 for this desk.",
                r"""Saint Patrick's Day, spend one hundred and twenty three dollars for this desk.""",
            ),
            (
                # Regression ("Dunky bug"): quoted words and already
                # spelled-out letters must come back unchanged.
                "quoted words are left untouched",
                "For example it normalizes 'medic' into 'm e d i c' or 'yeah' into 'y e a h'.",
                r"""For example it normalizes 'medic' into 'm e d i c' or 'yeah' into 'y e a h'.""",
            ),
            (
                "time, currency and line break",
                "The alarm went off at 10:00a.m. \nI received $123. It's 12:30pm. I paid $123.45 for this desk.",
                r"""The alarm went off at ten AM. I received one hundred and twenty three dollars. It's twelve thirty PM. I paid one hundred and twenty three dollars forty five cents for this desk.""",
            ),
        ]
        for label, input_text, expected_output in cases:
            with self.subTest(case=label):
                self.assertEqual(self.normalizer(input_text), expected_output)

    def test_normalize2(self):
        """Multi-line quoted text is joined into one line with '. ' between
        the original lines, and curly quotes become straight single quotes."""
        # The continuation lines of this literal start at column 0 on purpose:
        # any leading whitespace would become part of the input text.
        input_text = r"""The Wizard of Oz: “Lions? And Tigers? And Bears?”
Toy Story: “Buzz, you’re flying!”
As the snake shook its head, a deafening shout behind Harry made both of them jump. ‘DUDLEY! MR DURSLEY! COME AND LOOK AT THIS SNAKE! YOU WON’T BELIEVE WHAT IT’S DOING!’."""
        expected_output = r"The Wizard of Oz: 'Lions? And Tigers? And Bears?'. Toy Story: 'Buzz, you're flying!'. As the snake shook its head, a deafening shout behind Harry made both of them jump. 'DUDLEY! MR DURSLEY! COME AND LOOK AT THIS SNAKE! YOU WON'T BELIEVE WHAT IT'S DOING!'."
        self.assertEqual(self.normalizer(input_text), expected_output)

    # NOTE: the method name keeps its original spelling ("normilize") so that
    # selecting this test by id continues to work.
    def test_normilize_numbers(self):
        """A four-digit number is expected to be read as two pairs."""
        input_text = "1234"
        result = self.normalizer(input_text)
        expected_output = "twelve thirty four"
        self.assertEqual(result, expected_output)

    def test_punctuation(self):
        """Punctuation, dashes and double quotes are normalized; repeated
        punctuation yields the same result as the single-mark version."""
        # Continuation lines start at column 0 on purpose (see the literal's
        # content: leading whitespace would be part of the input).
        input_text = r"""Hello, World! How are you?
Victor — why did you do that?
As the old saying goes, "The early bird catches the worm." (Some people say that the early bird gets the worm.)"""
        result = self.normalizer(input_text)
        expected_output = r"""Hello, World! How are you?. Victor - why did you do that?. As the old saying goes, 'The early bird catches the worm.' (Some people say that the early bird gets the worm.)"""
        self.assertEqual(result, expected_output)

        # Repeated punctuation and an en dash (instead of the em dash above)
        # must normalize to exactly the same output.
        input_text2 = r"""Hello, World!!!! How are you????
Victor – why did you do that?
As the old saying goes, "The early bird catches the worm." (Some people say that the early bird gets the worm.)"""
        result2 = self.normalizer(input_text2)
        self.assertEqual(result2, expected_output)
if __name__ == "__main__":
    # Run every test case in this module when the file is executed directly.
    unittest.main()
|