Spaces:
Running
Running
import unittest | |
import numpy as np | |
from training.preprocess.normalize_text import NormalizeText | |
# Create a class to test the ComputePitch class | |
class TestTextPreprocess(unittest.TestCase): | |
def setUp(self): | |
np.random.seed(0) | |
self.normalizer = NormalizeText() | |
def test_byte_encode(self): | |
# Test with a simple word | |
word = "hello" | |
expected_output = [104, 101, 108, 108, 111] | |
self.assertTrue(self.normalizer.byte_encode(word) == expected_output) | |
# Test with a word containing non-ASCII characters | |
word = "héllo" | |
expected_output = [104, 195, 169, 108, 108, 111] | |
self.assertTrue(self.normalizer.byte_encode(word) == expected_output) | |
# Test with an empty string | |
word = "" | |
expected_output = [] | |
self.assertTrue(self.normalizer.byte_encode(word) == expected_output) | |
def test_normalize_chars(self): | |
# Test case 1: Test basic character normalization | |
input_text = "It’s a beautiful day…" | |
expected_output = "It's a beautiful day." | |
self.assertEqual(self.normalizer.normalize_chars(input_text), expected_output) | |
# Test case 2: Test character normalization with multiple dots | |
input_text = "Hello..... world!!!!" | |
expected_output = "Hello. world!" | |
self.assertEqual(self.normalizer.normalize_chars(input_text), expected_output) | |
# Test case 3: Test character normalization with multiple exclamation marks | |
input_text = "Wow!!!!! This is amazing?????" | |
expected_output = "Wow! This is amazing?" | |
self.assertEqual(self.normalizer.normalize_chars(input_text), expected_output) | |
# Test case 4: Test character normalization with multiple question marks | |
input_text = "What????? I don't understand!????" | |
expected_output = "What? I don't understand!?" | |
self.assertEqual(self.normalizer.normalize_chars(input_text), expected_output) | |
# Test case 5: Test character normalization with multiple quotes | |
input_text = "He said, “I don’t know…”" | |
expected_output = "He said, 'I don't know.'" | |
self.assertEqual(self.normalizer.normalize_chars(input_text), expected_output) | |
# Test case 6: Test character normalization with multiple dashes | |
input_text = "This is a long--sentence" | |
expected_output = "This is a long-sentence" | |
self.assertEqual(self.normalizer.normalize_chars(input_text), expected_output) | |
# Test case 7: Test character normalization with mixed characters | |
input_text = "It’s a beautiful day… What????? I don't understand!!!!!" | |
expected_output = "It's a beautiful day. What? I don't understand!" | |
self.assertEqual(self.normalizer.normalize_chars(input_text), expected_output) | |
def test_normalize(self): | |
# Test case 1: Test basic text normalization | |
input_text = r"""It’s a beautiful day… Hello..... World!!!! Wow!!!!! This is amazing????? He said, “I don’t know…”. It’s a beautiful day… What????? I don't understand!!!!!""" | |
expected_output = r"""It's a beautiful day. Hello. World! Wow! This is amazing? He said, 'I don't know.'. It's a beautiful day. What? I don't understand!""" | |
self.assertEqual(self.normalizer(input_text), expected_output) | |
# Test case 2: Test text normalization with multiple dots | |
input_text = "Hello..... World!!!!" | |
expected_output = "Hello. World!" | |
self.assertEqual(self.normalizer(input_text), expected_output) | |
# Test case 3: numbers | |
input_text = "1234567890" | |
expected_output = "one two three four five six seven eight nine zero" | |
self.assertEqual(self.normalizer(input_text), expected_output) | |
# Test case 4: Complicated case | |
input_text = ( | |
"Mr. Smith paid $111 in U.S.A. on Dec. 17th. We paid $123 for this desk." | |
) | |
expected_output = r"""mister Smith paid one hundred and eleven dollars in USA on december seventeenth. We paid one hundred and twenty three dollars for this desk.""" | |
self.assertEqual(self.normalizer(input_text), expected_output) | |
# Test case 5: Complicated case 2 | |
input_text = "St. Patrick’s Day, spend $123 for this desk." | |
expected_output = r"""Saint Patrick's Day, spend one hundred and twenty three dollars for this desk.""" | |
self.assertEqual(self.normalizer(input_text), expected_output) | |
# Test case 6: check Dunky bug | |
input_text = "For example it normalizes 'medic' into 'm e d i c' or 'yeah' into 'y e a h'." | |
expected_output = r"""For example it normalizes 'medic' into 'm e d i c' or 'yeah' into 'y e a h'.""" | |
self.assertEqual(self.normalizer(input_text), expected_output) | |
# Test case 7: Time, currency, line-break | |
input_text = "The alarm went off at 10:00a.m. \nI received $123. It's 12:30pm. I paid $123.45 for this desk." | |
expected_output = r"""The alarm went off at ten AM. I received one hundred and twenty three dollars. It's twelve thirty PM. I paid one hundred and twenty three dollars forty five cents for this desk.""" | |
self.assertEqual(self.normalizer(input_text), expected_output) | |
def test_normalize2(self): | |
input_text = r"""The Wizard of Oz: “Lions? And Tigers? And Bears?” | |
Toy Story: “Buzz, you’re flying!” | |
As the snake shook its head, a deafening shout behind Harry made both of them jump. ‘DUDLEY! MR DURSLEY! COME AND LOOK AT THIS SNAKE! YOU WON’T BELIEVE WHAT IT’S DOING!’.""" | |
expected_output = r"The Wizard of Oz: 'Lions? And Tigers? And Bears?'. Toy Story: 'Buzz, you're flying!'. As the snake shook its head, a deafening shout behind Harry made both of them jump. 'DUDLEY! MR DURSLEY! COME AND LOOK AT THIS SNAKE! YOU WON'T BELIEVE WHAT IT'S DOING!'." | |
self.assertEqual(self.normalizer(input_text), expected_output) | |
def test_normilize_numbers(self): | |
input_text = "1234" | |
result = self.normalizer(input_text) | |
expected_output = "twelve thirty four" | |
self.assertEqual(result, expected_output) | |
def test_punctuation(self): | |
input_text = r"""Hello, World! How are you? | |
Victor — why did you do that? | |
As the old saying goes, "The early bird catches the worm." (Some people say that the early bird gets the worm.)""" | |
result = self.normalizer(input_text) | |
expected_output = r"""Hello, World! How are you?. Victor - why did you do that?. As the old saying goes, 'The early bird catches the worm.' (Some people say that the early bird gets the worm.)""" | |
self.assertEqual(result, expected_output) | |
# Double punctuation | |
input_text2 = r"""Hello, World!!!! How are you???? | |
Victor – why did you do that? | |
As the old saying goes, "The early bird catches the worm." (Some people say that the early bird gets the worm.)""" | |
result2 = self.normalizer(input_text2) | |
self.assertEqual(result2, expected_output) | |
if __name__ == "__main__": | |
unittest.main() | |