# File: PeechTTSv22050/training/preprocess/tests/test_normalize_text.py
# Committed by nickovchinnikov: "Init" (9d61c9b)
import unittest
import numpy as np
from training.preprocess.normalize_text import NormalizeText
# Test suite for the NormalizeText class
class TestTextPreprocess(unittest.TestCase):
    """Unit tests for ``NormalizeText``.

    Covers ``byte_encode``, ``normalize_chars`` and the full normalization
    performed by calling the normalizer instance directly.
    """

    def setUp(self):
        """Build a fresh ``NormalizeText`` instance before every test."""
        # Seed NumPy's global RNG before each test.
        # NOTE(review): no assertion below uses randomness directly; presumably
        # this guards against nondeterminism inside NormalizeText - confirm.
        np.random.seed(0)
        self.normalizer = NormalizeText()

    def _assert_cases(self, func, cases):
        """Assert ``func(text) == expected`` for every ``(text, expected)`` pair.

        Each pair runs inside its own ``subTest`` so a failing case is reported
        individually and does not prevent the remaining cases from running.
        """
        for text, expected in cases:
            with self.subTest(text=text):
                self.assertEqual(func(text), expected)

    def test_byte_encode(self):
        """``byte_encode`` must return the expected list of byte values."""
        cases = [
            # Simple ASCII word
            ("hello", [104, 101, 108, 108, 111]),
            # Word containing a non-ASCII character ("é" maps to 195, 169)
            ("héllo", [104, 195, 169, 108, 108, 111]),
            # Empty string
            ("", []),
        ]
        self._assert_cases(self.normalizer.byte_encode, cases)

    def test_normalize_chars(self):
        """``normalize_chars`` maps typographic characters and collapses repeats."""
        cases = [
            # Case 1: basic character normalization
            ("It’s a beautiful day…", "It's a beautiful day."),
            # Case 2: multiple dots
            ("Hello..... world!!!!", "Hello. world!"),
            # Case 3: multiple exclamation marks
            ("Wow!!!!! This is amazing?????", "Wow! This is amazing?"),
            # Case 4: multiple question marks
            ("What????? I don't understand!????", "What? I don't understand!?"),
            # Case 5: typographic quotes
            ("He said, “I don’t know…”", "He said, 'I don't know.'"),
            # Case 6: multiple dashes
            ("This is a long--sentence", "This is a long-sentence"),
            # Case 7: mixed characters
            (
                "It’s a beautiful day… What????? I don't understand!!!!!",
                "It's a beautiful day. What? I don't understand!",
            ),
        ]
        self._assert_cases(self.normalizer.normalize_chars, cases)

    def test_normalize(self):
        """Calling the normalizer applies the full text normalization."""
        cases = [
            # Case 1: basic text normalization
            (
                r"""It’s a beautiful day… Hello..... World!!!! Wow!!!!! This is amazing????? He said, “I don’t know…”. It’s a beautiful day… What????? I don't understand!!!!!""",
                r"""It's a beautiful day. Hello. World! Wow! This is amazing? He said, 'I don't know.'. It's a beautiful day. What? I don't understand!""",
            ),
            # Case 2: multiple dots
            ("Hello..... World!!!!", "Hello. World!"),
            # Case 3: numbers
            (
                "1234567890",
                "one two three four five six seven eight nine zero",
            ),
            # Case 4: abbreviations, currency and dates
            (
                "Mr. Smith paid $111 in U.S.A. on Dec. 17th. We paid $123 for this desk.",
                r"""mister Smith paid one hundred and eleven dollars in USA on december seventeenth. We paid one hundred and twenty three dollars for this desk.""",
            ),
            # Case 5: abbreviation at the start plus currency
            (
                "St. Patrick’s Day, spend $123 for this desk.",
                r"""Saint Patrick's Day, spend one hundred and twenty three dollars for this desk.""",
            ),
            # Case 6: "Dunky bug" check - the text must come back unchanged
            (
                "For example it normalizes 'medic' into 'm e d i c' or 'yeah' into 'y e a h'.",
                r"""For example it normalizes 'medic' into 'm e d i c' or 'yeah' into 'y e a h'.""",
            ),
            # Case 7: time, currency, line-break
            (
                "The alarm went off at 10:00a.m. \nI received $123. It's 12:30pm. I paid $123.45 for this desk.",
                r"""The alarm went off at ten AM. I received one hundred and twenty three dollars. It's twelve thirty PM. I paid one hundred and twenty three dollars forty five cents for this desk.""",
            ),
        ]
        self._assert_cases(self.normalizer, cases)

    def test_normalize2(self):
        """Multi-line quoted input is normalized to the expected single line."""
        # The continuation lines sit at column 0 on purpose: they are part of
        # the string value, so indenting them would change the test input.
        input_text = r"""The Wizard of Oz: “Lions? And Tigers? And Bears?”
Toy Story: “Buzz, you’re flying!”
As the snake shook its head, a deafening shout behind Harry made both of them jump. ‘DUDLEY! MR DURSLEY! COME AND LOOK AT THIS SNAKE! YOU WON’T BELIEVE WHAT IT’S DOING!’."""
        expected_output = r"The Wizard of Oz: 'Lions? And Tigers? And Bears?'. Toy Story: 'Buzz, you're flying!'. As the snake shook its head, a deafening shout behind Harry made both of them jump. 'DUDLEY! MR DURSLEY! COME AND LOOK AT THIS SNAKE! YOU WON'T BELIEVE WHAT IT'S DOING!'."
        self.assertEqual(self.normalizer(input_text), expected_output)

    # NOTE: the method name keeps its original spelling so test IDs stay stable.
    def test_normilize_numbers(self):
        """A four-digit number is expected to be read as two two-digit groups."""
        self.assertEqual(self.normalizer("1234"), "twelve thirty four")

    def test_punctuation(self):
        """Single and repeated punctuation must normalize to the same output."""
        expected_output = r"""Hello, World! How are you?. Victor - why did you do that?. As the old saying goes, 'The early bird catches the worm.' (Some people say that the early bird gets the worm.)"""

        # Continuation lines are at column 0 because they belong to the string.
        single_punctuation = r"""Hello, World! How are you?
Victor — why did you do that?
As the old saying goes, "The early bird catches the worm." (Some people say that the early bird gets the worm.)"""

        # Double punctuation (and a different dash character on the second line)
        repeated_punctuation = r"""Hello, World!!!! How are you????
Victor – why did you do that?
As the old saying goes, "The early bird catches the worm." (Some people say that the early bird gets the worm.)"""

        self._assert_cases(
            self.normalizer,
            [
                (single_punctuation, expected_output),
                (repeated_punctuation, expected_output),
            ],
        )
# Run the tests with unittest's command-line runner when executed as a script.
if __name__ == "__main__":
    unittest.main()