"""Test Subtokenizer and string helper methods."""

import collections
import tempfile

import tensorflow as tf

from official.nlp.transformer.utils import tokenizer


class SubtokenizerTest(tf.test.TestCase):

  def _init_subtokenizer(self, vocab_list):
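    # Write one quoted subtoken per line to a temporary vocab file, then build
    # a Subtokenizer from that file with no reserved tokens.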
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    with tf.io.gfile.GFile(temp_file.name, "w") as w:
      for subtoken in vocab_list:
        w.write("'%s'" % subtoken)
        w.write("\n")
    return tokenizer.Subtokenizer(temp_file.name, reserved_tokens=[])

  def test_encode(self):
    vocab_list = ["123_", "test", "ing_"]
    subtokenizer = self._init_subtokenizer(vocab_list)
    s = "testing 123"
    encoded_list = subtokenizer.encode(s)
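    # Subtoken ids are indices into vocab_list: "test" -> 1, "ing_" -> 2,
    # "123_" -> 0.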
    self.assertEqual([1, 2, 0], encoded_list)

  def test_decode(self):
    vocab_list = ["123_", "test", "ing_"]
    subtokenizer = self._init_subtokenizer(vocab_list)
    encoded_list = [1, 2, 0]
    decoded_str = subtokenizer.decode(encoded_list)
    self.assertEqual("testing 123", decoded_str)

  def test_subtoken_ids_to_tokens(self):
    vocab_list = ["123_", "test", "ing_"]
    subtokenizer = self._init_subtokenizer(vocab_list)
    encoded_list = [1, 2, 0]
    token_list = subtokenizer._subtoken_ids_to_tokens(encoded_list)
    self.assertEqual([u"testing", u"123"], token_list)


class StringHelperTest(tf.test.TestCase):

  def test_split_string_to_tokens(self):
    text = "test? testing 123."

    tokens = tokenizer._split_string_to_tokens(text,
                                               tokenizer._ALPHANUMERIC_CHAR_SET)
    self.assertEqual(["test", "? ", "testing", "123", "."], tokens)

  def test_join_tokens_to_string(self):
    tokens = ["test", "? ", "testing", "123", "."]

    s = tokenizer._join_tokens_to_string(tokens,
                                         tokenizer._ALPHANUMERIC_CHAR_SET)
    self.assertEqual("test? testing 123.", s)

  def test_escape_token(self):
    token = u"abc_\\4"
    alphabet = set("abc_\\u;")

    escaped_token = tokenizer._escape_token(token, alphabet)
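    # "_" -> "\u", "\" -> "\\", and "4" (not in the alphabet) -> "\52;"
    # (ord("4") == 52); a trailing "_" marks the end of the escaped token.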
    self.assertEqual("abc\\u\\\\\\52;_", escaped_token)

  def test_unescape_token(self):
    escaped_token = u"Underline: \\u, Backslash: \\\\, Unicode: \\52;"

    unescaped_token = tokenizer._unescape_token(escaped_token)
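    # Unescaping inverts the escape: "\u" -> "_", "\\" -> "\",
    # and "\52;" -> chr(52) == "4".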
    self.assertEqual("Underline: _, Backslash: \\, Unicode: 4", unescaped_token)

  def test_list_to_index_dict(self):
    lst = ["test", "strings"]

    d = tokenizer._list_to_index_dict(lst)
    self.assertDictEqual({"test": 0, "strings": 1}, d)

  def test_split_token_to_subtokens(self):
    token = "abc"
    subtoken_dict = {"a": 0, "b": 1, "c": 2, "ab": 3}
    max_subtoken_length = 2

    subtokens = tokenizer._split_token_to_subtokens(token, subtoken_dict,
                                                    max_subtoken_length)
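    # Subtokens are matched greedily from the left, longest first, so "ab" is
    # taken before "a".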
    self.assertEqual(["ab", "c"], subtokens)

  def test_generate_alphabet_dict(self):
    s = ["testing", "123"]
    reserved_tokens = ["???"]

    alphabet = tokenizer._generate_alphabet_dict(s, reserved_tokens)
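    # The alphabet must contain every character from both the data and the
    # reserved tokens.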
    self.assertIn("?", alphabet)
    self.assertIn("t", alphabet)
    self.assertIn("e", alphabet)
    self.assertIn("s", alphabet)
    self.assertIn("i", alphabet)
    self.assertIn("n", alphabet)
    self.assertIn("g", alphabet)
    self.assertIn("1", alphabet)
    self.assertIn("2", alphabet)
    self.assertIn("3", alphabet)

  def test_count_and_gen_subtokens(self):
    token_counts = {"abc": 5}
    alphabet = set("abc_")
    subtoken_dict = {"a": 0, "b": 1, "c": 2, "_": 3}
    max_subtoken_length = 2

    subtoken_counts = tokenizer._count_and_gen_subtokens(
        token_counts, alphabet, subtoken_dict, max_subtoken_length)

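    # Every substring of the escaped token "abc_" that starts on a subtoken
    # boundary is counted with the token's count (5).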
    self.assertIsInstance(subtoken_counts, collections.defaultdict)
    self.assertDictEqual(
        {
            "a": 5,
            "b": 5,
            "c": 5,
            "_": 5,
            "ab": 5,
            "bc": 5,
            "c_": 5,
            "abc": 5,
            "bc_": 5,
            "abc_": 5
        }, subtoken_counts)

  def test_filter_and_bucket_subtokens(self):
    subtoken_counts = collections.defaultdict(int, {
        "a": 2,
        "b": 4,
        "c": 1,
        "ab": 6,
        "ac": 3,
        "abbc": 5
    })
    min_count = 3

    subtoken_buckets = tokenizer._filter_and_bucket_subtokens(
        subtoken_counts, min_count)

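    # Subtokens with a count below min_count are dropped; the rest are grouped
    # into buckets indexed by subtoken length.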
    self.assertEqual(len(subtoken_buckets[0]), 0)
    self.assertEqual(set("b"), subtoken_buckets[1])
    self.assertEqual(set(["ab", "ac"]), subtoken_buckets[2])
    self.assertEqual(len(subtoken_buckets[3]), 0)
    self.assertEqual(set(["abbc"]), subtoken_buckets[4])

  def test_gen_new_subtoken_list(self):
    subtoken_counts = collections.defaultdict(int, {
        "translate": 10,
        "t": 40,
        "tr": 16,
        "tra": 12
    })
    min_count = 5
    alphabet = set("translate")
    reserved_tokens = ["reserved", "tokens"]

    subtoken_list, max_token_length = tokenizer._gen_new_subtoken_list(
        subtoken_counts, min_count, alphabet, reserved_tokens)

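    # "tra" should not appear in the list: adding "translate" decrements the
    # counts of its prefixes, leaving "tra" with 2, below min_count.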
    self.assertNotIn("tra", subtoken_list)

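    # "tr" (decremented to 6) and "t" stay above min_count and are kept.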
    self.assertIn("tr", subtoken_list)
    self.assertIn("t", subtoken_list)

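    # The longest subtoken in the new list is "translate".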
    self.assertEqual(len("translate"), max_token_length)

  def test_generate_subtokens(self):
    token_counts = {"ab": 1, "bc": 3, "abc": 5}
    alphabet = set("abc_")
    min_count = 100
    num_iterations = 1
    reserved_tokens = ["reserved", "tokens"]

    vocab_list = tokenizer._generate_subtokens(token_counts, alphabet,
                                               min_count, num_iterations,
                                               reserved_tokens)

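    # Reserved tokens are placed at the front of the vocabulary list.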
    self.assertEqual(vocab_list[:2], reserved_tokens)

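    # Every character in the alphabet should appear in the vocabulary list.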
    for c in alphabet:
      self.assertIn(c, vocab_list)


if __name__ == "__main__":
  tf.test.main()