Spaces:
Sleeping
Sleeping
File size: 4,945 Bytes
f18e71f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.nlp.data.create_pretraining_data."""
import random
import tensorflow as tf, tf_keras
from official.nlp.data import create_pretraining_data as cpd
# Vocabulary passed as `vocab_words` to `cpd.create_masked_lm_predictions` in
# the tests below; `assertTokens` also accepts these words as legitimate
# values at a masked position.
_VOCAB_WORDS = ["vocab_1", "vocab_2"]
class CreatePretrainingDataTest(tf.test.TestCase):
  """Tests for the token-masking helpers in `create_pretraining_data`."""

  def assertTokens(self, input_tokens, output_tokens, masked_positions,
                   masked_labels):
    """Asserts that a masking result is consistent with its input tokens.

    Three properties are verified: the masked positions are unique, writing
    the labels back at those positions reproduces `input_tokens`, and every
    masked position in `output_tokens` holds "[MASK]", a word from
    `_VOCAB_WORDS`, or the unchanged input token.

    Args:
      input_tokens: Token list that was handed to the masking routine.
      output_tokens: Token list returned by the masking routine. It is left
        unmodified by this helper.
      masked_positions: Indices into `output_tokens` chosen for prediction.
      masked_labels: Original tokens for `masked_positions`, in matching
        order.
    """
    # Ensure the masked positions are unique.
    self.assertCountEqual(masked_positions, set(masked_positions))

    # Ensure we can reconstruct the input from the output. This must operate
    # on a copy: restoring the labels directly into `output_tokens` would make
    # the validity check below look at the restored tokens (which trivially
    # equal the input) instead of the values the masking routine produced.
    reconstructed_tokens = list(output_tokens)
    for pos, label in zip(masked_positions, masked_labels):
      reconstructed_tokens[pos] = label
    self.assertEqual(input_tokens, reconstructed_tokens)

    # Ensure each masked position holds a valid replacement value.
    for pos in masked_positions:
      output_token = output_tokens[pos]
      if (output_token == "[MASK]" or output_token in _VOCAB_WORDS or
          output_token == input_tokens[pos]):
        continue
      self.fail("invalid mask value: {}".format(output_token))

  def test_tokens_to_grams(self):
    """Checks the (begin, end) spans `_tokens_to_grams` yields per input.

    In the expectations below, "##" continuation pieces extend the span of
    the preceding token and "[CLS]" / "[SEP]" are not covered by any span.
    """
    tests = [
        (["That", "cone"], [(0, 1), (1, 2)]),
        (["That", "cone", "##s"], [(0, 1), (1, 3)]),
        (["Swit", "##zer", "##land"], [(0, 3)]),
        (["[CLS]", "Up", "##dog"], [(1, 3)]),
        (["[CLS]", "Up", "##dog", "[SEP]", "Down"], [(1, 3), (4, 5)]),
    ]
    for inp, expected in tests:
      output = cpd._tokens_to_grams(inp)
      self.assertEqual(expected, output)

  def test_window(self):
    """Checks `_window` for every window size from 1 to one past the length.

    A window larger than the input is expected to produce no output at all.
    """
    input_list = [1, 2, 3, 4]
    window_outputs = [
        (1, [[1], [2], [3], [4]]),
        (2, [[1, 2], [2, 3], [3, 4]]),
        (3, [[1, 2, 3], [2, 3, 4]]),
        (4, [[1, 2, 3, 4]]),
        (5, []),
    ]
    for window, expected in window_outputs:
      output = cpd._window(input_list, window)
      # `_window` output is materialized with list() before comparing.
      self.assertEqual(expected, list(output))

  def test_create_masked_lm_predictions(self):
    """Token-level masking should fill all three prediction slots."""
    tokens = ["[CLS]", "a", "##a", "b", "##b", "c", "##c", "[SEP]"]
    # Fixed seed keeps the sequence of draws reproducible across runs.
    rng = random.Random(123)
    for _ in range(5):
      output_tokens, masked_positions, masked_labels = (
          cpd.create_masked_lm_predictions(
              tokens=tokens,
              masked_lm_prob=1.0,
              max_predictions_per_seq=3,
              vocab_words=_VOCAB_WORDS,
              rng=rng,
              do_whole_word_mask=False,
              max_ngram_size=None))
      self.assertLen(masked_positions, 3)
      self.assertLen(masked_labels, 3)
      self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)

  def test_create_masked_lm_predictions_whole_word(self):
    """Whole-word masking should mask exactly one complete two-piece word."""
    tokens = ["[CLS]", "a", "##a", "b", "##b", "c", "##c", "[SEP]"]
    # Fixed seed keeps the sequence of draws reproducible across runs.
    rng = random.Random(345)
    for _ in range(5):
      output_tokens, masked_positions, masked_labels = (
          cpd.create_masked_lm_predictions(
              tokens=tokens,
              masked_lm_prob=1.0,
              max_predictions_per_seq=3,
              vocab_words=_VOCAB_WORDS,
              rng=rng,
              do_whole_word_mask=True,
              max_ngram_size=None))
      # since we can't get exactly three tokens without breaking a word we
      # only take two.
      self.assertLen(masked_positions, 2)
      self.assertLen(masked_labels, 2)
      self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)
      # ensure that we took an entire word.
      self.assertIn(masked_labels, [["a", "##a"], ["b", "##b"], ["c", "##c"]])

  def test_create_masked_lm_predictions_ngram(self):
    """N-gram masking on a long sequence should use the full budget of 76."""
    tokens = ["[CLS]"] + ["tok{}".format(i) for i in range(512)] + ["[SEP]"]
    # Fixed seed keeps the sequence of draws reproducible across runs.
    rng = random.Random(345)
    for _ in range(5):
      output_tokens, masked_positions, masked_labels = (
          cpd.create_masked_lm_predictions(
              tokens=tokens,
              masked_lm_prob=1.0,
              max_predictions_per_seq=76,
              vocab_words=_VOCAB_WORDS,
              rng=rng,
              do_whole_word_mask=True,
              max_ngram_size=3))
      self.assertLen(masked_positions, 76)
      self.assertLen(masked_labels, 76)
      self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)
if __name__ == "__main__":
  # Executed as a script: hand control to the TensorFlow test runner.
  tf.test.main()
|