Spaces:

NLPV
/

ISCO-code-predictor-api

Sleeping

App Files Files

Pradeep Kumar commited on Aug 15, 2024

Commit

01a6c3b

verified ·

1 Parent(s): 439fdba

Delete create_pretraining_data_test.py

Browse files

Files changed (1) hide show

create_pretraining_data_test.py +0 -128

create_pretraining_data_test.py DELETED Viewed

@@ -1,128 +0,0 @@
-# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tests for official.nlp.data.create_pretraining_data."""
-import random
-import tensorflow as tf, tf_keras
-from official.nlp.data import create_pretraining_data as cpd
-_VOCAB_WORDS = ["vocab_1", "vocab_2"]
-class CreatePretrainingDataTest(tf.test.TestCase):
-  def assertTokens(self, input_tokens, output_tokens, masked_positions,
-                   masked_labels):
-    # Ensure the masked positions are unique.
-    self.assertCountEqual(masked_positions, set(masked_positions))
-    # Ensure we can reconstruct the input from the output.
-    reconstructed_tokens = output_tokens
-    for pos, label in zip(masked_positions, masked_labels):
-      reconstructed_tokens[pos] = label
-    self.assertEqual(input_tokens, reconstructed_tokens)
-    # Ensure each label is valid.
-    for pos, label in zip(masked_positions, masked_labels):
-      output_token = output_tokens[pos]
-      if (output_token == "[MASK]" or output_token in _VOCAB_WORDS or
-          output_token == input_tokens[pos]):
-        continue
-      self.fail("invalid mask value: {}".format(output_token))
-  def test_tokens_to_grams(self):
-    tests = [
-        (["That", "cone"], [(0, 1), (1, 2)]),
-        (["That", "cone", "##s"], [(0, 1), (1, 3)]),
-        (["Swit", "##zer", "##land"], [(0, 3)]),
-        (["[CLS]", "Up", "##dog"], [(1, 3)]),
-        (["[CLS]", "Up", "##dog", "[SEP]", "Down"], [(1, 3), (4, 5)]),
-    ]
-    for inp, expected in tests:
-      output = cpd._tokens_to_grams(inp)
-      self.assertEqual(expected, output)
-  def test_window(self):
-    input_list = [1, 2, 3, 4]
-    window_outputs = [
-        (1, [[1], [2], [3], [4]]),
-        (2, [[1, 2], [2, 3], [3, 4]]),
-        (3, [[1, 2, 3], [2, 3, 4]]),
-        (4, [[1, 2, 3, 4]]),
-        (5, []),
-    ]
-    for window, expected in window_outputs:
-      output = cpd._window(input_list, window)
-      self.assertEqual(expected, list(output))
-  def test_create_masked_lm_predictions(self):
-    tokens = ["[CLS]", "a", "##a", "b", "##b", "c", "##c", "[SEP]"]
-    rng = random.Random(123)
-    for _ in range(0, 5):
-      output_tokens, masked_positions, masked_labels = (
-          cpd.create_masked_lm_predictions(
-              tokens=tokens,
-              masked_lm_prob=1.0,
-              max_predictions_per_seq=3,
-              vocab_words=_VOCAB_WORDS,
-              rng=rng,
-              do_whole_word_mask=False,
-              max_ngram_size=None))
-      self.assertLen(masked_positions, 3)
-      self.assertLen(masked_labels, 3)
-      self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)
-  def test_create_masked_lm_predictions_whole_word(self):
-    tokens = ["[CLS]", "a", "##a", "b", "##b", "c", "##c", "[SEP]"]
-    rng = random.Random(345)
-    for _ in range(0, 5):
-      output_tokens, masked_positions, masked_labels = (
-          cpd.create_masked_lm_predictions(
-              tokens=tokens,
-              masked_lm_prob=1.0,
-              max_predictions_per_seq=3,
-              vocab_words=_VOCAB_WORDS,
-              rng=rng,
-              do_whole_word_mask=True,
-              max_ngram_size=None))
-      # since we can't get exactly three tokens without breaking a word we
-      # only take two.
-      self.assertLen(masked_positions, 2)
-      self.assertLen(masked_labels, 2)
-      self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)
-      # ensure that we took an entire word.
-      self.assertIn(masked_labels, [["a", "##a"], ["b", "##b"], ["c", "##c"]])
-  def test_create_masked_lm_predictions_ngram(self):
-    tokens = ["[CLS]"] + ["tok{}".format(i) for i in range(0, 512)] + ["[SEP]"]
-    rng = random.Random(345)
-    for _ in range(0, 5):
-      output_tokens, masked_positions, masked_labels = (
-          cpd.create_masked_lm_predictions(
-              tokens=tokens,
-              masked_lm_prob=1.0,
-              max_predictions_per_seq=76,
-              vocab_words=_VOCAB_WORDS,
-              rng=rng,
-              do_whole_word_mask=True,
-              max_ngram_size=3))
-      self.assertLen(masked_positions, 76)
-      self.assertLen(masked_labels, 76)
-      self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)
-if __name__ == "__main__":
-  tf.test.main()