Pradeep Kumar committed on
Commit
f18e71f
•
1 Parent(s): c130734

Upload 33 files

README.md CHANGED
@@ -1,13 +1,4 @@
1
- ---
2
- title: ISCO Code Predictor Api
3
- emoji: 📉
4
- colorFrom: green
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 4.41.0
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ This directory contains binaries and utilities required for input preprocessing,
2
+ tokenization, etc., that can be used with the model building blocks available in the
3
+ NLP modeling library [nlp/modeling](https://github.com/tensorflow/models/tree/master/official/nlp/modeling)
4
+ to train custom models and validate new research ideas.
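As a rough usage sketch (the vocab path, data directory, and output paths are placeholders, and the CoLA processor is just one illustrative choice), these utilities plug into a fine-tuning data pipeline like this:

from official.nlp.data import classifier_data_lib
from official.nlp.tools import tokenization

# Build a WordPiece tokenizer from a BERT vocabulary (placeholder path).
tokenizer = tokenization.FullTokenizer(
    vocab_file="/path/to/vocab.txt", do_lower_case=True)

# Pick a task processor and write train/eval/test TFRecords plus meta data.
processor = classifier_data_lib.ColaProcessor(
    process_text_fn=tokenization.convert_to_unicode)
input_meta_data = classifier_data_lib.generate_tf_record_from_data_file(
    processor,
    "/path/to/glue/CoLA",  # placeholder input data dir
    tokenizer,
    train_data_output_path="/tmp/cola_train.tf_record",
    eval_data_output_path="/tmp/cola_eval.tf_record",
    test_data_output_path="/tmp/cola_test.tf_record",
    max_seq_length=128)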
__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
classifier_data_lib.py CHANGED
The diff for this file is too large to render. See raw diff
 
classifier_data_lib_test.py ADDED
@@ -0,0 +1,95 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for third_party.tensorflow_models.official.nlp.data.classifier_data_lib."""
16
+
17
+ import os
18
+ import tempfile
19
+
20
+ from absl.testing import parameterized
21
+ import tensorflow as tf, tf_keras
22
+ import tensorflow_datasets as tfds
23
+
24
+ from official.nlp.data import classifier_data_lib
25
+ from official.nlp.tools import tokenization
26
+
27
+
28
+ def decode_record(record, name_to_features):
29
+ """Decodes a record to a TensorFlow example."""
30
+ return tf.io.parse_single_example(record, name_to_features)
31
+
32
+
33
+ class BertClassifierLibTest(tf.test.TestCase, parameterized.TestCase):
34
+
35
+ def setUp(self):
36
+ super(BertClassifierLibTest, self).setUp()
37
+ self.model_dir = self.get_temp_dir()
38
+ self.processors = {
39
+ "CB": classifier_data_lib.CBProcessor,
40
+ "SUPERGLUE-RTE": classifier_data_lib.SuperGLUERTEProcessor,
41
+ "BOOLQ": classifier_data_lib.BoolQProcessor,
42
+ "WIC": classifier_data_lib.WiCProcessor,
43
+ }
44
+
45
+ vocab_tokens = [
46
+ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
47
+ "##ing", ","
48
+ ]
49
+ with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
50
+ vocab_writer.write("".join([x + "\n" for x in vocab_tokens
51
+ ]).encode("utf-8"))
52
+ vocab_file = vocab_writer.name
53
+ self.tokenizer = tokenization.FullTokenizer(vocab_file)
54
+
55
+ @parameterized.parameters(
56
+ {"task_type": "CB"},
57
+ {"task_type": "BOOLQ"},
58
+ {"task_type": "SUPERGLUE-RTE"},
59
+ {"task_type": "WIC"},
60
+ )
61
+ def test_generate_dataset_from_tfds_processor(self, task_type):
62
+ with tfds.testing.mock_data(num_examples=5):
63
+ output_path = os.path.join(self.model_dir, task_type)
64
+
65
+ processor = self.processors[task_type]()
66
+
67
+ classifier_data_lib.generate_tf_record_from_data_file(
68
+ processor,
69
+ None,
70
+ self.tokenizer,
71
+ train_data_output_path=output_path,
72
+ eval_data_output_path=output_path,
73
+ test_data_output_path=output_path)
74
+ files = tf.io.gfile.glob(output_path)
75
+ self.assertNotEmpty(files)
76
+
77
+ train_dataset = tf.data.TFRecordDataset(output_path)
78
+ seq_length = 128
79
+ label_type = tf.int64
80
+ name_to_features = {
81
+ "input_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
82
+ "input_mask": tf.io.FixedLenFeature([seq_length], tf.int64),
83
+ "segment_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
84
+ "label_ids": tf.io.FixedLenFeature([], label_type),
85
+ }
86
+ train_dataset = train_dataset.map(
87
+ lambda record: decode_record(record, name_to_features))
88
+
89
+ # If data is retrieved without error, then all requirements
90
+ # including data type/shapes are met.
91
+ _ = next(iter(train_dataset))
92
+
93
+
94
+ if __name__ == "__main__":
95
+ tf.test.main()
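The parsing spec exercised by this test can also serve as the basis of a training input pipeline; a minimal sketch (the file pattern, sequence length, and batch size are arbitrary placeholders):

import tensorflow as tf

def make_classifier_dataset(file_pattern, seq_length=128, batch_size=32):
  """Builds a batched dataset from the generated classifier TFRecords."""
  name_to_features = {
      "input_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
      "input_mask": tf.io.FixedLenFeature([seq_length], tf.int64),
      "segment_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
      "label_ids": tf.io.FixedLenFeature([], tf.int64),
  }
  dataset = tf.data.TFRecordDataset(tf.io.gfile.glob(file_pattern))
  dataset = dataset.map(
      lambda record: tf.io.parse_single_example(record, name_to_features),
      num_parallel_calls=tf.data.AUTOTUNE)
  return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)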
create_finetuning_data.py ADDED
@@ -0,0 +1,441 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """BERT finetuning task dataset generator."""
16
+
17
+ import functools
18
+ import json
19
+ import os
20
+
21
+ # Import libraries
22
+ from absl import app
23
+ from absl import flags
24
+ import tensorflow as tf, tf_keras
25
+ from official.nlp.data import classifier_data_lib
26
+ from official.nlp.data import sentence_retrieval_lib
27
+ # word-piece tokenizer based squad_lib
28
+ from official.nlp.data import squad_lib as squad_lib_wp
29
+ # sentence-piece tokenizer based squad_lib
30
+ from official.nlp.data import squad_lib_sp
31
+ from official.nlp.data import tagging_data_lib
32
+ from official.nlp.tools import tokenization
33
+
34
+ FLAGS = flags.FLAGS
35
+
36
+ flags.DEFINE_enum(
37
+ "fine_tuning_task_type", "classification",
38
+ ["classification", "regression", "squad", "retrieval", "tagging"],
39
+ "The name of the BERT fine tuning task for which data "
40
+ "will be generated.")
41
+
42
+ # BERT classification specific flags.
43
+ flags.DEFINE_string(
44
+ "input_data_dir", None,
45
+ "The input data dir. Should contain the .tsv files (or other data files) "
46
+ "for the task.")
47
+
48
+ flags.DEFINE_enum(
49
+ "classification_task_name", "MNLI", [
50
+ "AX", "COLA", "IMDB", "MNLI", "MRPC", "PAWS-X", "QNLI", "QQP", "RTE",
51
+ "SST-2", "STS-B", "WNLI", "XNLI", "XTREME-XNLI", "XTREME-PAWS-X",
52
+ "AX-g", "SUPERGLUE-RTE", "CB", "BoolQ", "WIC"
53
+ ], "The name of the task to train BERT classifier. The "
54
+ "difference between XTREME-XNLI and XNLI is: 1. the format "
55
+ "of input tsv files; 2. the dev set for XTREME is english "
56
+ "only and for XNLI is all languages combined. Same for "
57
+ "PAWS-X.")
58
+
59
+ # MNLI task-specific flag.
60
+ flags.DEFINE_enum("mnli_type", "matched", ["matched", "mismatched"],
61
+ "The type of MNLI dataset.")
62
+
63
+ # XNLI task-specific flag.
64
+ flags.DEFINE_string(
65
+ "xnli_language", "en",
66
+ "Language of training data for XNLI task. If the value is 'all', the data "
67
+ "of all languages will be used for training.")
68
+
69
+ # PAWS-X task-specific flag.
70
+ flags.DEFINE_string(
71
+ "pawsx_language", "en",
72
+ "Language of training data for PAWS-X task. If the value is 'all', the data "
73
+ "of all languages will be used for training.")
74
+
75
+ # XTREME classification specific flags. Only used in XtremePawsx and XtremeXnli.
76
+ flags.DEFINE_string(
77
+ "translated_input_data_dir", None,
78
+ "The translated input data dir. Should contain the .tsv files (or other "
79
+ "data files) for the task.")
80
+
81
+ # Retrieval task-specific flags.
82
+ flags.DEFINE_enum("retrieval_task_name", "bucc", ["bucc", "tatoeba"],
83
+ "The name of sentence retrieval task for scoring")
84
+
85
+ # Tagging task-specific flags.
86
+ flags.DEFINE_enum("tagging_task_name", "panx", ["panx", "udpos"],
87
+ "The name of BERT tagging (token classification) task.")
88
+
89
+ flags.DEFINE_bool("tagging_only_use_en_train", True,
90
+ "Whether only use english training data in tagging.")
91
+
92
+ # BERT Squad task-specific flags.
93
+ flags.DEFINE_string(
94
+ "squad_data_file", None,
95
+ "The input data file in for generating training data for BERT squad task.")
96
+
97
+ flags.DEFINE_string(
98
+ "translated_squad_data_folder", None,
99
+ "The translated data folder for generating training data for BERT squad "
100
+ "task.")
101
+
102
+ flags.DEFINE_integer(
103
+ "doc_stride", 128,
104
+ "When splitting up a long document into chunks, how much stride to "
105
+ "take between chunks.")
106
+
107
+ flags.DEFINE_integer(
108
+ "max_query_length", 64,
109
+ "The maximum number of tokens for the question. Questions longer than "
110
+ "this will be truncated to this length.")
111
+
112
+ flags.DEFINE_bool(
113
+ "version_2_with_negative", False,
114
+ "If true, the SQuAD examples contain some that do not have an answer.")
115
+
116
+ flags.DEFINE_bool(
117
+ "xlnet_format", False,
118
+ "If true, then data will be preprocessed in a paragraph, query, class order"
119
+ " instead of the BERT-style class, paragraph, query order.")
120
+
121
+ # XTREME specific flags.
122
+ flags.DEFINE_bool("only_use_en_dev", True, "Whether only use english dev data.")
123
+
124
+ # Shared flags across BERT fine-tuning tasks.
125
+ flags.DEFINE_string("vocab_file", None,
126
+ "The vocabulary file that the BERT model was trained on.")
127
+
128
+ flags.DEFINE_string(
129
+ "train_data_output_path", None,
130
+ "The path in which generated training input data will be written as tf"
131
+ " records.")
132
+
133
+ flags.DEFINE_string(
134
+ "eval_data_output_path", None,
135
+ "The path in which generated evaluation input data will be written as tf"
136
+ " records.")
137
+
138
+ flags.DEFINE_string(
139
+ "test_data_output_path", None,
140
+ "The path in which generated test input data will be written as tf"
141
+ " records. If None, do not generate test data. Must be a pattern template"
142
+ " as test_{}.tfrecords if processor has language specific test data.")
143
+
144
+ flags.DEFINE_string("meta_data_file_path", None,
145
+ "The path in which input meta data will be written.")
146
+
147
+ flags.DEFINE_bool(
148
+ "do_lower_case", True,
149
+ "Whether to lower case the input text. Should be True for uncased "
150
+ "models and False for cased models.")
151
+
152
+ flags.DEFINE_integer(
153
+ "max_seq_length", 128,
154
+ "The maximum total input sequence length after WordPiece tokenization. "
155
+ "Sequences longer than this will be truncated, and sequences shorter "
156
+ "than this will be padded.")
157
+
158
+ flags.DEFINE_string("sp_model_file", "",
159
+ "The path to the model used by sentence piece tokenizer.")
160
+
161
+ flags.DEFINE_enum(
162
+ "tokenization", "WordPiece", ["WordPiece", "SentencePiece"],
163
+ "Specifies the tokenizer implementation, i.e., whether to use WordPiece "
164
+ "or SentencePiece tokenizer. Canonical BERT uses WordPiece tokenizer, "
165
+ "while ALBERT uses SentencePiece tokenizer.")
166
+
167
+ flags.DEFINE_string(
168
+ "tfds_params", "", "Comma-separated list of TFDS parameter assignments for "
169
+ "generic classfication data import (for more details "
170
+ "see the TfdsProcessor class documentation).")
171
+
172
+
173
+ def generate_classifier_dataset():
174
+ """Generates classifier dataset and returns input meta data."""
175
+ if FLAGS.classification_task_name in [
176
+ "COLA",
177
+ "WNLI",
178
+ "SST-2",
179
+ "MRPC",
180
+ "QQP",
181
+ "STS-B",
182
+ "MNLI",
183
+ "QNLI",
184
+ "RTE",
185
+ "AX",
186
+ "SUPERGLUE-RTE",
187
+ "CB",
188
+ "BoolQ",
189
+ "WIC",
190
+ ]:
191
+ assert not FLAGS.input_data_dir or FLAGS.tfds_params
192
+ else:
193
+ assert (FLAGS.input_data_dir and FLAGS.classification_task_name or
194
+ FLAGS.tfds_params)
195
+
196
+ if FLAGS.tokenization == "WordPiece":
197
+ tokenizer = tokenization.FullTokenizer(
198
+ vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
199
+ processor_text_fn = tokenization.convert_to_unicode
200
+ else:
201
+ assert FLAGS.tokenization == "SentencePiece"
202
+ tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
203
+ processor_text_fn = functools.partial(
204
+ tokenization.preprocess_text, lower=FLAGS.do_lower_case)
205
+
206
+ if FLAGS.tfds_params:
207
+ processor = classifier_data_lib.TfdsProcessor(
208
+ tfds_params=FLAGS.tfds_params, process_text_fn=processor_text_fn)
209
+ return classifier_data_lib.generate_tf_record_from_data_file(
210
+ processor,
211
+ None,
212
+ tokenizer,
213
+ train_data_output_path=FLAGS.train_data_output_path,
214
+ eval_data_output_path=FLAGS.eval_data_output_path,
215
+ test_data_output_path=FLAGS.test_data_output_path,
216
+ max_seq_length=FLAGS.max_seq_length)
217
+ else:
218
+ processors = {
219
+ "ax":
220
+ classifier_data_lib.AxProcessor,
221
+ "cola":
222
+ classifier_data_lib.ColaProcessor,
223
+ "imdb":
224
+ classifier_data_lib.ImdbProcessor,
225
+ "mnli":
226
+ functools.partial(
227
+ classifier_data_lib.MnliProcessor, mnli_type=FLAGS.mnli_type),
228
+ "mrpc":
229
+ classifier_data_lib.MrpcProcessor,
230
+ "qnli":
231
+ classifier_data_lib.QnliProcessor,
232
+ "qqp":
233
+ classifier_data_lib.QqpProcessor,
234
+ "rte":
235
+ classifier_data_lib.RteProcessor,
236
+ "sst-2":
237
+ classifier_data_lib.SstProcessor,
238
+ "sts-b":
239
+ classifier_data_lib.StsBProcessor,
240
+ "xnli":
241
+ functools.partial(
242
+ classifier_data_lib.XnliProcessor,
243
+ language=FLAGS.xnli_language),
244
+ "paws-x":
245
+ functools.partial(
246
+ classifier_data_lib.PawsxProcessor,
247
+ language=FLAGS.pawsx_language),
248
+ "wnli":
249
+ classifier_data_lib.WnliProcessor,
250
+ "xtreme-xnli":
251
+ functools.partial(
252
+ classifier_data_lib.XtremeXnliProcessor,
253
+ translated_data_dir=FLAGS.translated_input_data_dir,
254
+ only_use_en_dev=FLAGS.only_use_en_dev),
255
+ "xtreme-paws-x":
256
+ functools.partial(
257
+ classifier_data_lib.XtremePawsxProcessor,
258
+ translated_data_dir=FLAGS.translated_input_data_dir,
259
+ only_use_en_dev=FLAGS.only_use_en_dev),
260
+ "ax-g":
261
+ classifier_data_lib.AXgProcessor,
262
+ "superglue-rte":
263
+ classifier_data_lib.SuperGLUERTEProcessor,
264
+ "cb":
265
+ classifier_data_lib.CBProcessor,
266
+ "boolq":
267
+ classifier_data_lib.BoolQProcessor,
268
+ "wic":
269
+ classifier_data_lib.WiCProcessor,
270
+ }
271
+ task_name = FLAGS.classification_task_name.lower()
272
+ if task_name not in processors:
273
+ raise ValueError("Task not found: %s" % (task_name,))
274
+
275
+ processor = processors[task_name](process_text_fn=processor_text_fn)
276
+ return classifier_data_lib.generate_tf_record_from_data_file(
277
+ processor,
278
+ FLAGS.input_data_dir,
279
+ tokenizer,
280
+ train_data_output_path=FLAGS.train_data_output_path,
281
+ eval_data_output_path=FLAGS.eval_data_output_path,
282
+ test_data_output_path=FLAGS.test_data_output_path,
283
+ max_seq_length=FLAGS.max_seq_length)
284
+
285
+
286
+ def generate_regression_dataset():
287
+ """Generates regression dataset and returns input meta data."""
288
+ if FLAGS.tokenization == "WordPiece":
289
+ tokenizer = tokenization.FullTokenizer(
290
+ vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
291
+ processor_text_fn = tokenization.convert_to_unicode
292
+ else:
293
+ assert FLAGS.tokenization == "SentencePiece"
294
+ tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
295
+ processor_text_fn = functools.partial(
296
+ tokenization.preprocess_text, lower=FLAGS.do_lower_case)
297
+
298
+ if FLAGS.tfds_params:
299
+ processor = classifier_data_lib.TfdsProcessor(
300
+ tfds_params=FLAGS.tfds_params, process_text_fn=processor_text_fn)
301
+ return classifier_data_lib.generate_tf_record_from_data_file(
302
+ processor,
303
+ None,
304
+ tokenizer,
305
+ train_data_output_path=FLAGS.train_data_output_path,
306
+ eval_data_output_path=FLAGS.eval_data_output_path,
307
+ test_data_output_path=FLAGS.test_data_output_path,
308
+ max_seq_length=FLAGS.max_seq_length)
309
+ else:
310
+ raise ValueError("No data processor found for the given regression task.")
311
+
312
+
313
+ def generate_squad_dataset():
314
+ """Generates squad training dataset and returns input meta data."""
315
+ assert FLAGS.squad_data_file
316
+ if FLAGS.tokenization == "WordPiece":
317
+ return squad_lib_wp.generate_tf_record_from_json_file(
318
+ input_file_path=FLAGS.squad_data_file,
319
+ vocab_file_path=FLAGS.vocab_file,
320
+ output_path=FLAGS.train_data_output_path,
321
+ translated_input_folder=FLAGS.translated_squad_data_folder,
322
+ max_seq_length=FLAGS.max_seq_length,
323
+ do_lower_case=FLAGS.do_lower_case,
324
+ max_query_length=FLAGS.max_query_length,
325
+ doc_stride=FLAGS.doc_stride,
326
+ version_2_with_negative=FLAGS.version_2_with_negative,
327
+ xlnet_format=FLAGS.xlnet_format)
328
+ else:
329
+ assert FLAGS.tokenization == "SentencePiece"
330
+ return squad_lib_sp.generate_tf_record_from_json_file(
331
+ input_file_path=FLAGS.squad_data_file,
332
+ sp_model_file=FLAGS.sp_model_file,
333
+ output_path=FLAGS.train_data_output_path,
334
+ translated_input_folder=FLAGS.translated_squad_data_folder,
335
+ max_seq_length=FLAGS.max_seq_length,
336
+ do_lower_case=FLAGS.do_lower_case,
337
+ max_query_length=FLAGS.max_query_length,
338
+ doc_stride=FLAGS.doc_stride,
339
+ xlnet_format=FLAGS.xlnet_format,
340
+ version_2_with_negative=FLAGS.version_2_with_negative)
341
+
342
+
343
+ def generate_retrieval_dataset():
344
+ """Generate retrieval test and dev dataset and returns input meta data."""
345
+ assert (FLAGS.input_data_dir and FLAGS.retrieval_task_name)
346
+ if FLAGS.tokenization == "WordPiece":
347
+ tokenizer = tokenization.FullTokenizer(
348
+ vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
349
+ processor_text_fn = tokenization.convert_to_unicode
350
+ else:
351
+ assert FLAGS.tokenization == "SentencePiece"
352
+ tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
353
+ processor_text_fn = functools.partial(
354
+ tokenization.preprocess_text, lower=FLAGS.do_lower_case)
355
+
356
+ processors = {
357
+ "bucc": sentence_retrieval_lib.BuccProcessor,
358
+ "tatoeba": sentence_retrieval_lib.TatoebaProcessor,
359
+ }
360
+
361
+ task_name = FLAGS.retrieval_task_name.lower()
362
+ if task_name not in processors:
363
+ raise ValueError("Task not found: %s" % task_name)
364
+
365
+ processor = processors[task_name](process_text_fn=processor_text_fn)
366
+
367
+ return sentence_retrieval_lib.generate_sentence_retrevial_tf_record(
368
+ processor, FLAGS.input_data_dir, tokenizer, FLAGS.eval_data_output_path,
369
+ FLAGS.test_data_output_path, FLAGS.max_seq_length)
370
+
371
+
372
+ def generate_tagging_dataset():
373
+ """Generates tagging dataset."""
374
+ processors = {
375
+ "panx":
376
+ functools.partial(
377
+ tagging_data_lib.PanxProcessor,
378
+ only_use_en_train=FLAGS.tagging_only_use_en_train,
379
+ only_use_en_dev=FLAGS.only_use_en_dev),
380
+ "udpos":
381
+ functools.partial(
382
+ tagging_data_lib.UdposProcessor,
383
+ only_use_en_train=FLAGS.tagging_only_use_en_train,
384
+ only_use_en_dev=FLAGS.only_use_en_dev),
385
+ }
386
+ task_name = FLAGS.tagging_task_name.lower()
387
+ if task_name not in processors:
388
+ raise ValueError("Task not found: %s" % task_name)
389
+
390
+ if FLAGS.tokenization == "WordPiece":
391
+ tokenizer = tokenization.FullTokenizer(
392
+ vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
393
+ processor_text_fn = tokenization.convert_to_unicode
394
+ elif FLAGS.tokenization == "SentencePiece":
395
+ tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
396
+ processor_text_fn = functools.partial(
397
+ tokenization.preprocess_text, lower=FLAGS.do_lower_case)
398
+ else:
399
+ raise ValueError("Unsupported tokenization: %s" % FLAGS.tokenization)
400
+
401
+ processor = processors[task_name]()
402
+ return tagging_data_lib.generate_tf_record_from_data_file(
403
+ processor, FLAGS.input_data_dir, tokenizer, FLAGS.max_seq_length,
404
+ FLAGS.train_data_output_path, FLAGS.eval_data_output_path,
405
+ FLAGS.test_data_output_path, processor_text_fn)
406
+
407
+
408
+ def main(_):
409
+ if FLAGS.tokenization == "WordPiece":
410
+ if not FLAGS.vocab_file:
411
+ raise ValueError(
412
+ "FLAG vocab_file for word-piece tokenizer is not specified.")
413
+ else:
414
+ assert FLAGS.tokenization == "SentencePiece"
415
+ if not FLAGS.sp_model_file:
416
+ raise ValueError(
417
+ "FLAG sp_model_file for sentence-piece tokenizer is not specified.")
418
+
419
+ if FLAGS.fine_tuning_task_type != "retrieval":
420
+ flags.mark_flag_as_required("train_data_output_path")
421
+
422
+ if FLAGS.fine_tuning_task_type == "classification":
423
+ input_meta_data = generate_classifier_dataset()
424
+ elif FLAGS.fine_tuning_task_type == "regression":
425
+ input_meta_data = generate_regression_dataset()
426
+ elif FLAGS.fine_tuning_task_type == "retrieval":
427
+ input_meta_data = generate_retrieval_dataset()
428
+ elif FLAGS.fine_tuning_task_type == "squad":
429
+ input_meta_data = generate_squad_dataset()
430
+ else:
431
+ assert FLAGS.fine_tuning_task_type == "tagging"
432
+ input_meta_data = generate_tagging_dataset()
433
+
434
+ tf.io.gfile.makedirs(os.path.dirname(FLAGS.meta_data_file_path))
435
+ with tf.io.gfile.GFile(FLAGS.meta_data_file_path, "w") as writer:
436
+ writer.write(json.dumps(input_meta_data, indent=4) + "\n")
437
+
438
+
439
+ if __name__ == "__main__":
440
+ flags.mark_flag_as_required("meta_data_file_path")
441
+ app.run(main)
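For reference, a hedged example of invoking this script to generate MNLI classification data (all paths are placeholders; the flag names are the ones defined above):

# python create_finetuning_data.py \
#     --fine_tuning_task_type=classification \
#     --classification_task_name=MNLI \
#     --input_data_dir=/path/to/glue/MNLI \
#     --vocab_file=/path/to/vocab.txt \
#     --train_data_output_path=/tmp/mnli_train.tf_record \
#     --eval_data_output_path=/tmp/mnli_eval.tf_record \
#     --meta_data_file_path=/tmp/mnli_meta_data \
#     --max_seq_length=128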
create_pretraining_data.py ADDED
@@ -0,0 +1,718 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Create masked LM/next sentence masked_lm TF examples for BERT."""
16
+
17
+ import collections
18
+ import itertools
19
+ import random
20
+
21
+ # Import libraries
22
+
23
+ from absl import app
24
+ from absl import flags
25
+ from absl import logging
26
+ import tensorflow as tf, tf_keras
27
+
28
+ from official.nlp.tools import tokenization
29
+
30
+ FLAGS = flags.FLAGS
31
+
32
+ flags.DEFINE_string("input_file", None,
33
+ "Input raw text file (or comma-separated list of files).")
34
+
35
+ flags.DEFINE_string(
36
+ "output_file", None,
37
+ "Output TF example file (or comma-separated list of files).")
38
+
39
+ flags.DEFINE_enum(
40
+ "tokenization",
41
+ "WordPiece",
42
+ ["WordPiece", "SentencePiece"],
43
+ "Specifies the tokenizer implementation, i.e., whether to use WordPiece "
44
+ "or SentencePiece tokenizer. Canonical BERT uses WordPiece tokenizer, "
45
+ "while ALBERT uses SentencePiece tokenizer.",
46
+ )
47
+
48
+ flags.DEFINE_string(
49
+ "vocab_file",
50
+ None,
51
+ "For WordPiece tokenization, the vocabulary file of the tokenizer.",
52
+ )
53
+
54
+ flags.DEFINE_string(
55
+ "sp_model_file",
56
+ "",
57
+ "For SentencePiece tokenization, the path to the model of the tokenizer.",
58
+ )
59
+
60
+ flags.DEFINE_bool(
61
+ "do_lower_case", True,
62
+ "Whether to lower case the input text. Should be True for uncased "
63
+ "models and False for cased models.")
64
+
65
+ flags.DEFINE_bool(
66
+ "do_whole_word_mask",
67
+ False,
68
+ "Whether to use whole word masking rather than per-token masking.",
69
+ )
70
+
71
+ flags.DEFINE_integer(
72
+ "max_ngram_size", None,
73
+ "Mask contiguous whole words (n-grams) of up to `max_ngram_size` using a "
74
+ "weighting scheme to favor shorter n-grams. "
75
+ "Note: `--do_whole_word_mask=True` must also be set when n-gram masking.")
76
+
77
+ flags.DEFINE_bool(
78
+ "gzip_compress", False,
79
+ "Whether to use `GZIP` compress option to get compressed TFRecord files.")
80
+
81
+ flags.DEFINE_bool(
82
+ "use_v2_feature_names", False,
83
+ "Whether to use the feature names consistent with the models.")
84
+
85
+ flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.")
86
+
87
+ flags.DEFINE_integer("max_predictions_per_seq", 20,
88
+ "Maximum number of masked LM predictions per sequence.")
89
+
90
+ flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.")
91
+
92
+ flags.DEFINE_integer(
93
+ "dupe_factor", 10,
94
+ "Number of times to duplicate the input data (with different masks).")
95
+
96
+ flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.")
97
+
98
+ flags.DEFINE_float(
99
+ "short_seq_prob", 0.1,
100
+ "Probability of creating sequences which are shorter than the "
101
+ "maximum length.")
102
+
103
+
104
+ class TrainingInstance(object):
105
+ """A single training instance (sentence pair)."""
106
+
107
+ def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels,
108
+ is_random_next):
109
+ self.tokens = tokens
110
+ self.segment_ids = segment_ids
111
+ self.is_random_next = is_random_next
112
+ self.masked_lm_positions = masked_lm_positions
113
+ self.masked_lm_labels = masked_lm_labels
114
+
115
+ def __str__(self):
116
+ s = ""
117
+ s += "tokens: %s\n" % (" ".join(
118
+ [tokenization.printable_text(x) for x in self.tokens]))
119
+ s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
120
+ s += "is_random_next: %s\n" % self.is_random_next
121
+ s += "masked_lm_positions: %s\n" % (" ".join(
122
+ [str(x) for x in self.masked_lm_positions]))
123
+ s += "masked_lm_labels: %s\n" % (" ".join(
124
+ [tokenization.printable_text(x) for x in self.masked_lm_labels]))
125
+ s += "\n"
126
+ return s
127
+
128
+ def __repr__(self):
129
+ return self.__str__()
130
+
131
+
132
+ def write_instance_to_example_files(instances, tokenizer, max_seq_length,
133
+ max_predictions_per_seq, output_files,
134
+ gzip_compress, use_v2_feature_names):
135
+ """Creates TF example files from `TrainingInstance`s."""
136
+ writers = []
137
+ for output_file in output_files:
138
+ writers.append(
139
+ tf.io.TFRecordWriter(
140
+ output_file, options="GZIP" if gzip_compress else ""))
141
+
142
+ writer_index = 0
143
+
144
+ total_written = 0
145
+ for (inst_index, instance) in enumerate(instances):
146
+ input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
147
+ input_mask = [1] * len(input_ids)
148
+ segment_ids = list(instance.segment_ids)
149
+ assert len(input_ids) <= max_seq_length
150
+
151
+ while len(input_ids) < max_seq_length:
152
+ input_ids.append(0)
153
+ input_mask.append(0)
154
+ segment_ids.append(0)
155
+
156
+ assert len(input_ids) == max_seq_length
157
+ assert len(input_mask) == max_seq_length
158
+ assert len(segment_ids) == max_seq_length
159
+
160
+ masked_lm_positions = list(instance.masked_lm_positions)
161
+ masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
162
+ masked_lm_weights = [1.0] * len(masked_lm_ids)
163
+
164
+ while len(masked_lm_positions) < max_predictions_per_seq:
165
+ masked_lm_positions.append(0)
166
+ masked_lm_ids.append(0)
167
+ masked_lm_weights.append(0.0)
168
+
169
+ next_sentence_label = 1 if instance.is_random_next else 0
170
+
171
+ features = collections.OrderedDict()
172
+ if use_v2_feature_names:
173
+ features["input_word_ids"] = create_int_feature(input_ids)
174
+ features["input_type_ids"] = create_int_feature(segment_ids)
175
+ else:
176
+ features["input_ids"] = create_int_feature(input_ids)
177
+ features["segment_ids"] = create_int_feature(segment_ids)
178
+
179
+ features["input_mask"] = create_int_feature(input_mask)
180
+ features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
181
+ features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
182
+ features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
183
+ features["next_sentence_labels"] = create_int_feature([next_sentence_label])
184
+
185
+ tf_example = tf.train.Example(features=tf.train.Features(feature=features))
186
+
187
+ writers[writer_index].write(tf_example.SerializeToString())
188
+ writer_index = (writer_index + 1) % len(writers)
189
+
190
+ total_written += 1
191
+
192
+ if inst_index < 20:
193
+ logging.info("*** Example ***")
194
+ logging.info("tokens: %s", " ".join(
195
+ [tokenization.printable_text(x) for x in instance.tokens]))
196
+
197
+ for feature_name in features.keys():
198
+ feature = features[feature_name]
199
+ values = []
200
+ if feature.int64_list.value:
201
+ values = feature.int64_list.value
202
+ elif feature.float_list.value:
203
+ values = feature.float_list.value
204
+ logging.info("%s: %s", feature_name, " ".join([str(x) for x in values]))
205
+
206
+ for writer in writers:
207
+ writer.close()
208
+
209
+ logging.info("Wrote %d total instances", total_written)
210
+
211
+
212
+ def create_int_feature(values):
213
+ feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
214
+ return feature
215
+
216
+
217
+ def create_float_feature(values):
218
+ feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
219
+ return feature
220
+
221
+
222
+ def create_training_instances(
223
+ input_files,
224
+ tokenizer,
225
+ processor_text_fn,
226
+ max_seq_length,
227
+ dupe_factor,
228
+ short_seq_prob,
229
+ masked_lm_prob,
230
+ max_predictions_per_seq,
231
+ rng,
232
+ do_whole_word_mask=False,
233
+ max_ngram_size=None,
234
+ ):
235
+ """Create `TrainingInstance`s from raw text."""
236
+ all_documents = [[]]
237
+
238
+ # Input file format:
239
+ # (1) One sentence per line. These should ideally be actual sentences, not
240
+ # entire paragraphs or arbitrary spans of text. (Because we use the
241
+ # sentence boundaries for the "next sentence prediction" task).
242
+ # (2) Blank lines between documents. Document boundaries are needed so
243
+ # that the "next sentence prediction" task doesn't span between documents.
244
+ for input_file in input_files:
245
+ with tf.io.gfile.GFile(input_file, "rb") as reader:
246
+ for line in reader:
247
+ line = processor_text_fn(line)
248
+
249
+ # Empty lines are used as document delimiters
250
+ if not line:
251
+ all_documents.append([])
252
+ tokens = tokenizer.tokenize(line)
253
+ if tokens:
254
+ all_documents[-1].append(tokens)
255
+
256
+ # Remove empty documents
257
+ all_documents = [x for x in all_documents if x]
258
+ rng.shuffle(all_documents)
259
+
260
+ vocab_words = list(tokenizer.vocab.keys())
261
+ instances = []
262
+ for _ in range(dupe_factor):
263
+ for document_index in range(len(all_documents)):
264
+ instances.extend(
265
+ create_instances_from_document(
266
+ all_documents, document_index, max_seq_length, short_seq_prob,
267
+ masked_lm_prob, max_predictions_per_seq, vocab_words, rng,
268
+ do_whole_word_mask, max_ngram_size))
269
+
270
+ rng.shuffle(instances)
271
+ return instances
272
+
273
+
274
+ def create_instances_from_document(
275
+ all_documents, document_index, max_seq_length, short_seq_prob,
276
+ masked_lm_prob, max_predictions_per_seq, vocab_words, rng,
277
+ do_whole_word_mask=False,
278
+ max_ngram_size=None):
279
+ """Creates `TrainingInstance`s for a single document."""
280
+ document = all_documents[document_index]
281
+
282
+ # Account for [CLS], [SEP], [SEP]
283
+ max_num_tokens = max_seq_length - 3
284
+
285
+ # We *usually* want to fill up the entire sequence since we are padding
286
+ # to `max_seq_length` anyways, so short sequences are generally wasted
287
+ # computation. However, we *sometimes*
288
+ # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
289
+ # sequences to minimize the mismatch between pre-training and fine-tuning.
290
+ # The `target_seq_length` is just a rough target however, whereas
291
+ # `max_seq_length` is a hard limit.
292
+ target_seq_length = max_num_tokens
293
+ if rng.random() < short_seq_prob:
294
+ target_seq_length = rng.randint(2, max_num_tokens)
295
+
296
+ # We DON'T just concatenate all of the tokens from a document into a long
297
+ # sequence and choose an arbitrary split point because this would make the
298
+ # next sentence prediction task too easy. Instead, we split the input into
299
+ # segments "A" and "B" based on the actual "sentences" provided by the user
300
+ # input.
301
+ instances = []
302
+ current_chunk = []
303
+ current_length = 0
304
+ i = 0
305
+ while i < len(document):
306
+ segment = document[i]
307
+ current_chunk.append(segment)
308
+ current_length += len(segment)
309
+ if i == len(document) - 1 or current_length >= target_seq_length:
310
+ if current_chunk:
311
+ # `a_end` is how many segments from `current_chunk` go into the `A`
312
+ # (first) sentence.
313
+ a_end = 1
314
+ if len(current_chunk) >= 2:
315
+ a_end = rng.randint(1, len(current_chunk) - 1)
316
+
317
+ tokens_a = []
318
+ for j in range(a_end):
319
+ tokens_a.extend(current_chunk[j])
320
+
321
+ tokens_b = []
322
+ # Random next
323
+ is_random_next = False
324
+ if len(current_chunk) == 1 or rng.random() < 0.5:
325
+ is_random_next = True
326
+ target_b_length = target_seq_length - len(tokens_a)
327
+
328
+ # This should rarely go for more than one iteration for large
329
+ # corpora. However, just to be careful, we try to make sure that
330
+ # the random document is not the same as the document
331
+ # we're processing.
332
+ for _ in range(10):
333
+ random_document_index = rng.randint(0, len(all_documents) - 1)
334
+ if random_document_index != document_index:
335
+ break
336
+
337
+ random_document = all_documents[random_document_index]
338
+ random_start = rng.randint(0, len(random_document) - 1)
339
+ for j in range(random_start, len(random_document)):
340
+ tokens_b.extend(random_document[j])
341
+ if len(tokens_b) >= target_b_length:
342
+ break
343
+ # We didn't actually use these segments so we "put them back" so
344
+ # they don't go to waste.
345
+ num_unused_segments = len(current_chunk) - a_end
346
+ i -= num_unused_segments
347
+ # Actual next
348
+ else:
349
+ is_random_next = False
350
+ for j in range(a_end, len(current_chunk)):
351
+ tokens_b.extend(current_chunk[j])
352
+ truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
353
+
354
+ assert len(tokens_a) >= 1
355
+ assert len(tokens_b) >= 1
356
+
357
+ tokens = []
358
+ segment_ids = []
359
+ tokens.append("[CLS]")
360
+ segment_ids.append(0)
361
+ for token in tokens_a:
362
+ tokens.append(token)
363
+ segment_ids.append(0)
364
+
365
+ tokens.append("[SEP]")
366
+ segment_ids.append(0)
367
+
368
+ for token in tokens_b:
369
+ tokens.append(token)
370
+ segment_ids.append(1)
371
+ tokens.append("[SEP]")
372
+ segment_ids.append(1)
373
+
374
+ (tokens, masked_lm_positions,
375
+ masked_lm_labels) = create_masked_lm_predictions(
376
+ tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng,
377
+ do_whole_word_mask, max_ngram_size)
378
+ instance = TrainingInstance(
379
+ tokens=tokens,
380
+ segment_ids=segment_ids,
381
+ is_random_next=is_random_next,
382
+ masked_lm_positions=masked_lm_positions,
383
+ masked_lm_labels=masked_lm_labels)
384
+ instances.append(instance)
385
+ current_chunk = []
386
+ current_length = 0
387
+ i += 1
388
+
389
+ return instances
390
+
391
+
392
+ MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
393
+ ["index", "label"])
394
+
395
+ # A _Gram is a [half-open) interval of token indices which form a word.
396
+ # E.g.,
397
+ # words: ["The", "doghouse"]
398
+ # tokens: ["The", "dog", "##house"]
399
+ # grams: [(0,1), (1,3)]
400
+ _Gram = collections.namedtuple("_Gram", ["begin", "end"])
401
+
402
+
403
+ def _window(iterable, size):
404
+ """Helper to create a sliding window iterator with a given size.
405
+
406
+ E.g.,
407
+ input = [1, 2, 3, 4]
408
+ _window(input, 1) => [1], [2], [3], [4]
409
+ _window(input, 2) => [1, 2], [2, 3], [3, 4]
410
+ _window(input, 3) => [1, 2, 3], [2, 3, 4]
411
+ _window(input, 4) => [1, 2, 3, 4]
412
+ _window(input, 5) => None
413
+
414
+ Args:
415
+ iterable: elements to iterate over.
416
+ size: size of the window.
417
+
418
+ Yields:
419
+ Elements of `iterable` batched into a sliding window of length `size`.
420
+ """
421
+ i = iter(iterable)
422
+ window = []
423
+ try:
424
+ for e in range(0, size):
425
+ window.append(next(i))
426
+ yield window
427
+ except StopIteration:
428
+ # handle the case where iterable's length is less than the window size.
429
+ return
430
+ for e in i:
431
+ window = window[1:] + [e]
432
+ yield window
433
+
434
+
435
+ def _contiguous(sorted_grams):
436
+ """Test whether a sequence of grams is contiguous.
437
+
438
+ Args:
439
+ sorted_grams: _Grams which are sorted in increasing order.
440
+ Returns:
441
+ True if `sorted_grams` are touching each other.
442
+
443
+ E.g.,
444
+ _contiguous([(1, 4), (4, 5), (5, 10)]) == True
445
+ _contiguous([(1, 2), (4, 5)]) == False
446
+ """
447
+ for a, b in _window(sorted_grams, 2):
448
+ if a.end != b.begin:
449
+ return False
450
+ return True
451
+
452
+
453
+ def _masking_ngrams(grams, max_ngram_size, max_masked_tokens, rng):
454
+ """Create a list of masking {1, ..., n}-grams from a list of one-grams.
455
+
456
+ This is an extension of 'whole word masking' to mask multiple, contiguous
457
+ words (e.g., "the red boat").
458
+
459
+ Each input gram represents the token indices of a single word,
460
+ words: ["the", "red", "boat"]
461
+ tokens: ["the", "red", "boa", "##t"]
462
+ grams: [(0,1), (1,2), (2,4)]
463
+
464
+ For a `max_ngram_size` of three, possible outputs masks include:
465
+ 1-grams: (0,1), (1,2), (2,4)
466
+ 2-grams: (0,2), (1,4)
467
+ 3-grams; (0,4)
468
+
469
+ Output masks will not overlap and will contain at most `max_masked_tokens` total
470
+ tokens. E.g., for the example above with `max_masked_tokens` as three,
471
+ valid outputs are,
472
+ [(0,1), (1,2)] # "the", "red" covering two tokens
473
+ [(1,2), (2,4)] # "red", "boa", "##t" covering three tokens
474
+
475
+ The length of the selected n-gram follows a zipf weighting to
476
+ favor shorter n-gram sizes (weight(1)=1, weight(2)=1/2, weight(3)=1/3, ...).
477
+
478
+ Args:
479
+ grams: List of one-grams.
480
+ max_ngram_size: Maximum number of contiguous one-grams combined to create
481
+ an n-gram.
482
+ max_masked_tokens: Maximum total number of tokens to be masked.
483
+ rng: `random.Random` generator.
484
+
485
+ Returns:
486
+ A list of n-grams to be used as masks.
487
+ """
488
+ if not grams:
489
+ return None
490
+
491
+ grams = sorted(grams)
492
+ num_tokens = grams[-1].end
493
+
494
+ # Ensure our grams are valid (i.e., they don't overlap).
495
+ for a, b in _window(grams, 2):
496
+ if a.end > b.begin:
497
+ raise ValueError("overlapping grams: {}".format(grams))
498
+
499
+ # Build map from n-gram length to list of n-grams.
500
+ ngrams = {i: [] for i in range(1, max_ngram_size+1)}
501
+ for gram_size in range(1, max_ngram_size+1):
502
+ for g in _window(grams, gram_size):
503
+ if _contiguous(g):
504
+ # Add an n-gram which spans these one-grams.
505
+ ngrams[gram_size].append(_Gram(g[0].begin, g[-1].end))
506
+
507
+ # Shuffle each list of n-grams.
508
+ for v in ngrams.values():
509
+ rng.shuffle(v)
510
+
511
+ # Create the weighting for n-gram length selection.
512
+ # Stored cumulatively for `random.choices` below.
513
+ cummulative_weights = list(
514
+ itertools.accumulate([1./n for n in range(1, max_ngram_size+1)]))
515
+
516
+ output_ngrams = []
517
+ # Keep a bitmask of which tokens have been masked.
518
+ masked_tokens = [False] * num_tokens
519
+ # Loop until we have enough masked tokens or there are no more candidate
520
+ # n-grams of any length.
521
+ # Each code path should ensure one or more elements from `ngrams` are removed
522
+ # to guarantee this loop terminates.
523
+ while (sum(masked_tokens) < max_masked_tokens and
524
+ sum(len(s) for s in ngrams.values())):
525
+ # Pick an n-gram size based on our weights.
526
+ sz = random.choices(range(1, max_ngram_size+1),
527
+ cum_weights=cummulative_weights)[0]
528
+
529
+ # Ensure this size doesn't result in too many masked tokens.
530
+ # E.g., a two-gram contains _at least_ two tokens.
531
+ if sum(masked_tokens) + sz > max_masked_tokens:
532
+ # All n-grams of this length are too long and can be removed from
533
+ # consideration.
534
+ ngrams[sz].clear()
535
+ continue
536
+
537
+ # All of the n-grams of this size have been used.
538
+ if not ngrams[sz]:
539
+ continue
540
+
541
+ # Choose a random n-gram of the given size.
542
+ gram = ngrams[sz].pop()
543
+ num_gram_tokens = gram.end-gram.begin
544
+
545
+ # Check if this would add too many tokens.
546
+ if num_gram_tokens + sum(masked_tokens) > max_masked_tokens:
547
+ continue
548
+
549
+ # Check if any of the tokens in this gram have already been masked.
550
+ if sum(masked_tokens[gram.begin:gram.end]):
551
+ continue
552
+
553
+ # Found a usable n-gram! Mark its tokens as masked and add it to return.
554
+ masked_tokens[gram.begin:gram.end] = [True] * (gram.end-gram.begin)
555
+ output_ngrams.append(gram)
556
+ return output_ngrams
557
+
558
+
559
+ def _tokens_to_grams(tokens):
560
+ """Reconstitue grams (words) from `tokens`.
561
+
562
+ E.g.,
563
+ tokens: ['[CLS]', 'That', 'lit', '##tle', 'blue', 'tru', '##ck', '[SEP]']
564
+ grams: [ [1,2), [2, 4), [4,5) , [5, 6)]
565
+
566
+ Args:
567
+ tokens: list of tokens (word pieces or sentence pieces).
568
+
569
+ Returns:
570
+ List of _Grams representing spans of whole words
571
+ (without "[CLS]" and "[SEP]").
572
+ """
573
+ grams = []
574
+ gram_start_pos = None
575
+ for i, token in enumerate(tokens):
576
+ if gram_start_pos is not None and token.startswith("##"):
577
+ continue
578
+ if gram_start_pos is not None:
579
+ grams.append(_Gram(gram_start_pos, i))
580
+ if token not in ["[CLS]", "[SEP]"]:
581
+ gram_start_pos = i
582
+ else:
583
+ gram_start_pos = None
584
+ if gram_start_pos is not None:
585
+ grams.append(_Gram(gram_start_pos, len(tokens)))
586
+ return grams
587
+
588
+
589
+ def create_masked_lm_predictions(tokens, masked_lm_prob,
590
+ max_predictions_per_seq, vocab_words, rng,
591
+ do_whole_word_mask,
592
+ max_ngram_size=None):
593
+ """Creates the predictions for the masked LM objective."""
594
+ if do_whole_word_mask:
595
+ grams = _tokens_to_grams(tokens)
596
+ else:
597
+ # Here we consider each token to be a word to allow for sub-word masking.
598
+ if max_ngram_size:
599
+ raise ValueError("cannot use ngram masking without whole word masking")
600
+ grams = [_Gram(i, i+1) for i in range(0, len(tokens))
601
+ if tokens[i] not in ["[CLS]", "[SEP]"]]
602
+
603
+ num_to_predict = min(max_predictions_per_seq,
604
+ max(1, int(round(len(tokens) * masked_lm_prob))))
605
+ # Generate masks. If `max_ngram_size` in [0, None] it means we're doing
606
+ # whole word masking or token level masking. Both of these can be treated
607
+ # as the `max_ngram_size=1` case.
608
+ masked_grams = _masking_ngrams(grams, max_ngram_size or 1,
609
+ num_to_predict, rng)
610
+ masked_lms = []
611
+ output_tokens = list(tokens)
612
+ for gram in masked_grams:
613
+ # 80% of the time, replace all n-gram tokens with [MASK]
614
+ if rng.random() < 0.8:
615
+ replacement_action = lambda idx: "[MASK]"
616
+ else:
617
+ # 10% of the time, keep all the original n-gram tokens.
618
+ if rng.random() < 0.5:
619
+ replacement_action = lambda idx: tokens[idx]
620
+ # 10% of the time, replace each n-gram token with a random word.
621
+ else:
622
+ replacement_action = lambda idx: rng.choice(vocab_words)
623
+
624
+ for idx in range(gram.begin, gram.end):
625
+ output_tokens[idx] = replacement_action(idx)
626
+ masked_lms.append(MaskedLmInstance(index=idx, label=tokens[idx]))
627
+
628
+ assert len(masked_lms) <= num_to_predict
629
+ masked_lms = sorted(masked_lms, key=lambda x: x.index)
630
+
631
+ masked_lm_positions = []
632
+ masked_lm_labels = []
633
+ for p in masked_lms:
634
+ masked_lm_positions.append(p.index)
635
+ masked_lm_labels.append(p.label)
636
+
637
+ return (output_tokens, masked_lm_positions, masked_lm_labels)
638
+
639
+
640
+ def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
641
+ """Truncates a pair of sequences to a maximum sequence length."""
642
+ while True:
643
+ total_length = len(tokens_a) + len(tokens_b)
644
+ if total_length <= max_num_tokens:
645
+ break
646
+
647
+ trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
648
+ assert len(trunc_tokens) >= 1
649
+
650
+ # We want to sometimes truncate from the front and sometimes from the
651
+ # back to add more randomness and avoid biases.
652
+ if rng.random() < 0.5:
653
+ del trunc_tokens[0]
654
+ else:
655
+ trunc_tokens.pop()
656
+
657
+
658
+ def get_processor_text_fn(is_sentence_piece, do_lower_case):
659
+ def processor_text_fn(text):
660
+ text = tokenization.convert_to_unicode(text)
661
+ if is_sentence_piece:
662
+ # Additional preprocessing specific to the SentencePiece tokenizer.
663
+ text = tokenization.preprocess_text(text, lower=do_lower_case)
664
+
665
+ return text.strip()
666
+
667
+ return processor_text_fn
668
+
669
+
670
+ def main(_):
671
+ if FLAGS.tokenization == "WordPiece":
672
+ tokenizer = tokenization.FullTokenizer(
673
+ vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case
674
+ )
675
+ processor_text_fn = get_processor_text_fn(False, FLAGS.do_lower_case)
676
+ else:
677
+ assert FLAGS.tokenization == "SentencePiece"
678
+ tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
679
+ processor_text_fn = get_processor_text_fn(True, FLAGS.do_lower_case)
680
+
681
+ input_files = []
682
+ for input_pattern in FLAGS.input_file.split(","):
683
+ input_files.extend(tf.io.gfile.glob(input_pattern))
684
+
685
+ logging.info("*** Reading from input files ***")
686
+ for input_file in input_files:
687
+ logging.info(" %s", input_file)
688
+
689
+ rng = random.Random(FLAGS.random_seed)
690
+ instances = create_training_instances(
691
+ input_files,
692
+ tokenizer,
693
+ processor_text_fn,
694
+ FLAGS.max_seq_length,
695
+ FLAGS.dupe_factor,
696
+ FLAGS.short_seq_prob,
697
+ FLAGS.masked_lm_prob,
698
+ FLAGS.max_predictions_per_seq,
699
+ rng,
700
+ FLAGS.do_whole_word_mask,
701
+ FLAGS.max_ngram_size,
702
+ )
703
+
704
+ output_files = FLAGS.output_file.split(",")
705
+ logging.info("*** Writing to output files ***")
706
+ for output_file in output_files:
707
+ logging.info(" %s", output_file)
708
+
709
+ write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
710
+ FLAGS.max_predictions_per_seq, output_files,
711
+ FLAGS.gzip_compress,
712
+ FLAGS.use_v2_feature_names)
713
+
714
+
715
+ if __name__ == "__main__":
716
+ flags.mark_flag_as_required("input_file")
717
+ flags.mark_flag_as_required("output_file")
718
+ app.run(main)
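A sketch of a typical invocation (paths are placeholders; whole-word and n-gram masking are enabled here purely for illustration, the remaining values restate the flag defaults above):

# python create_pretraining_data.py \
#     --input_file=/path/to/corpus-*.txt \
#     --output_file=/tmp/pretrain.tfrecord \
#     --vocab_file=/path/to/vocab.txt \
#     --do_lower_case=True \
#     --max_seq_length=128 \
#     --max_predictions_per_seq=20 \
#     --masked_lm_prob=0.15 \
#     --dupe_factor=10 \
#     --do_whole_word_mask=True \
#     --max_ngram_size=3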
create_pretraining_data_test.py ADDED
@@ -0,0 +1,128 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for official.nlp.data.create_pretraining_data."""
16
+ import random
17
+
18
+ import tensorflow as tf, tf_keras
19
+
20
+ from official.nlp.data import create_pretraining_data as cpd
21
+
22
+ _VOCAB_WORDS = ["vocab_1", "vocab_2"]
23
+
24
+
25
+ class CreatePretrainingDataTest(tf.test.TestCase):
26
+
27
+ def assertTokens(self, input_tokens, output_tokens, masked_positions,
28
+ masked_labels):
29
+ # Ensure the masked positions are unique.
30
+ self.assertCountEqual(masked_positions, set(masked_positions))
31
+
32
+ # Ensure we can reconstruct the input from the output.
33
+ reconstructed_tokens = output_tokens
34
+ for pos, label in zip(masked_positions, masked_labels):
35
+ reconstructed_tokens[pos] = label
36
+ self.assertEqual(input_tokens, reconstructed_tokens)
37
+
38
+ # Ensure each label is valid.
39
+ for pos, label in zip(masked_positions, masked_labels):
40
+ output_token = output_tokens[pos]
41
+ if (output_token == "[MASK]" or output_token in _VOCAB_WORDS or
42
+ output_token == input_tokens[pos]):
43
+ continue
44
+ self.fail("invalid mask value: {}".format(output_token))
45
+
46
+ def test_tokens_to_grams(self):
47
+ tests = [
48
+ (["That", "cone"], [(0, 1), (1, 2)]),
49
+ (["That", "cone", "##s"], [(0, 1), (1, 3)]),
50
+ (["Swit", "##zer", "##land"], [(0, 3)]),
51
+ (["[CLS]", "Up", "##dog"], [(1, 3)]),
52
+ (["[CLS]", "Up", "##dog", "[SEP]", "Down"], [(1, 3), (4, 5)]),
53
+ ]
54
+ for inp, expected in tests:
55
+ output = cpd._tokens_to_grams(inp)
56
+ self.assertEqual(expected, output)
57
+
58
+ def test_window(self):
59
+ input_list = [1, 2, 3, 4]
60
+ window_outputs = [
61
+ (1, [[1], [2], [3], [4]]),
62
+ (2, [[1, 2], [2, 3], [3, 4]]),
63
+ (3, [[1, 2, 3], [2, 3, 4]]),
64
+ (4, [[1, 2, 3, 4]]),
65
+ (5, []),
66
+ ]
67
+ for window, expected in window_outputs:
68
+ output = cpd._window(input_list, window)
69
+ self.assertEqual(expected, list(output))
70
+
71
+ def test_create_masked_lm_predictions(self):
72
+ tokens = ["[CLS]", "a", "##a", "b", "##b", "c", "##c", "[SEP]"]
73
+ rng = random.Random(123)
74
+ for _ in range(0, 5):
75
+ output_tokens, masked_positions, masked_labels = (
76
+ cpd.create_masked_lm_predictions(
77
+ tokens=tokens,
78
+ masked_lm_prob=1.0,
79
+ max_predictions_per_seq=3,
80
+ vocab_words=_VOCAB_WORDS,
81
+ rng=rng,
82
+ do_whole_word_mask=False,
83
+ max_ngram_size=None))
84
+ self.assertLen(masked_positions, 3)
85
+ self.assertLen(masked_labels, 3)
86
+ self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)
87
+
88
+ def test_create_masked_lm_predictions_whole_word(self):
89
+ tokens = ["[CLS]", "a", "##a", "b", "##b", "c", "##c", "[SEP]"]
90
+ rng = random.Random(345)
91
+ for _ in range(0, 5):
92
+ output_tokens, masked_positions, masked_labels = (
93
+ cpd.create_masked_lm_predictions(
94
+ tokens=tokens,
95
+ masked_lm_prob=1.0,
96
+ max_predictions_per_seq=3,
97
+ vocab_words=_VOCAB_WORDS,
98
+ rng=rng,
99
+ do_whole_word_mask=True,
100
+ max_ngram_size=None))
101
+ # since we can't get exactly three tokens without breaking a word we
102
+ # only take two.
103
+ self.assertLen(masked_positions, 2)
104
+ self.assertLen(masked_labels, 2)
105
+ self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)
106
+ # ensure that we took an entire word.
107
+ self.assertIn(masked_labels, [["a", "##a"], ["b", "##b"], ["c", "##c"]])
108
+
109
+ def test_create_masked_lm_predictions_ngram(self):
110
+ tokens = ["[CLS]"] + ["tok{}".format(i) for i in range(0, 512)] + ["[SEP]"]
111
+ rng = random.Random(345)
112
+ for _ in range(0, 5):
113
+ output_tokens, masked_positions, masked_labels = (
114
+ cpd.create_masked_lm_predictions(
115
+ tokens=tokens,
116
+ masked_lm_prob=1.0,
117
+ max_predictions_per_seq=76,
118
+ vocab_words=_VOCAB_WORDS,
119
+ rng=rng,
120
+ do_whole_word_mask=True,
121
+ max_ngram_size=3))
122
+ self.assertLen(masked_positions, 76)
123
+ self.assertLen(masked_labels, 76)
124
+ self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)
125
+
126
+
127
+ if __name__ == "__main__":
128
+ tf.test.main()
create_xlnet_pretraining_data.py ADDED
@@ -0,0 +1,721 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Create LM TF examples for XLNet."""
16
+
17
+ import dataclasses
18
+ import json
19
+ import math
20
+ import os
21
+
22
+ import random
23
+ from typing import Iterable, Mapping, List, Optional, Tuple
24
+ import unicodedata
25
+
26
+ # Import libraries
27
+
28
+ from absl import app
29
+ from absl import flags
30
+ from absl import logging
31
+
32
+ import numpy as np
33
+ import tensorflow as tf, tf_keras
34
+
35
+ from official.nlp.tools import tokenization
36
+
37
+ special_symbols = {
38
+ "<unk>": 0,
39
+ "<s>": 1,
40
+ "</s>": 2,
41
+ "<cls>": 3,
42
+ "<sep>": 4,
43
+ "<pad>": 5,
44
+ "<mask>": 6,
45
+ "<eod>": 7,
46
+ "<eop>": 8,
47
+ }
48
+
49
+ FLAGS = flags.FLAGS
50
+
51
+ flags.DEFINE_integer("seq_length", 512,
52
+ help="Sequence length.")
53
+ flags.DEFINE_integer("reuse_length", 256,
54
+ help="Number of token that can be reused as memory. "
55
+ "Could be half of `seq_len`.")
56
+ flags.DEFINE_string("input_file", None,
57
+ "Input raw text file (or comma-separated list of files).")
58
+ flags.DEFINE_string(
59
+ "save_dir", None,
60
+ "Directory for saving processed data.")
61
+ flags.DEFINE_string("sp_model_file", "",
62
+ "The path to the model used by sentence piece tokenizer.")
63
+ flags.DEFINE_bool("use_eod_token", True,
64
+ "Whether or not to include EOD tokens.")
65
+ flags.DEFINE_bool("bi_data", True, "Whether or not to use bi-directional data.")
66
+ flags.DEFINE_bool(
67
+ "do_lower_case", True,
68
+ "Whether to lower case the input text. Should be True for uncased "
69
+ "models and False for cased models.")
70
+ flags.DEFINE_integer("per_host_batch_size", 32, "Batch size per host.")
71
+ flags.DEFINE_integer("num_cores_per_host", 16,
72
+ "The number of (TPU) cores per host.")
73
+ flags.DEFINE_string("prefix", "", "Filename prefix.")
74
+ flags.DEFINE_string("suffix", "", "Filename suffix.")
75
+
76
+ flags.DEFINE_integer("task_id", None,
77
+ "The id of the current task.")
78
+ flags.DEFINE_integer("num_tasks", None,
79
+ "The total number of tasks.")
80
+ flags.DEFINE_integer("num_passes", 1, "The number of times to run the script.")
81
+
82
+
83
+ @dataclasses.dataclass
84
+ class TrainingInstance:
85
+ """Representation of a single XLNet Pretraining instance."""
86
+ data: Iterable[int]
87
+ segment_ids: Iterable[int]
88
+ boundary_indices: Iterable[int]
89
+ label: int
90
+
91
+ def to_feature(self) -> Mapping[str, tf.train.Feature]:
92
+ feat = lambda x: tf.train.Feature(int64_list=tf.train.Int64List(value=x))
93
+ return dict(
94
+ input_word_ids=feat(self.data),
95
+ input_type_ids=feat(self.segment_ids),
96
+ boundary_indices=feat(self.boundary_indices),
97
+ label=feat([self.label]))
98
+
99
+ def to_example(self) -> tf.train.Example:
100
+ return tf.train.Example(
101
+ features=tf.train.Features(feature=self.to_feature()))
102
+
103
+ def __str__(self):
104
+ def seq_to_str(seq):
105
+ return " ".join([str(x) for x in seq])
106
+
107
+ s = ""
108
+ s += "tokens: %s\n" % seq_to_str(self.data)
109
+ s += "segment_ids: %s\n" % seq_to_str(self.segment_ids)
110
+ s += "boundary_indices: %s\n" % seq_to_str(self.boundary_indices)
111
+ s += "label: %s\n" % self.label
112
+ s += "\n"
113
+ return s
114
+
115
+ def __repr__(self):
116
+ return self.__str__()
117
+
118
+
119
+ def _preprocess_line(line: str, do_lower_case: bool = False) -> str:
120
+ """Preprocesses an individual raw text line.
121
+
122
+ This function will:
123
+ - Removes extraneous spaces.
124
+ - Replaces `` and '' with ".
125
+ - Removes accents.
126
+ - Applies lower casing.
127
+
128
+ Args:
129
+ line: The input line to preprocess.
130
+ do_lower_case: Whether or not to lower case the text.
131
+
132
+ Returns:
133
+ The preprocessed line.
134
+
135
+ """
136
+ line = " ".join(line.split())
137
+ line = line.replace("``", "\"").replace("''", "\"")
138
+
139
+ # Replace accents.
140
+ line = unicodedata.normalize("NFKD", line)
141
+ line = "".join([c for c in line if not unicodedata.combining(c)])
142
+
143
+ if do_lower_case:
144
+ line = line.lower()
145
+ return line
146
+
147
+
148
+ def preprocess_and_tokenize_input_files(
149
+ input_files: Iterable[str],
150
+ tokenizer: tokenization.FullSentencePieceTokenizer,
151
+ use_eod: bool = True,
152
+ do_lower_case: bool = False,
153
+ log_example_freq: int = 100000) -> List[Tuple[np.array, np.array]]:
154
+ """Preprocesses and encodes raw text from input files.
155
+
156
+ This function preprocesses raw text and encodes it into token IDs using a
157
+ `SentencePieceModel` tokenization method. This also provides the sentence
158
+ indicator for each token.
159
+
160
+ Args:
161
+ input_files: The list of input file names.
162
+ tokenizer: The SentencePiece tokenizer that has the attribute `sp_model`.
163
+ use_eod: Whether or not to use an EOD indicator. If `False`, then EOD is
164
+ not included.
165
+ do_lower_case: Whether or not to apply lower casing during raw text
166
+ preprocessing.
167
+ log_example_freq: The optional field for how many lines to process before
168
+ emitting an info log.
169
+
170
+ Returns:
171
+ The preprocessed list. Each entry in the list is a tuple consisting of
172
+ the token IDs and the sentence IDs.
173
+
174
+ """
175
+ all_data = []
176
+ eod_symbol = special_symbols["<eod>"]
177
+
178
+ total_number_of_lines = 0
179
+
180
+ # Input file format:
181
+ # (1) One sentence per line. These should ideally be actual sentences, not
182
+ # entire paragraphs or arbitrary spans of text. (Because we use the
183
+ # sentence boundaries for the "next sentence prediction" task).
184
+ # (2) Blank lines between documents. Document boundaries are needed so
185
+ # that the "next sentence prediction" task doesn't span between documents.
186
+ for input_file in input_files:
187
+ line_count = 0
188
+ logging.info("Preprocessing %s", input_file)
189
+
190
+ all_tokens = []
191
+ all_sentence_ids = []
192
+
193
+ sentence_id = True
194
+
195
+ with tf.io.gfile.GFile(input_file, "rb") as reader:
196
+ while True:
197
+ line = tokenization.convert_to_unicode(reader.readline())
198
+ if not line:
199
+ break
200
+
201
+ line_count += 1
202
+ if line_count % log_example_freq == 0:
203
+ logging.info("Loading line %d", line_count)
204
+
205
+ line = line.strip()
206
+
207
+ if not line:
208
+ if use_eod:
209
+ token_ids = [eod_symbol]
210
+ sentence_id = not sentence_id
211
+ else:
212
+ continue
213
+ else:
214
+ preprocessed_line = _preprocess_line(
215
+ line=line, do_lower_case=do_lower_case)
216
+ token_ids = tokenization.encode_ids(
217
+ sp_model=tokenizer.sp_model, text=preprocessed_line)
218
+
219
+ all_tokens.extend(token_ids)
220
+ all_sentence_ids.extend([sentence_id] * len(token_ids))
221
+ sentence_id = not sentence_id
222
+ logging.info("Finished processing %s. Number of lines: %d",
223
+ input_file, line_count)
224
+ if line_count == 0:
225
+ continue
226
+ total_number_of_lines += line_count
227
+ all_tokens = np.array(all_tokens, dtype=np.int64)
228
+ all_sentence_ids = np.array(all_sentence_ids, dtype=bool)
229
+ all_data.append((all_tokens, all_sentence_ids))
230
+
231
+ logging.info("Completed text preprocessing. Total number of lines: %d",
232
+ total_number_of_lines)
233
+ return all_data
234
+
235
+
236
+ def _reshape_to_batch_dimensions(
237
+ tokens: np.array,
238
+ sentence_ids: np.array,
239
+ per_host_batch_size: int) -> Tuple[np.array, np.array]:
240
+ """Truncates and reshapes input data with a batch major dimension.
241
+
242
+ Args:
243
+ tokens: The input token ids. This should have the same shape as
244
+ `sentence_ids`.
245
+ sentence_ids: The input sentence ids. This should have the same shape as
246
+ `token_ids`.
247
+ per_host_batch_size: The target per-host batch size.
248
+
249
+ Returns:
250
+ The tuple of reshaped tokens and sentence_ids.
251
+ """
252
+ num_steps = len(tokens) // per_host_batch_size
253
+ truncated_data_length = num_steps * per_host_batch_size
254
+
255
+ logging.info("per_host_batch_size: %d", per_host_batch_size)
256
+ logging.info("num_steps: %d", num_steps)
257
+ def truncate_and_reshape(a):
258
+ return a[:truncated_data_length].reshape((per_host_batch_size, num_steps))
259
+
260
+ return (truncate_and_reshape(tokens), truncate_and_reshape(sentence_ids))
261
+
262
+
263
+ def _create_a_and_b_segments(
264
+ tokens: np.array,
265
+ sentence_ids: np.array,
266
+ begin_index: int,
267
+ total_length: int,
268
+ no_cut_probability: float = 0.5):
269
+ """Splits segments A and B from a single instance of tokens and sentence ids.
270
+
271
+ Args:
272
+ tokens: The 1D input token ids. This represents an individual entry within a
273
+ batch.
274
+ sentence_ids: The 1D input sentence ids. This represents an individual entry
275
+ within a batch. This should be the same length as `tokens`.
276
+ begin_index: The reference beginning index to split data.
277
+ total_length: The target combined length of segments A and B.
278
+ no_cut_probability: The probability of not cutting a segment despite
279
+ a cut possibly existing.
280
+
281
+ Returns:
282
+ A tuple consisting of A data, B data, and label.
283
+
284
+ """
285
+ data_length = tokens.shape[0]
286
+ if begin_index + total_length >= data_length:
287
+ logging.info("[_create_segments]: begin_index %d + total_length %d >= "
288
+ "data_length %d", begin_index, total_length, data_length)
289
+ return None
290
+
291
+ end_index = begin_index + 1
292
+ cut_indices = []
293
+
294
+ # Identify all indices where sentence IDs change from one to the next.
295
+ while end_index < data_length:
296
+ if sentence_ids[end_index] != sentence_ids[end_index - 1]:
297
+ if end_index - begin_index >= total_length:
298
+ break
299
+ cut_indices.append(end_index)
300
+ end_index += 1
301
+
302
+ a_begin = begin_index
303
+
304
+ if not cut_indices or random.random() < no_cut_probability:
305
+ # Segments A and B are contained within the same sentence.
306
+ label = 0
307
+ if not cut_indices:
308
+ a_end = end_index
309
+ else:
310
+ a_end = random.choice(cut_indices)
311
+ b_length = max(1, total_length - (a_end - a_begin))
312
+ b_begin = random.randint(0, data_length - 1 - b_length)
313
+ b_end = b_begin + b_length
314
+
315
+ while b_begin > 0 and sentence_ids[b_begin - 1] == sentence_ids[b_begin]:
316
+ b_begin -= 1
317
+ while (b_end < data_length - 1 and
318
+ sentence_ids[b_end - 1] == sentence_ids[b_end]):
319
+ b_end += 1
320
+ else:
321
+ # Segments A and B are different sentences.
322
+ label = 1
323
+ a_end = random.choice(cut_indices)
324
+ b_begin = a_end
325
+ b_end = end_index
326
+
327
+ while a_end - a_begin + b_end - b_begin > total_length:
328
+ if a_end - a_begin > b_end - b_begin:
329
+ # Delete only the right side for the LM objective.
330
+ a_end -= 1
331
+ else:
332
+ b_end -= 1
333
+ if a_end >= data_length or b_end >= data_length:
334
+ logging.info("[_create_segments]: a_end %d or b_end %d >= data_length %d",
335
+ a_end, b_end, data_length)
336
+ return None
337
+
338
+ a_data = tokens[a_begin: a_end]
339
+ b_data = tokens[b_begin: b_end]
340
+ return a_data, b_data, label
341
+
342
+
343
+ def _is_functional_piece(piece: str) -> bool:
344
+ return piece != "<unk>" and piece.startswith("<") and piece.endswith(">")
345
+
346
+
347
+ def _is_start_piece(piece: str) -> bool:
348
+ special_pieces = set(list('!"#$%&\"()*+,-./:;?@[\\]^_`{|}~'))
349
+ if (piece.startswith("▁") or piece in special_pieces):
350
+ return True
351
+ else:
352
+ return False
353
+
354
+
355
+ def _get_boundary_indices(
356
+ data: np.array,
357
+ tokenizer: tokenization.FullSentencePieceTokenizer) -> np.array:
358
+ """Gets the boundary indices of whole words."""
359
+ seq_length = len(data)
360
+ boundary_indices = []
361
+ for index, piece in enumerate(tokenizer.convert_ids_to_tokens(data.tolist())):
362
+ if _is_start_piece(piece) and not _is_functional_piece(piece):
363
+ boundary_indices.append(index)
364
+ boundary_indices.append(seq_length)
365
+ return boundary_indices
366
+
367
+
368
+ def _convert_tokens_to_instances(
369
+ tokens: np.array,
370
+ sentence_ids: np.array,
371
+ per_host_batch_size: int,
372
+ seq_length: int,
373
+ reuse_length: int,
374
+ bi_data: bool,
375
+ tokenizer: tokenization.FullSentencePieceTokenizer,
376
+ num_cores_per_host: int = 0,
377
+ logging_frequency: int = 500) -> List[TrainingInstance]:
378
+ """Converts tokens and sentence IDs into individual training instances.
379
+
380
+ The format of data in the XLNet pretraining task is very similar to the
381
+ BERT pretraining task. Two segments A and B are randomly sampled, and the
382
+ concatenation of A and B into a single sequence is used to perform
383
+ language modeling.
384
+
385
+ To create an XLNet Pretraining instance from a single long sequence, S:
386
+ - Create a segment of length `reuse_length`. This first segment represents
387
+ past tokens. During modeling, this segment is used to cache obtained
388
+ content representations for the segment recurrence mechanism.
389
+ - Similar to BERT, create a segment of length `seq_length` - `reuse_length`
390
+ composed of A and B segments.
391
+ For XLNet, the order is "A", "SEP", "B", "SEP", "CLS".
392
+
393
+ Args:
394
+ tokens: All tokens concatenated into a single list.
395
+ sentence_ids: All sentence IDs concatenated into a single list.
396
+ per_host_batch_size: The target batch size per host.
397
+ seq_length: The max sequence length.
398
+ reuse_length: The number of tokens to use from the previous segment.
399
+ bi_data: Whether or not to use bidirectional data.
400
+ tokenizer: The SentencePiece tokenizer that has the attribute `sp_model`.
401
+ num_cores_per_host: The number of cores per host. This is required if
402
+ `bi_data` = `True`.
403
+ logging_frequency: The frequency at which to log status updates.
404
+
405
+ Returns:
406
+ A list of `TrainingInstance` objects.
407
+ """
408
+ instances = []
409
+
410
+ per_core_batch_size = (per_host_batch_size // num_cores_per_host
411
+ if bi_data else None)
412
+
413
+ if bi_data:
414
+ logging.info("Bi-directional data enabled.")
415
+ assert per_host_batch_size % (2 * num_cores_per_host) == 0
416
+ forward_tokens, forward_sentence_ids = _reshape_to_batch_dimensions(
417
+ tokens=tokens,
418
+ sentence_ids=sentence_ids,
419
+ per_host_batch_size=per_host_batch_size // 2)
420
+ forward_data_shape = (num_cores_per_host, 1, per_core_batch_size // 2, -1)
421
+
422
+ forward_tokens = forward_tokens.reshape(forward_data_shape)
423
+ forward_sentence_ids = forward_sentence_ids.reshape(forward_data_shape)
424
+
425
+ backwards_tokens = forward_tokens[:, :, :, ::-1]
426
+ backwards_sentence_ids = forward_sentence_ids[:, :, :, ::-1]
427
+
428
+ tokens = np.concatenate([forward_tokens, backwards_tokens], 1).reshape(
429
+ per_host_batch_size, -1)
430
+ sentence_ids = np.concatenate(
431
+ [forward_sentence_ids, backwards_sentence_ids]).reshape(
432
+ per_host_batch_size, -1)
433
+ else:
434
+ logging.info("Bi-directional data disabled.")
435
+ tokens, sentence_ids = _reshape_to_batch_dimensions(
436
+ tokens=tokens,
437
+ sentence_ids=sentence_ids,
438
+ per_host_batch_size=per_host_batch_size)
439
+
440
+ logging.info("Tokens shape: %s", tokens.shape)
441
+
442
+ data_length = tokens.shape[1]
443
+ sep = np.array([special_symbols["<sep>"]], dtype=np.int64)
444
+ cls = np.array([special_symbols["<cls>"]], dtype=np.int64)
445
+ # 2 sep, 1 cls
446
+ num_special_tokens = 3
447
+
448
+ data_index = 0
449
+ batch_number = 0
450
+ step_size = reuse_length if reuse_length else seq_length
451
+ num_batches = math.ceil(data_length / step_size)
452
+
453
+ while data_index + seq_length <= data_length:
454
+ if batch_number % logging_frequency == 0:
455
+ logging.info("Processing batch %d of %d", batch_number, num_batches)
456
+
457
+ for batch_index in range(per_host_batch_size):
458
+ previous_segment_tokens = tokens[
459
+ batch_index, data_index: data_index + reuse_length]
460
+
461
+ results = _create_a_and_b_segments(
462
+ tokens=tokens[batch_index],
463
+ sentence_ids=sentence_ids[batch_index],
464
+ begin_index=data_index + reuse_length,
465
+ total_length=seq_length - reuse_length - num_special_tokens)
466
+
467
+ if results is None:
468
+ logging.info("Stopping at data index: %d", data_index)
469
+ break
470
+ a_data, b_data, label = results
471
+
472
+ data = np.concatenate(
473
+ [previous_segment_tokens, a_data, sep, b_data, sep, cls])
474
+ a_length = a_data.shape[0]
475
+ b_length = b_data.shape[0]
476
+ segment_ids = ([0] * (reuse_length + a_length) + [0]
477
+ + [1] * b_length + [1] + [2])
478
+ boundary_indices = _get_boundary_indices(tokenizer=tokenizer,
479
+ data=data)
480
+ assert len(data) == seq_length
481
+ assert len(segment_ids) == seq_length
482
+ assert len(boundary_indices) > 0 # pylint: disable=g-explicit-length-test
483
+
484
+ instances.append(TrainingInstance(
485
+ data=data,
486
+ segment_ids=segment_ids,
487
+ boundary_indices=boundary_indices,
488
+ label=label))
489
+ batch_number += 1
490
+ data_index += step_size
491
+ return instances
492
+
493
+
494
+ def write_instances_to_tfrecord(
495
+ instances: Iterable[TrainingInstance],
496
+ save_path: str):
497
+ """Writes instances to TFRecord."""
498
+ record_writer = tf.io.TFRecordWriter(save_path)
499
+ logging.info("Start writing to %s.", save_path)
500
+
501
+ for i, instance in enumerate(instances):
502
+ if i < 5:
503
+ logging.info("Instance %d: %s", i, str(instance))
504
+ record_writer.write(instance.to_example().SerializeToString())
505
+
506
+ record_writer.close()
507
+ logging.info("Done writing %s.", save_path)
508
+
509
+
510
+ def shuffle_and_combine_preprocessed_data(
511
+ all_data: List[Tuple[np.array, np.array]]) -> Tuple[np.array, np.array]:
512
+ """Shuffles and combines preprocessed token/sentence IDs from documents."""
513
+ document_permutation = np.random.permutation(len(all_data))
514
+
515
+ previous_sentence_id = None
516
+
517
+ all_tokens, all_sentence_ids = [], []
518
+ for document_index in document_permutation:
519
+ tokens, sentence_ids = all_data[document_index]
520
+ # pylint: disable=g-explicit-length-test
521
+ if len(tokens) == 0:
522
+ continue
523
+ if (previous_sentence_id is not None and
524
+ sentence_ids[0] == previous_sentence_id):
525
+ sentence_ids = np.logical_not(sentence_ids)
526
+
527
+ all_tokens.append(tokens)
528
+ all_sentence_ids.append(sentence_ids)
529
+
530
+ previous_sentence_id = sentence_ids[-1]
531
+
532
+ return np.concatenate(all_tokens), np.concatenate(all_sentence_ids)
533
+
534
+
535
+ def get_tfrecord_name(
536
+ per_host_batch_size: int,
537
+ num_cores_per_host: int,
538
+ seq_length: int,
539
+ bi_data: bool,
540
+ reuse_length: int,
541
+ do_lower_case: bool,
542
+ use_eod_token: bool,
543
+ prefix: str = "",
544
+ suffix: str = "",
545
+ pass_id: int = 0,
546
+ num_passes: int = 1,
547
+ task_id: int = None,
548
+ num_tasks: int = None) -> str:
549
+ """Formats the resulting TFRecord name based on provided inputs."""
550
+ components = []
551
+ if prefix:
552
+ components.append(prefix)
553
+ components.append("seqlen-{}".format(seq_length))
554
+ if reuse_length == 0:
555
+ components.append("memless")
556
+ else:
557
+ components.append("reuse-{}".format(reuse_length))
558
+ components.append("bs-{}".format(per_host_batch_size))
559
+ components.append("cores-{}".format(num_cores_per_host))
560
+
561
+ if do_lower_case:
562
+ components.append("uncased")
563
+ else:
564
+ components.append("cased")
565
+ if use_eod_token:
566
+ components.append("eod")
567
+ if bi_data:
568
+ components.append("bi")
569
+ else:
570
+ components.append("uni")
571
+
572
+ if suffix:
573
+ components.append(suffix)
574
+
575
+ s = "_".join(components) + ".tfrecord"
576
+ if num_passes == 1 and task_id is None:
577
+ return s
578
+
579
+ if task_id is None:
580
+ num_tasks = 1
581
+ task_id = 0
582
+
583
+ current_shard = task_id * num_passes + pass_id
584
+ total_shards = num_tasks * num_passes
585
+ return s + "-{}-of-{}".format(current_shard, total_shards)
586
+
587
+
588
+ def create_tfrecords(
589
+ tokenizer: tokenization.FullSentencePieceTokenizer,
590
+ input_file_or_files: str,
591
+ use_eod_token: bool,
592
+ do_lower_case: bool,
593
+ per_host_batch_size: int,
594
+ seq_length: int,
595
+ reuse_length: int,
596
+ bi_data: bool,
597
+ num_cores_per_host: int,
598
+ save_dir: str,
599
+ prefix: str = "",
600
+ suffix: str = "",
601
+ num_tasks: Optional[int] = None,
602
+ task_id: Optional[int] = None,
603
+ num_passes: int = 1):
604
+ """Runs the end-to-end preprocessing pipeline."""
605
+
606
+ logging.info("Input configuration:")
607
+ logging.info("input file(s): %s", input_file_or_files)
608
+ logging.info("use_eod_token: %s", use_eod_token)
609
+ logging.info("do_lower_case: %s", do_lower_case)
610
+ logging.info("per_host_batch_size: %d", per_host_batch_size)
611
+ logging.info("seq_length: %d", seq_length)
612
+ logging.info("reuse_length: %d", reuse_length)
613
+ logging.info("bi_data: %s", bi_data)
614
+ logging.info("num_cores_per_host: %d", num_cores_per_host)
615
+ logging.info("save_dir: %s", save_dir)
616
+ if task_id is not None and num_tasks is not None:
617
+ logging.info("task_id: %d", task_id)
618
+ logging.info("num_tasks: %d", num_tasks)
619
+
620
+ input_files = []
621
+ for input_pattern in input_file_or_files.split(","):
622
+ input_files.extend(tf.io.gfile.glob(input_pattern))
623
+
624
+ logging.info("*** Reading from input files ***")
625
+ for input_file in input_files:
626
+ logging.info(" %s", input_file)
627
+
628
+ logging.info("Shuffling the files with a fixed random seed.")
629
+ np.random.shuffle(input_files)
630
+ if num_tasks is not None:
631
+ assert task_id is not None
632
+ logging.info("Total number of input files: %d", len(input_files))
633
+ logging.info("Splitting into %d shards of %d files each.",
634
+ num_tasks, len(input_files) // num_tasks)
635
+ input_files = input_files[task_id::num_tasks]
636
+
637
+ all_data = preprocess_and_tokenize_input_files(
638
+ input_files=input_files,
639
+ tokenizer=tokenizer,
640
+ use_eod=use_eod_token,
641
+ do_lower_case=do_lower_case)
642
+ for pass_id in range(num_passes):
643
+ logging.info("Beginning pass %d of %d", pass_id, num_passes)
644
+ tokens, sentence_ids = shuffle_and_combine_preprocessed_data(all_data)
645
+
646
+ assert len(tokens) == len(sentence_ids)
647
+
648
+ filename = get_tfrecord_name(
649
+ per_host_batch_size=per_host_batch_size,
650
+ num_cores_per_host=num_cores_per_host,
651
+ seq_length=seq_length,
652
+ bi_data=bi_data,
653
+ use_eod_token=use_eod_token,
654
+ reuse_length=reuse_length,
655
+ do_lower_case=do_lower_case,
656
+ prefix=prefix,
657
+ suffix=suffix,
658
+ pass_id=pass_id,
659
+ num_passes=num_passes,
660
+ num_tasks=num_tasks,
661
+ task_id=task_id)
662
+ save_path = os.path.join(save_dir, filename)
663
+ if os.path.exists(save_path):
664
+ # If the path already exists, then we were probably preempted but
665
+ # previously wrote this file.
666
+ logging.info("%s already exists, skipping this batch.", save_path)
667
+ else:
668
+ instances = _convert_tokens_to_instances(
669
+ tokenizer=tokenizer,
670
+ tokens=tokens,
671
+ sentence_ids=sentence_ids,
672
+ per_host_batch_size=per_host_batch_size,
673
+ seq_length=seq_length,
674
+ reuse_length=reuse_length,
675
+ bi_data=bi_data,
676
+ num_cores_per_host=num_cores_per_host)
677
+ write_instances_to_tfrecord(instances=instances, save_path=save_path)
678
+
679
+ if task_id is None or task_id == 0:
680
+ corpus_info = {
681
+ "vocab_size": 32000,
682
+ "per_host_batch_size": per_host_batch_size,
683
+ "num_cores_per_host": num_cores_per_host,
684
+ "seq_length": seq_length,
685
+ "reuse_length": reuse_length,
686
+ "do_lower_case": do_lower_case,
687
+ "bi_data": bi_data,
688
+ "use_eod_token": use_eod_token,
689
+ }
690
+ corpus_fname = os.path.basename(filename) + ".json"
691
+ corpus_destination = os.path.join(save_dir, corpus_fname)
692
+ logging.info("Saving corpus info to %s", corpus_destination)
693
+
694
+ with tf.io.gfile.GFile(corpus_destination, "w") as fp:
695
+ json.dump(corpus_info, fp)
696
+
697
+
698
+ def main(_):
699
+ tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
700
+ create_tfrecords(
701
+ tokenizer=tokenizer,
702
+ input_file_or_files=FLAGS.input_file,
703
+ use_eod_token=FLAGS.use_eod_token,
704
+ do_lower_case=FLAGS.do_lower_case,
705
+ per_host_batch_size=FLAGS.per_host_batch_size,
706
+ seq_length=FLAGS.seq_length,
707
+ reuse_length=FLAGS.reuse_length,
708
+ bi_data=FLAGS.bi_data,
709
+ num_cores_per_host=FLAGS.num_cores_per_host,
710
+ save_dir=FLAGS.save_dir,
711
+ prefix=FLAGS.prefix,
712
+ suffix=FLAGS.suffix,
713
+ num_tasks=FLAGS.num_tasks,
714
+ task_id=FLAGS.task_id,
715
+ num_passes=FLAGS.num_passes)
716
+
717
+
718
+ if __name__ == "__main__":
719
+ np.random.seed(0)
720
+ logging.set_verbosity(logging.INFO)
721
+ app.run(main)
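
As a usage sketch, the pipeline above can also be driven directly from Python instead of the flag-based entry point, mirroring what `main()` does; the corpus glob, SentencePiece model path, and output directory below are hypothetical placeholders, and the remaining arguments simply echo the flag defaults.

# Sketch only; paths are hypothetical placeholders.
from official.nlp.data import create_xlnet_pretraining_data as cpd
from official.nlp.tools import tokenization

tokenizer = tokenization.FullSentencePieceTokenizer("/tmp/spiece.model")
cpd.create_tfrecords(
    tokenizer=tokenizer,
    input_file_or_files="/tmp/corpus-*.txt",  # comma-separated globs are accepted
    use_eod_token=True,
    do_lower_case=True,
    per_host_batch_size=32,   # must be divisible by 2 * num_cores_per_host when bi_data=True
    seq_length=512,
    reuse_length=256,
    bi_data=True,
    num_cores_per_host=16,
    save_dir="/tmp/xlnet_pretrain")
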
create_xlnet_pretraining_data_test.py ADDED
@@ -0,0 +1,355 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for official.nlp.data.create_xlnet_pretraining_data."""
16
+ import os
17
+ import tempfile
18
+ from typing import List
19
+
20
+ from absl import logging
21
+ from absl.testing import parameterized
22
+
23
+ import numpy as np
24
+ import tensorflow as tf, tf_keras
25
+
26
+ from official.nlp.data import create_xlnet_pretraining_data as cpd
27
+
28
+ _VOCAB_WORDS = ["vocab_1", "vocab_2"]
29
+
30
+
31
+ # pylint: disable=invalid-name
32
+ def _create_files(
33
+ temp_dir: str, file_contents: List[List[str]]) -> List[str]:
34
+ """Writes arbitrary documents into files."""
35
+ root_dir = tempfile.mkdtemp(dir=temp_dir)
36
+ files = []
37
+
38
+ for i, file_content in enumerate(file_contents):
39
+ destination = os.path.join(root_dir, "%d.txt" % i)
40
+ with open(destination, "wb") as f:
41
+ for line in file_content:
42
+ f.write(line.encode("utf-8"))
43
+ files.append(destination)
44
+ return files
45
+
46
+
47
+ def _get_mock_tokenizer():
48
+ """Creates a mock tokenizer."""
49
+
50
+ class MockSpieceModel:
51
+ """Mock Spiece model for testing."""
52
+
53
+ def __init__(self):
54
+ self._special_piece_to_id = {
55
+ "<unk>": 0,
56
+ }
57
+ for piece in set(list('!"#$%&\"()*+,-./:;?@[\\]^_`{|}~')):
58
+ self._special_piece_to_id[piece] = 1
59
+
60
+ def EncodeAsPieces(self, inputs: str) -> List[str]:
61
+ return inputs
62
+
63
+ def SampleEncodeAsPieces(self,
64
+ inputs: str,
65
+ nbest_size: int,
66
+ theta: float) -> List[str]:
67
+ del nbest_size, theta
68
+ return inputs
69
+
70
+ def PieceToId(self, piece: str) -> int:
71
+ return ord(piece[0])
72
+
73
+ def IdToPiece(self, id_: int) -> str:
74
+ return chr(id_) * 3
75
+
76
+ class Tokenizer:
77
+ """Mock Tokenizer for testing."""
78
+
79
+ def __init__(self):
80
+ self.sp_model = MockSpieceModel()
81
+
82
+ def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
83
+ return [self.sp_model.IdToPiece(id_) for id_ in ids]
84
+
85
+ return Tokenizer()
86
+
87
+
88
+ class PreprocessDataTest(tf.test.TestCase):
89
+
90
+ def test_remove_extraneous_space(self):
91
+ line = " abc "
92
+ output = cpd._preprocess_line(line)
93
+ self.assertEqual(output, "abc")
94
+
95
+ def test_symbol_replacements(self):
96
+ self.assertEqual(cpd._preprocess_line("``abc``"), "\"abc\"")
97
+ self.assertEqual(cpd._preprocess_line("''abc''"), "\"abc\"")
98
+
99
+ def test_accent_replacements(self):
100
+ self.assertEqual(cpd._preprocess_line("åbc"), "abc")
101
+
102
+ def test_lower_case(self):
103
+ self.assertEqual(cpd._preprocess_line("ABC", do_lower_case=True), "abc")
104
+
105
+ def test_end_to_end(self):
106
+ self.assertEqual(
107
+ cpd._preprocess_line("HelLo ``wΓ³rLd``", do_lower_case=True),
108
+ "hello \"world\"")
109
+
110
+
111
+ class PreprocessAndTokenizeFilesTest(tf.test.TestCase):
112
+
113
+ def test_basic_end_to_end(self):
114
+ documents = [
115
+ [
116
+ "This is sentence 1.\n",
117
+ "This is sentence 2.\n",
118
+ "Sentence 3 is what this is.\n",
119
+ ],
120
+ [
121
+ "This is the second document.\n",
122
+ "This is the second line of the second document.\n"
123
+ ],
124
+ ]
125
+ input_files = _create_files(temp_dir=self.get_temp_dir(),
126
+ file_contents=documents)
127
+ all_data = cpd.preprocess_and_tokenize_input_files(
128
+ input_files=input_files,
129
+ tokenizer=_get_mock_tokenizer(),
130
+ log_example_freq=1)
131
+
132
+ self.assertEqual(len(all_data), len(documents))
133
+ for token_ids, sentence_ids in all_data:
134
+ self.assertEqual(len(token_ids), len(sentence_ids))
135
+
136
+ def test_basic_correctness(self):
137
+ documents = [["a\n", "b\n", "c\n"]]
138
+ input_files = _create_files(temp_dir=self.get_temp_dir(),
139
+ file_contents=documents)
140
+ all_data = cpd.preprocess_and_tokenize_input_files(
141
+ input_files=input_files,
142
+ tokenizer=_get_mock_tokenizer(),
143
+ log_example_freq=1)
144
+
145
+ token_ids, sentence_ids = all_data[0]
146
+
147
+ self.assertAllClose(token_ids, [97, 98, 99])
148
+ self.assertAllClose(sentence_ids, [True, False, True])
149
+
150
+ def test_correctness_with_spaces_and_accents(self):
151
+ documents = [[
152
+ " Γ₯ \n",
153
+ "b \n",
154
+ " c \n",
155
+ ]]
156
+ input_files = _create_files(temp_dir=self.get_temp_dir(),
157
+ file_contents=documents)
158
+ all_data = cpd.preprocess_and_tokenize_input_files(
159
+ input_files=input_files,
160
+ tokenizer=_get_mock_tokenizer(),
161
+ log_example_freq=1)
162
+
163
+ token_ids, sentence_ids = all_data[0]
164
+
165
+ self.assertAllClose(token_ids, [97, 98, 99])
166
+ self.assertAllClose(sentence_ids, [True, False, True])
167
+
168
+
169
+ class BatchReshapeTests(tf.test.TestCase):
170
+
171
+ def test_basic_functionality(self):
172
+ per_host_batch_size = 3
173
+ mock_shape = (20,)
174
+
175
+ # Should truncate and reshape.
176
+ expected_result_shape = (3, 6)
177
+
178
+ tokens = np.zeros(mock_shape)
179
+ sentence_ids = np.zeros(mock_shape)
180
+
181
+ reshaped_data = cpd._reshape_to_batch_dimensions(
182
+ tokens=tokens,
183
+ sentence_ids=sentence_ids,
184
+ per_host_batch_size=per_host_batch_size)
185
+ for values in reshaped_data:
186
+ self.assertEqual(len(values.flatten()) % per_host_batch_size, 0)
187
+ self.assertAllClose(values.shape, expected_result_shape)
188
+
189
+
190
+ class CreateSegmentsTest(tf.test.TestCase):
191
+
192
+ def test_basic_functionality(self):
193
+ data_length = 10
194
+ tokens = np.arange(data_length)
195
+ sentence_ids = np.concatenate([np.zeros(data_length // 2),
196
+ np.ones(data_length // 2)])
197
+ begin_index = 0
198
+ total_length = 8
199
+ a_data, b_data, label = cpd._create_a_and_b_segments(
200
+ tokens=tokens,
201
+ sentence_ids=sentence_ids,
202
+ begin_index=begin_index,
203
+ total_length=total_length,
204
+ no_cut_probability=0.)
205
+ self.assertAllClose(a_data, [0, 1, 2, 3])
206
+ self.assertAllClose(b_data, [5, 6, 7, 8])
207
+ self.assertEqual(label, 1)
208
+
209
+ def test_no_cut(self):
210
+ data_length = 10
211
+ tokens = np.arange(data_length)
212
+ sentence_ids = np.zeros(data_length)
213
+
214
+ begin_index = 0
215
+ total_length = 8
216
+ a_data, b_data, label = cpd._create_a_and_b_segments(
217
+ tokens=tokens,
218
+ sentence_ids=sentence_ids,
219
+ begin_index=begin_index,
220
+ total_length=total_length,
221
+ no_cut_probability=0.)
222
+ self.assertGreater(len(a_data), 0)
223
+ self.assertGreater(len(b_data), 0)
224
+ self.assertEqual(label, 0)
225
+
226
+ def test_no_cut_with_probability(self):
227
+ data_length = 10
228
+ tokens = np.arange(data_length)
229
+ sentence_ids = np.concatenate([np.zeros(data_length // 2),
230
+ np.ones(data_length // 2)])
231
+ begin_index = 0
232
+ total_length = 8
233
+ a_data, b_data, label = cpd._create_a_and_b_segments(
234
+ tokens=tokens,
235
+ sentence_ids=sentence_ids,
236
+ begin_index=begin_index,
237
+ total_length=total_length,
238
+ no_cut_probability=1.)
239
+ self.assertGreater(len(a_data), 0)
240
+ self.assertGreater(len(b_data), 0)
241
+ self.assertEqual(label, 0)
242
+
243
+
244
+ class CreateInstancesTest(tf.test.TestCase):
245
+ """Tests conversions of Token/Sentence IDs to training instances."""
246
+
247
+ def test_basic(self):
248
+ data_length = 12
249
+ tokens = np.arange(data_length)
250
+ sentence_ids = np.zeros(data_length)
251
+ seq_length = 8
252
+ instances = cpd._convert_tokens_to_instances(
253
+ tokens=tokens,
254
+ sentence_ids=sentence_ids,
255
+ per_host_batch_size=2,
256
+ seq_length=seq_length,
257
+ reuse_length=4,
258
+ tokenizer=_get_mock_tokenizer(),
259
+ bi_data=False,
260
+ num_cores_per_host=1,
261
+ logging_frequency=1)
262
+ for instance in instances:
263
+ self.assertEqual(len(instance.data), seq_length)
264
+ self.assertEqual(len(instance.segment_ids), seq_length)
265
+ self.assertIsInstance(instance.label, int)
266
+ self.assertIsInstance(instance.boundary_indices, list)
267
+
268
+
269
+ class TFRecordPathTests(tf.test.TestCase):
270
+
271
+ def test_basic(self):
272
+ base_kwargs = dict(
273
+ per_host_batch_size=1,
274
+ num_cores_per_host=1,
275
+ seq_length=2,
276
+ reuse_length=1)
277
+
278
+ config1 = dict(
279
+ prefix="test",
280
+ suffix="",
281
+ bi_data=True,
282
+ use_eod_token=False,
283
+ do_lower_case=True)
284
+ config1.update(base_kwargs)
285
+ expectation1 = "test_seqlen-2_reuse-1_bs-1_cores-1_uncased_bi.tfrecord"
286
+ self.assertEqual(cpd.get_tfrecord_name(**config1), expectation1)
287
+
288
+ config2 = dict(
289
+ prefix="",
290
+ suffix="test",
291
+ bi_data=False,
292
+ use_eod_token=False,
293
+ do_lower_case=False)
294
+ config2.update(base_kwargs)
295
+ expectation2 = "seqlen-2_reuse-1_bs-1_cores-1_cased_uni_test.tfrecord"
296
+ self.assertEqual(cpd.get_tfrecord_name(**config2), expectation2)
297
+
298
+ config3 = dict(
299
+ prefix="",
300
+ suffix="",
301
+ use_eod_token=True,
302
+ bi_data=False,
303
+ do_lower_case=True)
304
+ config3.update(base_kwargs)
305
+ expectation3 = "seqlen-2_reuse-1_bs-1_cores-1_uncased_eod_uni.tfrecord"
306
+ self.assertEqual(cpd.get_tfrecord_name(**config3), expectation3)
307
+
308
+
309
+ class TestCreateTFRecords(parameterized.TestCase, tf.test.TestCase):
310
+
311
+ @parameterized.named_parameters(
312
+ ("bi_data_only", True, False, False),
313
+ ("eod_token_only", False, True, True),
314
+ ("lower_case_only", False, False, True),
315
+ ("all_enabled", True, True, True),
316
+ )
317
+ def test_end_to_end(self,
318
+ bi_data: bool,
319
+ use_eod_token: bool,
320
+ do_lower_case: bool):
321
+ tokenizer = _get_mock_tokenizer()
322
+
323
+ num_documents = 5
324
+ sentences_per_document = 10
325
+ document_length = 50
326
+
327
+ documents = [
328
+ ["a " * document_length for _ in range(sentences_per_document)]
329
+ for _ in range(num_documents)]
330
+
331
+ save_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
332
+ files = _create_files(temp_dir=self.get_temp_dir(), file_contents=documents)
333
+
334
+ cpd.create_tfrecords(
335
+ tokenizer=tokenizer,
336
+ input_file_or_files=",".join(files),
337
+ use_eod_token=use_eod_token,
338
+ do_lower_case=do_lower_case,
339
+ per_host_batch_size=8,
340
+ seq_length=8,
341
+ reuse_length=4,
342
+ bi_data=bi_data,
343
+ num_cores_per_host=2,
344
+ save_dir=save_dir)
345
+
346
+ self.assertTrue(any(filter(lambda x: x.endswith(".json"),
347
+ os.listdir(save_dir))))
348
+ self.assertTrue(any(filter(lambda x: x.endswith(".tfrecord"),
349
+ os.listdir(save_dir))))
350
+
351
+
352
+ if __name__ == "__main__":
353
+ np.random.seed(0)
354
+ logging.set_verbosity(logging.INFO)
355
+ tf.test.main()
data_loader.py ADDED
@@ -0,0 +1,48 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """An abstraction that NLP models define input pipelines."""
16
+
17
+ import abc
18
+ from typing import Optional
19
+
20
+ import tensorflow as tf, tf_keras
21
+
22
+
23
+ class DataLoader(metaclass=abc.ABCMeta):
24
+ """An abstract class defining the APIs for tf.data input pipeline."""
25
+
26
+ @abc.abstractmethod
27
+ def load(
28
+ self,
29
+ input_context: Optional[tf.distribute.InputContext] = None
30
+ ) -> tf.data.Dataset:
31
+ """Implements DataLoader load method.
32
+
33
+ Builds the entire input pipeline inside the load method. Users can define
34
+ states inside the DataLoader class and returns a tf.data dataset
35
+ object.
36
+
37
+ Args:
38
+ input_context: This is a context class that is passed to the user's input
39
+ function and contains information about the compute replicas and input
40
+ pipelines. This object is used for multi-host inputs and passed by the
41
+ distribution strategy.
42
+
43
+ Returns:
44
+ A per-host tf.data dataset. Note that, we usually create the distributed
45
+ dataset through the load method, so we should not directly return a
46
+ distributed dataset here.
47
+ """
48
+ pass
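
For reference, here is a minimal sketch of what a concrete subclass might look like; `MyTfRecordLoader`, its constructor arguments, and the file pattern are hypothetical, and real loaders in this directory are typically built on `input_reader.InputReader` rather than raw `tf.data` calls.

# Sketch only, under the assumptions stated above.
from typing import Optional

import tensorflow as tf

from official.nlp.data import data_loader


class MyTfRecordLoader(data_loader.DataLoader):
  """Hypothetical loader that batches raw TFRecord files."""

  def __init__(self, file_pattern: str, global_batch_size: int):
    self._file_pattern = file_pattern
    self._global_batch_size = global_batch_size

  def load(
      self,
      input_context: Optional[tf.distribute.InputContext] = None
  ) -> tf.data.Dataset:
    dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=True)
    if input_context:
      # Shard by input pipeline so each host reads a disjoint slice of files,
      # and derive the per-replica batch size from the global one.
      dataset = dataset.shard(input_context.num_input_pipelines,
                              input_context.input_pipeline_id)
      batch_size = input_context.get_per_replica_batch_size(
          self._global_batch_size)
    else:
      batch_size = self._global_batch_size
    dataset = dataset.interleave(
        tf.data.TFRecordDataset, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
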
data_loader_factory.py ADDED
@@ -0,0 +1,58 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """A global factory to access NLP registered data loaders."""
16
+
17
+ from official.core import registry
18
+
19
+ _REGISTERED_DATA_LOADER_CLS = {}
20
+
21
+
22
+ def register_data_loader_cls(data_config_cls):
23
+ """Decorates a factory of DataLoader for lookup by a subclass of DataConfig.
24
+
25
+ This decorator supports registration of data loaders as follows:
26
+
27
+ ```
28
+ @dataclasses.dataclass
29
+ class MyDataConfig(DataConfig):
30
+ # Add fields here.
31
+ pass
32
+
33
+ @register_data_loader_cls(MyDataConfig)
34
+ class MyDataLoader:
35
+ # Inherits def __init__(self, data_config).
36
+ pass
37
+
38
+ my_data_config = MyDataConfig()
39
+
40
+ # Returns MyDataLoader(my_data_config).
41
+ my_loader = get_data_loader(my_data_config)
42
+ ```
43
+
44
+ Args:
45
+ data_config_cls: a subclass of DataConfig (*not* an instance
46
+ of DataConfig).
47
+
48
+ Returns:
49
+ A callable for use as class decorator that registers the decorated class
50
+ for creation from an instance of data_config_cls.
51
+ """
52
+ return registry.register(_REGISTERED_DATA_LOADER_CLS, data_config_cls)
53
+
54
+
55
+ def get_data_loader(data_config):
56
+ """Creates a data_loader from data_config."""
57
+ return registry.lookup(_REGISTERED_DATA_LOADER_CLS, data_config.__class__)(
58
+ data_config)
data_loader_factory_test.py ADDED
@@ -0,0 +1,45 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for official.nlp.data.data_loader_factory."""
16
+
17
+ import dataclasses
18
+ import tensorflow as tf, tf_keras
19
+
20
+ from official.core import config_definitions as cfg
21
+ from official.nlp.data import data_loader_factory
22
+
23
+
24
+ @dataclasses.dataclass
25
+ class MyDataConfig(cfg.DataConfig):
26
+ is_training: bool = True
27
+
28
+
29
+ @data_loader_factory.register_data_loader_cls(MyDataConfig)
30
+ class MyDataLoader:
31
+
32
+ def __init__(self, params):
33
+ self.params = params
34
+
35
+
36
+ class DataLoaderFactoryTest(tf.test.TestCase):
37
+
38
+ def test_register_and_load(self):
39
+ train_config = MyDataConfig()
40
+ train_loader = data_loader_factory.get_data_loader(train_config)
41
+ self.assertTrue(train_loader.params.is_training)
42
+
43
+
44
+ if __name__ == "__main__":
45
+ tf.test.main()
dual_encoder_dataloader.py ADDED
@@ -0,0 +1,147 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Loads dataset for the dual encoder (retrieval) task."""
16
+ import dataclasses
17
+ import functools
18
+ import itertools
19
+ from typing import Iterable, Mapping, Optional, Tuple
20
+
21
+ import tensorflow as tf, tf_keras
22
+ import tensorflow_hub as hub
23
+
24
+ from official.common import dataset_fn
25
+ from official.core import config_definitions as cfg
26
+ from official.core import input_reader
27
+ from official.nlp.data import data_loader
28
+ from official.nlp.data import data_loader_factory
29
+ from official.nlp.modeling import layers
30
+
31
+
32
+ @dataclasses.dataclass
33
+ class DualEncoderDataConfig(cfg.DataConfig):
34
+ """Data config for dual encoder task (tasks/dual_encoder)."""
35
+ # Either set `input_path`...
36
+ input_path: str = ''
37
+ # ...or `tfds_name` and `tfds_split` to specify input.
38
+ tfds_name: str = ''
39
+ tfds_split: str = ''
40
+ global_batch_size: int = 32
41
+ # Either build preprocessing with Python code by specifying these values...
42
+ vocab_file: str = ''
43
+ lower_case: bool = True
44
+ # ...or load preprocessing from a SavedModel at this location.
45
+ preprocessing_hub_module_url: str = ''
46
+
47
+ left_text_fields: Tuple[str] = ('left_input',)
48
+ right_text_fields: Tuple[str] = ('right_input',)
49
+ is_training: bool = True
50
+ seq_length: int = 128
51
+ file_type: str = 'tfrecord'
52
+
53
+
54
+ @data_loader_factory.register_data_loader_cls(DualEncoderDataConfig)
55
+ class DualEncoderDataLoader(data_loader.DataLoader):
56
+ """A class to load dataset for dual encoder task (tasks/dual_encoder)."""
57
+
58
+ def __init__(self, params):
59
+ if bool(params.tfds_name) == bool(params.input_path):
60
+ raise ValueError('Must specify either `tfds_name` and `tfds_split` '
61
+ 'or `input_path`.')
62
+ if bool(params.vocab_file) == bool(params.preprocessing_hub_module_url):
63
+ raise ValueError('Must specify exactly one of vocab_file (with matching '
64
+ 'lower_case flag) or preprocessing_hub_module_url.')
65
+ self._params = params
66
+ self._seq_length = params.seq_length
67
+ self._left_text_fields = params.left_text_fields
68
+ self._right_text_fields = params.right_text_fields
69
+
70
+ if params.preprocessing_hub_module_url:
71
+ preprocessing_hub_module = hub.load(params.preprocessing_hub_module_url)
72
+ self._tokenizer = preprocessing_hub_module.tokenize
73
+ self._pack_inputs = functools.partial(
74
+ preprocessing_hub_module.bert_pack_inputs,
75
+ seq_length=params.seq_length)
76
+ else:
77
+ self._tokenizer = layers.BertTokenizer(
78
+ vocab_file=params.vocab_file, lower_case=params.lower_case)
79
+ self._pack_inputs = layers.BertPackInputs(
80
+ seq_length=params.seq_length,
81
+ special_tokens_dict=self._tokenizer.get_special_tokens_dict())
82
+
83
+ def _decode(self, record: tf.Tensor):
84
+ """Decodes a serialized tf.Example."""
85
+ name_to_features = {
86
+ x: tf.io.FixedLenFeature([], tf.string)
87
+ for x in itertools.chain(
88
+ *[self._left_text_fields, self._right_text_fields])
89
+ }
90
+ example = tf.io.parse_single_example(record, name_to_features)
91
+
92
+ # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
93
+ # So cast all int64 to int32.
94
+ for name in example:
95
+ t = example[name]
96
+ if t.dtype == tf.int64:
97
+ t = tf.cast(t, tf.int32)
98
+ example[name] = t
99
+
100
+ return example
101
+
102
+ def _bert_tokenize(
103
+ self, record: Mapping[str, tf.Tensor],
104
+ text_fields: Iterable[str]) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
105
+ """Tokenize the input in text_fields using BERT tokenizer.
106
+
107
+ Args:
108
+ record: A tf.Example record containing the features.
109
+ text_fields: A list of fields to be tokenized.
110
+
111
+ Returns:
112
+ The tokenized features in a tuple of (input_word_ids, input_mask,
113
+ input_type_ids).
114
+ """
115
+ segments_text = [record[x] for x in text_fields]
116
+ segments_tokens = [self._tokenizer(s) for s in segments_text]
117
+ segments = [tf.cast(x.merge_dims(1, 2), tf.int32) for x in segments_tokens]
118
+ return self._pack_inputs(segments)
119
+
120
+ def _bert_preprocess(
121
+ self, record: Mapping[str, tf.Tensor]) -> Mapping[str, tf.Tensor]:
122
+ """Perform the bert word piece tokenization for left and right inputs."""
123
+
124
+ def _switch_prefix(string, old, new):
125
+ if string.startswith(old): return new + string[len(old):]
126
+ raise ValueError('Expected {} to start with {}'.format(string, old))
127
+
128
+ def _switch_key_prefix(d, old, new):
129
+ return {_switch_prefix(key, old, new): value for key, value in d.items()} # pytype: disable=attribute-error # trace-all-classes
130
+
131
+ model_inputs = _switch_key_prefix(
132
+ self._bert_tokenize(record, self._left_text_fields),
133
+ 'input_', 'left_')
134
+ model_inputs.update(_switch_key_prefix(
135
+ self._bert_tokenize(record, self._right_text_fields),
136
+ 'input_', 'right_'))
137
+ return model_inputs
138
+
139
+ def load(self, input_context: Optional[tf.distribute.InputContext] = None):
140
+ """Returns a tf.dataset.Dataset."""
141
+ reader = input_reader.InputReader(
142
+ params=self._params,
143
+ # Skip `decoder_fn` for tfds input.
144
+ decoder_fn=self._decode if self._params.input_path else None,
145
+ dataset_fn=dataset_fn.pick_dataset_fn(self._params.file_type),
146
+ postprocess_fn=self._bert_preprocess)
147
+ return reader.read(input_context)
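
To make the two mutually exclusive preprocessing paths described in the config comments concrete, a minimal sketch of building both kinds of `DualEncoderDataConfig` follows; the TFRecord path and vocab path are hypothetical placeholders, while the TFDS name and TF Hub preprocessing handle match the ones exercised by the test file below.

# Sketch only: illustrates the two ways to configure preprocessing.
from official.nlp.data import dual_encoder_dataloader

# Option 1: tokenize in Python from a vocab file (hypothetical paths).
config_from_vocab = dual_encoder_dataloader.DualEncoderDataConfig(
    input_path="/tmp/dual_encoder_train.tfrecord",
    vocab_file="/tmp/vocab.txt",
    lower_case=True,
    seq_length=64,
    global_batch_size=32)

# Option 2: load preprocessing from a TF Hub SavedModel and read from TFDS.
config_from_hub = dual_encoder_dataloader.DualEncoderDataConfig(
    tfds_name="para_crawl/enmt",
    tfds_split="train",
    left_text_fields=("en",),
    right_text_fields=("mt",),
    preprocessing_hub_module_url=(
        "https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3"),
    seq_length=64,
    global_batch_size=32)

dataset = dual_encoder_dataloader.DualEncoderDataLoader(
    config_from_vocab).load()
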
dual_encoder_dataloader_test.py ADDED
@@ -0,0 +1,131 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for official.nlp.data.dual_encoder_dataloader."""
16
+ import os
17
+
18
+ from absl.testing import parameterized
19
+ import tensorflow as tf, tf_keras
20
+
21
+ from official.nlp.data import dual_encoder_dataloader
22
+
23
+
24
+ _LEFT_FEATURE_NAME = 'left_input'
25
+ _RIGHT_FEATURE_NAME = 'right_input'
26
+
27
+
28
+ def _create_fake_dataset(output_path):
29
+ """Creates a fake dataset contains examples for training a dual encoder model.
30
+
31
+ The created dataset contains examples with two bytes_list features keyed by
32
+ _LEFT_FEATURE_NAME and _RIGHT_FEATURE_NAME.
33
+
34
+ Args:
35
+ output_path: The output path of the fake dataset.
36
+ """
37
+ def create_str_feature(values):
38
+ return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))
39
+
40
+ with tf.io.TFRecordWriter(output_path) as writer:
41
+ for _ in range(100):
42
+ features = {}
43
+ features[_LEFT_FEATURE_NAME] = create_str_feature([b'hello world.'])
44
+ features[_RIGHT_FEATURE_NAME] = create_str_feature([b'world hello.'])
45
+
46
+ tf_example = tf.train.Example(
47
+ features=tf.train.Features(feature=features))
48
+ writer.write(tf_example.SerializeToString())
49
+
50
+
51
+ def _make_vocab_file(vocab, output_path):
52
+ with tf.io.gfile.GFile(output_path, 'w') as f:
53
+ f.write('\n'.join(vocab + ['']))
54
+
55
+
56
+ class DualEncoderDataTest(tf.test.TestCase, parameterized.TestCase):
57
+
58
+ def test_load_dataset(self):
59
+ seq_length = 16
60
+ batch_size = 10
61
+ train_data_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
62
+ vocab_path = os.path.join(self.get_temp_dir(), 'vocab.txt')
63
+
64
+ _create_fake_dataset(train_data_path)
65
+ _make_vocab_file(
66
+ ['[PAD]', '[UNK]', '[CLS]', '[SEP]', 'he', '#llo', 'world'], vocab_path)
67
+
68
+ data_config = dual_encoder_dataloader.DualEncoderDataConfig(
69
+ input_path=train_data_path,
70
+ seq_length=seq_length,
71
+ vocab_file=vocab_path,
72
+ lower_case=True,
73
+ left_text_fields=(_LEFT_FEATURE_NAME,),
74
+ right_text_fields=(_RIGHT_FEATURE_NAME,),
75
+ global_batch_size=batch_size)
76
+ dataset = dual_encoder_dataloader.DualEncoderDataLoader(
77
+ data_config).load()
78
+ features = next(iter(dataset))
79
+ self.assertCountEqual(
80
+ ['left_word_ids', 'left_mask', 'left_type_ids', 'right_word_ids',
81
+ 'right_mask', 'right_type_ids'],
82
+ features.keys())
83
+ self.assertEqual(features['left_word_ids'].shape, (batch_size, seq_length))
84
+ self.assertEqual(features['left_mask'].shape, (batch_size, seq_length))
85
+ self.assertEqual(features['left_type_ids'].shape, (batch_size, seq_length))
86
+ self.assertEqual(features['right_word_ids'].shape, (batch_size, seq_length))
87
+ self.assertEqual(features['right_mask'].shape, (batch_size, seq_length))
88
+ self.assertEqual(features['right_type_ids'].shape, (batch_size, seq_length))
89
+
90
+ @parameterized.parameters(False, True)
91
+ def test_load_tfds(self, use_preprocessing_hub):
92
+ seq_length = 16
93
+ batch_size = 10
94
+ if use_preprocessing_hub:
95
+ vocab_path = ''
96
+ preprocessing_hub = (
97
+ 'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3')
98
+ else:
99
+ vocab_path = os.path.join(self.get_temp_dir(), 'vocab.txt')
100
+ _make_vocab_file(
101
+ ['[PAD]', '[UNK]', '[CLS]', '[SEP]', 'he', '#llo', 'world'],
102
+ vocab_path)
103
+ preprocessing_hub = ''
104
+
105
+ data_config = dual_encoder_dataloader.DualEncoderDataConfig(
106
+ tfds_name='para_crawl/enmt',
107
+ tfds_split='train',
108
+ seq_length=seq_length,
109
+ vocab_file=vocab_path,
110
+ lower_case=True,
111
+ left_text_fields=('en',),
112
+ right_text_fields=('mt',),
113
+ preprocessing_hub_module_url=preprocessing_hub,
114
+ global_batch_size=batch_size)
115
+ dataset = dual_encoder_dataloader.DualEncoderDataLoader(
116
+ data_config).load()
117
+ features = next(iter(dataset))
118
+ self.assertCountEqual(
119
+ ['left_word_ids', 'left_mask', 'left_type_ids', 'right_word_ids',
120
+ 'right_mask', 'right_type_ids'],
121
+ features.keys())
122
+ self.assertEqual(features['left_word_ids'].shape, (batch_size, seq_length))
123
+ self.assertEqual(features['left_mask'].shape, (batch_size, seq_length))
124
+ self.assertEqual(features['left_type_ids'].shape, (batch_size, seq_length))
125
+ self.assertEqual(features['right_word_ids'].shape, (batch_size, seq_length))
126
+ self.assertEqual(features['right_mask'].shape, (batch_size, seq_length))
127
+ self.assertEqual(features['right_type_ids'].shape, (batch_size, seq_length))
128
+
129
+
130
+ if __name__ == '__main__':
131
+ tf.test.main()
pretrain_dataloader.py ADDED
@@ -0,0 +1,589 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Loads dataset for the BERT pretraining task."""
16
+ import dataclasses
17
+ from typing import Mapping, Optional
18
+
19
+ from absl import logging
20
+
21
+ import numpy as np
22
+ import tensorflow as tf, tf_keras
23
+ from official.common import dataset_fn
24
+ from official.core import config_definitions as cfg
25
+ from official.core import input_reader
26
+ from official.nlp.data import data_loader
27
+ from official.nlp.data import data_loader_factory
28
+
29
+
30
+ @dataclasses.dataclass
31
+ class BertPretrainDataConfig(cfg.DataConfig):
32
+ """Data config for BERT pretraining task (tasks/masked_lm)."""
33
+ input_path: str = ''
34
+ global_batch_size: int = 512
35
+ is_training: bool = True
36
+ seq_length: int = 512
37
+ max_predictions_per_seq: int = 76
38
+ use_next_sentence_label: bool = True
39
+ use_position_id: bool = False
40
+ # Historically, BERT implementations take `input_ids` and `segment_ids` as
41
+ # feature names. Inside the TF Model Garden implementation, the Keras model
42
+ # inputs are set as `input_word_ids` and `input_type_ids`. When
43
+ # v2_feature_names is True, the data loader assumes the tf.Examples use
44
+ # `input_word_ids` and `input_type_ids` as keys.
45
+ use_v2_feature_names: bool = False
46
+ file_type: str = 'tfrecord'
47
+
48
+
49
+ @data_loader_factory.register_data_loader_cls(BertPretrainDataConfig)
50
+ class BertPretrainDataLoader(data_loader.DataLoader):
51
+ """A class to load dataset for bert pretraining task."""
52
+
53
+ def __init__(self, params):
54
+ """Inits `BertPretrainDataLoader` class.
55
+
56
+ Args:
57
+ params: A `BertPretrainDataConfig` object.
58
+ """
59
+ self._params = params
60
+ self._seq_length = params.seq_length
61
+ self._max_predictions_per_seq = params.max_predictions_per_seq
62
+ self._use_next_sentence_label = params.use_next_sentence_label
63
+ self._use_position_id = params.use_position_id
64
+
65
+ def _name_to_features(self):
66
+ name_to_features = {
67
+ 'input_mask':
68
+ tf.io.FixedLenFeature([self._seq_length], tf.int64),
69
+ 'masked_lm_positions':
70
+ tf.io.FixedLenFeature([self._max_predictions_per_seq], tf.int64),
71
+ 'masked_lm_ids':
72
+ tf.io.FixedLenFeature([self._max_predictions_per_seq], tf.int64),
73
+ 'masked_lm_weights':
74
+ tf.io.FixedLenFeature([self._max_predictions_per_seq], tf.float32),
75
+ }
76
+ if self._params.use_v2_feature_names:
77
+ name_to_features.update({
78
+ 'input_word_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
79
+ 'input_type_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
80
+ })
81
+ else:
82
+ name_to_features.update({
83
+ 'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
84
+ 'segment_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
85
+ })
86
+ if self._use_next_sentence_label:
87
+ name_to_features['next_sentence_labels'] = tf.io.FixedLenFeature([1],
88
+ tf.int64)
89
+ if self._use_position_id:
90
+ name_to_features['position_ids'] = tf.io.FixedLenFeature(
91
+ [self._seq_length], tf.int64)
92
+ return name_to_features
93
+
94
+ def _decode(self, record: tf.Tensor):
95
+ """Decodes a serialized tf.Example."""
96
+ name_to_features = self._name_to_features()
97
+ example = tf.io.parse_single_example(record, name_to_features)
98
+
99
+ # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
100
+ # So cast all int64 to int32.
101
+ for name in list(example.keys()):
102
+ t = example[name]
103
+ if t.dtype == tf.int64:
104
+ t = tf.cast(t, tf.int32)
105
+ example[name] = t
106
+
107
+ return example
108
+
109
+ def _parse(self, record: Mapping[str, tf.Tensor]):
110
+ """Parses raw tensors into a dict of tensors to be consumed by the model."""
111
+ x = {
112
+ 'input_mask': record['input_mask'],
113
+ 'masked_lm_positions': record['masked_lm_positions'],
114
+ 'masked_lm_ids': record['masked_lm_ids'],
115
+ 'masked_lm_weights': record['masked_lm_weights'],
116
+ }
117
+ if self._params.use_v2_feature_names:
118
+ x['input_word_ids'] = record['input_word_ids']
119
+ x['input_type_ids'] = record['input_type_ids']
120
+ else:
121
+ x['input_word_ids'] = record['input_ids']
122
+ x['input_type_ids'] = record['segment_ids']
123
+ if self._use_next_sentence_label:
124
+ x['next_sentence_labels'] = record['next_sentence_labels']
125
+ if self._use_position_id:
126
+ x['position_ids'] = record['position_ids']
127
+
128
+ return x
129
+
130
+ def load(self, input_context: Optional[tf.distribute.InputContext] = None):
131
+ """Returns a tf.dataset.Dataset."""
132
+ reader = input_reader.InputReader(
133
+ params=self._params,
134
+ dataset_fn=dataset_fn.pick_dataset_fn(self._params.file_type),
135
+ decoder_fn=self._decode,
136
+ parser_fn=self._parse)
137
+ return reader.read(input_context)
138
+
139
+
140
+ @dataclasses.dataclass
141
+ class XLNetPretrainDataConfig(cfg.DataConfig):
142
+ """Data config for XLNet pretraining task.
143
+
144
+ Attributes:
145
+ input_path: See base class.
146
+ global_batch_size: See base class.
147
+ is_training: See base class.
148
+ seq_length: The length of each sequence.
149
+ max_predictions_per_seq: The number of predictions per sequence.
150
+ reuse_length: The number of tokens in a previous segment to reuse. This
151
+ should be the same value used during pretrain data creation.
152
+ sample_strategy: The strategy used to sample factorization permutations.
153
+ Possible values: 'single_token', 'whole_word', 'token_span', 'word_span'.
154
+ min_num_tokens: The minimum number of tokens to sample in a span. This is
155
+ used when `sample_strategy` is 'token_span'.
156
+ max_num_tokens: The maximum number of tokens to sample in a span. This is
157
+ used when `sample_strategy` is 'token_span'.
158
+ min_num_words: The minimum number of words to sample in a span. This is used
159
+ when `sample_strategy` is 'word_span'.
160
+ max_num_words: The maximum number of words to sample in a span. This is used
161
+ when `sample_strategy` is 'word_span'.
162
+ permutation_size: The length of the longest permutation. This can be set to
163
+ `reuse_length`. This should NOT be greater than `reuse_length`, otherwise
164
+ this may introduce data leaks.
165
+ leak_ratio: The percentage of masked tokens that are leaked.
166
+ segment_sep_id: The ID of the SEP token used when preprocessing the dataset.
167
+ segment_cls_id: The ID of the CLS token used when preprocessing the dataset.
168
+ """
169
+ input_path: str = ''
170
+ global_batch_size: int = 512
171
+ is_training: bool = True
172
+ seq_length: int = 512
173
+ max_predictions_per_seq: int = 76
174
+ reuse_length: int = 256
175
+ sample_strategy: str = 'word_span'
176
+ min_num_tokens: int = 1
177
+ max_num_tokens: int = 5
178
+ min_num_words: int = 1
179
+ max_num_words: int = 5
180
+ permutation_size: int = 256
181
+ leak_ratio: float = 0.1
182
+ segment_sep_id: int = 4
183
+ segment_cls_id: int = 3
184
+
185
+
186
+ @data_loader_factory.register_data_loader_cls(XLNetPretrainDataConfig)
187
+ class XLNetPretrainDataLoader(data_loader.DataLoader):
188
+ """A class to load dataset for xlnet pretraining task."""
189
+
190
+ def __init__(self, params: XLNetPretrainDataConfig):
191
+ """Inits `XLNetPretrainDataLoader` class.
192
+
193
+ Args:
194
+ params: A `XLNetPretrainDataConfig` object.
195
+ """
196
+ self._params = params
197
+ self._seq_length = params.seq_length
198
+ self._max_predictions_per_seq = params.max_predictions_per_seq
199
+ self._reuse_length = params.reuse_length
200
+ self._num_replicas_in_sync = None
201
+ self._permutation_size = params.permutation_size
202
+ self._sep_id = params.segment_sep_id
203
+ self._cls_id = params.segment_cls_id
204
+ self._sample_strategy = params.sample_strategy
205
+ self._leak_ratio = params.leak_ratio
206
+
207
+ def _decode(self, record: tf.Tensor):
208
+ """Decodes a serialized tf.Example."""
209
+ name_to_features = {
210
+ 'input_word_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
211
+ 'input_type_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
212
+ 'boundary_indices': tf.io.VarLenFeature(tf.int64),
213
+ }
214
+ example = tf.io.parse_single_example(record, name_to_features)
215
+
216
+ # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
217
+ # So cast all int64 to int32.
218
+ for name in list(example.keys()):
219
+ t = example[name]
220
+ if t.dtype == tf.int64:
221
+ t = tf.cast(t, tf.int32)
222
+ example[name] = t
223
+
224
+ return example
225
+
226
+ def _parse(self, record: Mapping[str, tf.Tensor]):
227
+ """Parses raw tensors into a dict of tensors to be consumed by the model."""
228
+ x = {}
229
+
230
+ inputs = record['input_word_ids']
231
+ x['input_type_ids'] = record['input_type_ids']
232
+
233
+ if self._sample_strategy in ['whole_word', 'word_span']:
234
+ boundary = tf.sparse.to_dense(record['boundary_indices'])
235
+ else:
236
+ boundary = None
237
+
238
+ input_mask = self._online_sample_mask(inputs=inputs, boundary=boundary)
239
+
240
+ if self._reuse_length > 0:
241
+ if self._permutation_size > self._reuse_length:
242
+ logging.warning(
243
+ '`permutation_size` is greater than `reuse_length` (%d > %d).'
244
+ 'This may introduce data leakage.', self._permutation_size,
245
+ self._reuse_length)
246
+
247
+ # Enable the memory mechanism.
248
+ # Permute the reuse and non-reuse segments separately.
249
+ non_reuse_len = self._seq_length - self._reuse_length
250
+ if not (self._reuse_length % self._permutation_size == 0 and
251
+ non_reuse_len % self._permutation_size == 0):
252
+ raise ValueError('`reuse_length` and `seq_length` should both be '
253
+ 'a multiple of `permutation_size`.')
254
+
255
+ # Creates permutation mask and target mask for the first reuse_len tokens.
256
+ # The tokens in this part are reused from the last sequence.
257
+ perm_mask_0, target_mask_0, tokens_0, masked_0 = self._get_factorization(
258
+ inputs=inputs[:self._reuse_length],
259
+ input_mask=input_mask[:self._reuse_length])
260
+
261
+ # Creates permutation mask and target mask for the rest of tokens in
262
+ # current example, which are concatenation of two new segments.
263
+ perm_mask_1, target_mask_1, tokens_1, masked_1 = self._get_factorization(
264
+ inputs[self._reuse_length:], input_mask[self._reuse_length:])
265
+
266
+ perm_mask_0 = tf.concat([
267
+ perm_mask_0,
268
+ tf.zeros([self._reuse_length, non_reuse_len], dtype=tf.int32)
269
+ ],
270
+ axis=1)
271
+ perm_mask_1 = tf.concat([
272
+ tf.ones([non_reuse_len, self._reuse_length], dtype=tf.int32),
273
+ perm_mask_1
274
+ ],
275
+ axis=1)
276
+ perm_mask = tf.concat([perm_mask_0, perm_mask_1], axis=0)
277
+ target_mask = tf.concat([target_mask_0, target_mask_1], axis=0)
278
+ tokens = tf.concat([tokens_0, tokens_1], axis=0)
279
+ masked_tokens = tf.concat([masked_0, masked_1], axis=0)
280
+ else:
281
+ # Disable the memory mechanism.
282
+ if self._seq_length % self._permutation_size != 0:
283
+ raise ValueError('`seq_length` should be a multiple of '
284
+ '`permutation_size`.')
285
+ # Permute the entire sequence together
286
+ perm_mask, target_mask, tokens, masked_tokens = self._get_factorization(
287
+ inputs=inputs, input_mask=input_mask)
288
+ x['permutation_mask'] = tf.reshape(perm_mask,
289
+ [self._seq_length, self._seq_length])
290
+ x['input_word_ids'] = tokens
291
+ x['masked_tokens'] = masked_tokens
292
+
293
+ target = tokens
294
+ if self._max_predictions_per_seq is not None:
295
+ indices = tf.range(self._seq_length, dtype=tf.int32)
296
+ bool_target_mask = tf.cast(target_mask, tf.bool)
297
+ indices = tf.boolean_mask(indices, bool_target_mask)
298
+
299
+ # account for extra padding due to CLS/SEP.
300
+ actual_num_predict = tf.shape(indices)[0]
301
+ pad_len = self._max_predictions_per_seq - actual_num_predict
302
+
303
+ target_mapping = tf.one_hot(indices, self._seq_length, dtype=tf.int32)
304
+ paddings = tf.zeros([pad_len, self._seq_length],
305
+ dtype=target_mapping.dtype)
306
+ target_mapping = tf.concat([target_mapping, paddings], axis=0)
307
+ x['target_mapping'] = tf.reshape(
308
+ target_mapping, [self._max_predictions_per_seq, self._seq_length])
309
+
310
+ target = tf.boolean_mask(target, bool_target_mask)
311
+ paddings = tf.zeros([pad_len], dtype=target.dtype)
312
+ target = tf.concat([target, paddings], axis=0)
313
+ x['target'] = tf.reshape(target, [self._max_predictions_per_seq])
314
+
315
+ target_mask = tf.concat([
316
+ tf.ones([actual_num_predict], dtype=tf.int32),
317
+ tf.zeros([pad_len], dtype=tf.int32)
318
+ ],
319
+ axis=0)
320
+ x['target_mask'] = tf.reshape(target_mask,
321
+ [self._max_predictions_per_seq])
322
+ else:
323
+ x['target'] = tf.reshape(target, [self._seq_length])
324
+ x['target_mask'] = tf.reshape(target_mask, [self._seq_length])
325
+ return x
326
+
327
+ def _index_pair_to_mask(self, begin_indices: tf.Tensor,
328
+ end_indices: tf.Tensor,
329
+ inputs: tf.Tensor) -> tf.Tensor:
330
+ """Converts beginning and end indices into an actual mask."""
331
+ non_func_mask = tf.logical_and(
332
+ tf.not_equal(inputs, self._sep_id), tf.not_equal(inputs, self._cls_id))
333
+ all_indices = tf.where(
334
+ non_func_mask, tf.range(self._seq_length, dtype=tf.int32),
335
+ tf.constant(-1, shape=[self._seq_length], dtype=tf.int32))
336
+ candidate_matrix = tf.cast(
337
+ tf.logical_and(all_indices[None, :] >= begin_indices[:, None],
338
+ all_indices[None, :] < end_indices[:, None]), tf.float32)
339
+ cumsum_matrix = tf.reshape(
340
+ tf.cumsum(tf.reshape(candidate_matrix, [-1])), [-1, self._seq_length])
341
+ masked_matrix = tf.cast(cumsum_matrix <= self._max_predictions_per_seq,
342
+ tf.float32)
343
+ target_mask = tf.reduce_sum(candidate_matrix * masked_matrix, axis=0)
344
+ return tf.cast(target_mask, tf.bool)
345
+
346
+ def _single_token_mask(self, inputs: tf.Tensor) -> tf.Tensor:
347
+ """Samples individual tokens as prediction targets."""
348
+ all_indices = tf.range(self._seq_length, dtype=tf.int32)
349
+ non_func_mask = tf.logical_and(
350
+ tf.not_equal(inputs, self._sep_id), tf.not_equal(inputs, self._cls_id))
351
+ non_func_indices = tf.boolean_mask(all_indices, non_func_mask)
352
+
353
+ masked_pos = tf.random.shuffle(non_func_indices)
354
+ masked_pos = tf.sort(masked_pos[:self._max_predictions_per_seq])
355
+
356
+ sparse_indices = tf.stack([tf.zeros_like(masked_pos), masked_pos], axis=-1)
357
+ sparse_indices = tf.cast(sparse_indices, tf.int64)
358
+
359
+ sparse_indices = tf.sparse.SparseTensor(
360
+ sparse_indices,
361
+ values=tf.ones_like(masked_pos),
362
+ dense_shape=(1, self._seq_length))
363
+
364
+ target_mask = tf.sparse.to_dense(sp_input=sparse_indices, default_value=0)
365
+
366
+ return tf.squeeze(tf.cast(target_mask, tf.bool))
367
+
368
+ def _whole_word_mask(self, inputs: tf.Tensor,
369
+ boundary: tf.Tensor) -> tf.Tensor:
370
+ """Samples whole words as prediction targets."""
371
+ pair_indices = tf.concat([boundary[:-1, None], boundary[1:, None]], axis=1)
372
+ cand_pair_indices = tf.random.shuffle(
373
+ pair_indices)[:self._max_predictions_per_seq]
374
+ begin_indices = cand_pair_indices[:, 0]
375
+ end_indices = cand_pair_indices[:, 1]
376
+
377
+ return self._index_pair_to_mask(
378
+ begin_indices=begin_indices, end_indices=end_indices, inputs=inputs)
379
+
380
+ def _token_span_mask(self, inputs: tf.Tensor) -> tf.Tensor:
381
+ """Samples token spans as prediction targets."""
382
+ min_num_tokens = self._params.min_num_tokens
383
+ max_num_tokens = self._params.max_num_tokens
384
+
385
+ mask_alpha = self._seq_length / self._max_predictions_per_seq
386
+ round_to_int = lambda x: tf.cast(tf.round(x), tf.int32)
387
+
388
+ # Sample span lengths from a zipf distribution
389
+ span_len_seq = np.arange(min_num_tokens, max_num_tokens + 1)
390
+ probs = np.array([1.0 / (i + 1) for i in span_len_seq])
391
+
392
+ probs /= np.sum(probs)
393
+ logits = tf.constant(np.log(probs), dtype=tf.float32)
394
+ span_lens = tf.random.categorical(
395
+ logits=logits[None],
396
+ num_samples=self._max_predictions_per_seq,
397
+ dtype=tf.int32,
398
+ )[0] + min_num_tokens
399
+
400
+ # Sample the ratio [0.0, 1.0) of left context lengths
401
+ span_lens_float = tf.cast(span_lens, tf.float32)
402
+ left_ratio = tf.random.uniform(
403
+ shape=[self._max_predictions_per_seq], minval=0.0, maxval=1.0)
404
+ left_ctx_len = left_ratio * span_lens_float * (mask_alpha - 1)
405
+ left_ctx_len = round_to_int(left_ctx_len)
406
+
407
+ # Compute the offset from left start to the right end
408
+ right_offset = round_to_int(span_lens_float * mask_alpha) - left_ctx_len
409
+
410
+ # Get the actual begin and end indices
411
+ begin_indices = (
412
+ tf.cumsum(left_ctx_len) + tf.cumsum(right_offset, exclusive=True))
413
+ end_indices = begin_indices + span_lens
414
+
415
+ # Remove out of range indices
416
+ valid_idx_mask = end_indices < self._seq_length
417
+ begin_indices = tf.boolean_mask(begin_indices, valid_idx_mask)
418
+ end_indices = tf.boolean_mask(end_indices, valid_idx_mask)
419
+
420
+ # Shuffle valid indices
421
+ num_valid = tf.cast(tf.shape(begin_indices)[0], tf.int32)
422
+ order = tf.random.shuffle(tf.range(num_valid, dtype=tf.int32))
423
+ begin_indices = tf.gather(begin_indices, order)
424
+ end_indices = tf.gather(end_indices, order)
425
+
426
+ return self._index_pair_to_mask(
427
+ begin_indices=begin_indices, end_indices=end_indices, inputs=inputs)
428
+
429
+ def _word_span_mask(self, inputs: tf.Tensor, boundary: tf.Tensor):
430
+ """Sample whole word spans as prediction targets."""
431
+ min_num_words = self._params.min_num_words
432
+ max_num_words = self._params.max_num_words
433
+
434
+ # Note: 1.2 is the token-to-word ratio
435
+ mask_alpha = self._seq_length / self._max_predictions_per_seq / 1.2
436
+ round_to_int = lambda x: tf.cast(tf.round(x), tf.int32)
437
+
438
+ # Sample span lengths from a zipf distribution
439
+ span_len_seq = np.arange(min_num_words, max_num_words + 1)
440
+ probs = np.array([1.0 / (i + 1) for i in span_len_seq])
441
+ probs /= np.sum(probs)
442
+ logits = tf.constant(np.log(probs), dtype=tf.float32)
443
+
444
+ # Sample `num_predict` words here: note that this is over sampling
445
+ span_lens = tf.random.categorical(
446
+ logits=logits[None],
447
+ num_samples=self._max_predictions_per_seq,
448
+ dtype=tf.int32,
449
+ )[0] + min_num_words
450
+
451
+ # Sample the ratio [0.0, 1.0) of left context lengths
452
+ span_lens_float = tf.cast(span_lens, tf.float32)
453
+ left_ratio = tf.random.uniform(
454
+ shape=[self._max_predictions_per_seq], minval=0.0, maxval=1.0)
455
+ left_ctx_len = left_ratio * span_lens_float * (mask_alpha - 1)
456
+
457
+ left_ctx_len = round_to_int(left_ctx_len)
458
+ right_offset = round_to_int(span_lens_float * mask_alpha) - left_ctx_len
459
+
460
+ begin_indices = (
461
+ tf.cumsum(left_ctx_len) + tf.cumsum(right_offset, exclusive=True))
462
+ end_indices = begin_indices + span_lens
463
+
464
+ # Remove out of range indices
465
+ max_boundary_index = tf.cast(tf.shape(boundary)[0] - 1, tf.int32)
466
+ valid_idx_mask = end_indices < max_boundary_index
467
+ begin_indices = tf.boolean_mask(begin_indices, valid_idx_mask)
468
+ end_indices = tf.boolean_mask(end_indices, valid_idx_mask)
469
+
470
+ begin_indices = tf.gather(boundary, begin_indices)
471
+ end_indices = tf.gather(boundary, end_indices)
472
+
473
+ # Shuffle valid indices
474
+ num_valid = tf.cast(tf.shape(begin_indices)[0], tf.int32)
475
+ order = tf.random.shuffle(tf.range(num_valid, dtype=tf.int32))
476
+ begin_indices = tf.gather(begin_indices, order)
477
+ end_indices = tf.gather(end_indices, order)
478
+
479
+ return self._index_pair_to_mask(
480
+ begin_indices=begin_indices, end_indices=end_indices, inputs=inputs)
481
+
482
+ def _online_sample_mask(self, inputs: tf.Tensor,
483
+ boundary: tf.Tensor) -> tf.Tensor:
484
+ """Samples target positions for predictions.
485
+
486
+ Descriptions of each strategy:
487
+ - 'single_token': Samples individual tokens as prediction targets.
488
+ - 'token_span': Samples spans of tokens as prediction targets.
489
+ - 'whole_word': Samples individual words as prediction targets.
490
+ - 'word_span': Samples spans of words as prediction targets.
491
+
492
+ Args:
493
+ inputs: The input tokens.
494
+ boundary: The `int` Tensor of indices indicating whole word boundaries.
495
+ This is used in 'whole_word' and 'word_span'
496
+
497
+ Returns:
498
+ The sampled `bool` input mask.
499
+
500
+ Raises:
501
+ `ValueError`: if `max_predictions_per_seq` is not set or if boundary is
502
+ not provided for 'whole_word' and 'word_span' sample strategies.
503
+ """
504
+ if self._max_predictions_per_seq is None:
505
+ raise ValueError('`max_predictions_per_seq` must be set.')
506
+
507
+ if boundary is None and 'word' in self._sample_strategy:
508
+ raise ValueError('`boundary` must be provided for {} strategy'.format(
509
+ self._sample_strategy))
510
+
511
+ if self._sample_strategy == 'single_token':
512
+ return self._single_token_mask(inputs)
513
+ elif self._sample_strategy == 'token_span':
514
+ return self._token_span_mask(inputs)
515
+ elif self._sample_strategy == 'whole_word':
516
+ return self._whole_word_mask(inputs, boundary)
517
+ elif self._sample_strategy == 'word_span':
518
+ return self._word_span_mask(inputs, boundary)
519
+ else:
520
+ raise NotImplementedError('Invalid sample strategy.')
521
+
522
+ def _get_factorization(self, inputs: tf.Tensor, input_mask: tf.Tensor):
523
+ """Samples a permutation of the factorization order.
524
+
525
+ Args:
526
+ inputs: the input tokens.
527
+ input_mask: the `bool` Tensor of the same shape as `inputs`. If `True`,
528
+ then this means select for partial prediction.
529
+
530
+ Returns:
531
+ perm_mask: An `int32` Tensor of shape [seq_length, seq_length] consisting
532
+ of 0s and 1s. If perm_mask[i][j] == 0, then this means that the i-th
533
+ token (in original order) cannot attend to the jth attention token.
534
+ target_mask: An `int32` Tensor of shape [seq_len] consisting of 0s and 1s.
535
+ If target_mask[i] == 1, then the i-th token needs to be predicted and
536
+ the mask will be used as input. This token will be included in the loss.
537
+ If target_mask[i] == 0, then the token (or [SEP], [CLS]) will be used as
538
+ input. This token will not be included in the loss.
539
+ tokens: int32 Tensor of shape [seq_length].
540
+ masked_tokens: int32 Tensor of shape [seq_length].
541
+ """
542
+ factorization_length = tf.shape(inputs)[0]
543
+ # Generate permutation indices
544
+ index = tf.range(factorization_length, dtype=tf.int32)
545
+ index = tf.transpose(tf.reshape(index, [-1, self._permutation_size]))
546
+ index = tf.random.shuffle(index)
547
+ index = tf.reshape(tf.transpose(index), [-1])
548
+
549
+ input_mask = tf.cast(input_mask, tf.bool)
550
+
551
+ # non-functional tokens
552
+ non_func_tokens = tf.logical_not(
553
+ tf.logical_or(
554
+ tf.equal(inputs, self._sep_id), tf.equal(inputs, self._cls_id)))
555
+ masked_tokens = tf.logical_and(input_mask, non_func_tokens)
556
+ non_masked_or_func_tokens = tf.logical_not(masked_tokens)
557
+
558
+ smallest_index = -2 * tf.ones([factorization_length], dtype=tf.int32)
559
+
560
+ # Similar to BERT, randomly leak some masked tokens
561
+ if self._leak_ratio > 0:
562
+ leak_tokens = tf.logical_and(
563
+ masked_tokens,
564
+ tf.random.uniform([factorization_length], maxval=1.0) <
565
+ self._leak_ratio)
566
+ can_attend_self = tf.logical_or(non_masked_or_func_tokens, leak_tokens)
567
+ else:
568
+ can_attend_self = non_masked_or_func_tokens
569
+ to_index = tf.where(can_attend_self, smallest_index, index)
570
+ from_index = tf.where(can_attend_self, to_index + 1, to_index)
571
+
572
+ # For masked tokens, can attend if i > j
573
+ # For context tokens, always can attend each other
574
+ can_attend = from_index[:, None] > to_index[None, :]
575
+
576
+ perm_mask = tf.cast(can_attend, tf.int32)
577
+
578
+ # Only masked tokens are included in the loss
579
+ target_mask = tf.cast(masked_tokens, tf.int32)
580
+
581
+ return perm_mask, target_mask, inputs, masked_tokens
582
+
583
+ def load(self, input_context: Optional[tf.distribute.InputContext] = None):
584
+ """Returns a tf.dataset.Dataset."""
585
+ if input_context:
586
+ self._num_replicas_in_sync = input_context.num_replicas_in_sync
587
+ reader = input_reader.InputReader(
588
+ params=self._params, decoder_fn=self._decode, parser_fn=self._parse)
589
+ return reader.read(input_context)
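For reference, a minimal usage sketch of the BERT pretraining loader defined above (not part of the uploaded files; the TFRecord path is a hypothetical placeholder and the records are assumed to already carry the masked-LM features listed in `_name_to_features`):

# Hedged sketch: build the BERT pretraining input pipeline from this module.
from official.nlp.data import pretrain_dataloader

config = pretrain_dataloader.BertPretrainDataConfig(
    input_path='/tmp/train.tf_record',  # hypothetical path, adjust as needed
    seq_length=128,
    max_predictions_per_seq=20,
    global_batch_size=32,
    is_training=True)
dataset = pretrain_dataloader.BertPretrainDataLoader(config).load()
features = next(iter(dataset))
# `features` maps 'input_word_ids', 'input_mask', 'input_type_ids',
# 'masked_lm_positions', 'masked_lm_ids', 'masked_lm_weights' (and
# 'next_sentence_labels' by default) to batched int32/float32 tensors.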
pretrain_dataloader_test.py ADDED
@@ -0,0 +1,242 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for official.nlp.data.pretrain_dataloader."""
16
+ import itertools
17
+ import os
18
+
19
+ from absl.testing import parameterized
20
+ import numpy as np
21
+ import tensorflow as tf, tf_keras
22
+
23
+ from official.nlp.data import pretrain_dataloader
24
+
25
+
26
+ def create_int_feature(values):
27
+ f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
28
+ return f
29
+
30
+
31
+ def _create_fake_bert_dataset(
32
+ output_path,
33
+ seq_length,
34
+ max_predictions_per_seq,
35
+ use_position_id,
36
+ use_next_sentence_label,
37
+ use_v2_feature_names=False):
38
+ """Creates a fake dataset."""
39
+ writer = tf.io.TFRecordWriter(output_path)
40
+
41
+ def create_float_feature(values):
42
+ f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
43
+ return f
44
+
45
+ for _ in range(100):
46
+ features = {}
47
+ input_ids = np.random.randint(100, size=(seq_length))
48
+ features["input_mask"] = create_int_feature(np.ones_like(input_ids))
49
+ if use_v2_feature_names:
50
+ features["input_word_ids"] = create_int_feature(input_ids)
51
+ features["input_type_ids"] = create_int_feature(np.ones_like(input_ids))
52
+ else:
53
+ features["input_ids"] = create_int_feature(input_ids)
54
+ features["segment_ids"] = create_int_feature(np.ones_like(input_ids))
55
+
56
+ features["masked_lm_positions"] = create_int_feature(
57
+ np.random.randint(100, size=(max_predictions_per_seq)))
58
+ features["masked_lm_ids"] = create_int_feature(
59
+ np.random.randint(100, size=(max_predictions_per_seq)))
60
+ features["masked_lm_weights"] = create_float_feature(
61
+ [1.0] * max_predictions_per_seq)
62
+
63
+ if use_next_sentence_label:
64
+ features["next_sentence_labels"] = create_int_feature([1])
65
+
66
+ if use_position_id:
67
+ features["position_ids"] = create_int_feature(range(0, seq_length))
68
+
69
+ tf_example = tf.train.Example(features=tf.train.Features(feature=features))
70
+ writer.write(tf_example.SerializeToString())
71
+ writer.close()
72
+
73
+
74
+ def _create_fake_xlnet_dataset(
75
+ output_path, seq_length, max_predictions_per_seq):
76
+ """Creates a fake dataset."""
77
+ writer = tf.io.TFRecordWriter(output_path)
78
+ for _ in range(100):
79
+ features = {}
80
+ input_ids = np.random.randint(100, size=(seq_length))
81
+ num_boundary_indices = np.random.randint(1, seq_length)
82
+
83
+ if max_predictions_per_seq is not None:
84
+ input_mask = np.zeros_like(input_ids)
85
+ input_mask[:max_predictions_per_seq] = 1
86
+ np.random.shuffle(input_mask)
87
+ else:
88
+ input_mask = np.ones_like(input_ids)
89
+
90
+ features["input_mask"] = create_int_feature(input_mask)
91
+ features["input_word_ids"] = create_int_feature(input_ids)
92
+ features["input_type_ids"] = create_int_feature(np.ones_like(input_ids))
93
+ features["boundary_indices"] = create_int_feature(
94
+ sorted(np.random.randint(seq_length, size=(num_boundary_indices))))
95
+ features["target"] = create_int_feature(input_ids + 1)
96
+ features["label"] = create_int_feature([1])
97
+ tf_example = tf.train.Example(features=tf.train.Features(feature=features))
98
+ writer.write(tf_example.SerializeToString())
99
+ writer.close()
100
+
101
+
102
+ class BertPretrainDataTest(tf.test.TestCase, parameterized.TestCase):
103
+
104
+ @parameterized.parameters(itertools.product(
105
+ (False, True),
106
+ (False, True),
107
+ ))
108
+ def test_load_data(self, use_next_sentence_label, use_position_id):
109
+ train_data_path = os.path.join(self.get_temp_dir(), "train.tf_record")
110
+ seq_length = 128
111
+ max_predictions_per_seq = 20
112
+ _create_fake_bert_dataset(
113
+ train_data_path,
114
+ seq_length,
115
+ max_predictions_per_seq,
116
+ use_next_sentence_label=use_next_sentence_label,
117
+ use_position_id=use_position_id)
118
+ data_config = pretrain_dataloader.BertPretrainDataConfig(
119
+ input_path=train_data_path,
120
+ max_predictions_per_seq=max_predictions_per_seq,
121
+ seq_length=seq_length,
122
+ global_batch_size=10,
123
+ is_training=True,
124
+ use_next_sentence_label=use_next_sentence_label,
125
+ use_position_id=use_position_id)
126
+
127
+ dataset = pretrain_dataloader.BertPretrainDataLoader(data_config).load()
128
+ features = next(iter(dataset))
129
+ self.assertLen(features,
130
+ 6 + int(use_next_sentence_label) + int(use_position_id))
131
+ self.assertIn("input_word_ids", features)
132
+ self.assertIn("input_mask", features)
133
+ self.assertIn("input_type_ids", features)
134
+ self.assertIn("masked_lm_positions", features)
135
+ self.assertIn("masked_lm_ids", features)
136
+ self.assertIn("masked_lm_weights", features)
137
+
138
+ self.assertEqual("next_sentence_labels" in features,
139
+ use_next_sentence_label)
140
+ self.assertEqual("position_ids" in features, use_position_id)
141
+
142
+ def test_v2_feature_names(self):
143
+ train_data_path = os.path.join(self.get_temp_dir(), "train.tf_record")
144
+ seq_length = 128
145
+ max_predictions_per_seq = 20
146
+ _create_fake_bert_dataset(
147
+ train_data_path,
148
+ seq_length,
149
+ max_predictions_per_seq,
150
+ use_next_sentence_label=True,
151
+ use_position_id=False,
152
+ use_v2_feature_names=True)
153
+ data_config = pretrain_dataloader.BertPretrainDataConfig(
154
+ input_path=train_data_path,
155
+ max_predictions_per_seq=max_predictions_per_seq,
156
+ seq_length=seq_length,
157
+ global_batch_size=10,
158
+ is_training=True,
159
+ use_next_sentence_label=True,
160
+ use_position_id=False,
161
+ use_v2_feature_names=True)
162
+
163
+ dataset = pretrain_dataloader.BertPretrainDataLoader(data_config).load()
164
+ features = next(iter(dataset))
165
+ self.assertIn("input_word_ids", features)
166
+ self.assertIn("input_mask", features)
167
+ self.assertIn("input_type_ids", features)
168
+ self.assertIn("masked_lm_positions", features)
169
+ self.assertIn("masked_lm_ids", features)
170
+ self.assertIn("masked_lm_weights", features)
171
+
172
+
173
+ class XLNetPretrainDataTest(parameterized.TestCase, tf.test.TestCase):
174
+
175
+ @parameterized.parameters(itertools.product(
176
+ ("single_token", "whole_word", "token_span"),
177
+ (0, 64),
178
+ (20, None),
179
+ ))
180
+ def test_load_data(
181
+ self, sample_strategy, reuse_length, max_predictions_per_seq):
182
+ train_data_path = os.path.join(self.get_temp_dir(), "train.tf_record")
183
+ seq_length = 128
184
+ batch_size = 5
185
+
186
+ _create_fake_xlnet_dataset(
187
+ train_data_path, seq_length, max_predictions_per_seq)
188
+
189
+ data_config = pretrain_dataloader.XLNetPretrainDataConfig(
190
+ input_path=train_data_path,
191
+ max_predictions_per_seq=max_predictions_per_seq,
192
+ seq_length=seq_length,
193
+ global_batch_size=batch_size,
194
+ is_training=True,
195
+ reuse_length=reuse_length,
196
+ sample_strategy=sample_strategy,
197
+ min_num_tokens=1,
198
+ max_num_tokens=2,
199
+ permutation_size=seq_length // 2,
200
+ leak_ratio=0.1)
201
+
202
+ if max_predictions_per_seq is None:
203
+ with self.assertRaises(ValueError):
204
+ dataset = pretrain_dataloader.XLNetPretrainDataLoader(
205
+ data_config).load()
206
+ features = next(iter(dataset))
207
+ else:
208
+ dataset = pretrain_dataloader.XLNetPretrainDataLoader(data_config).load()
209
+ features = next(iter(dataset))
210
+
211
+ self.assertIn("input_word_ids", features)
212
+ self.assertIn("input_type_ids", features)
213
+ self.assertIn("permutation_mask", features)
214
+ self.assertIn("masked_tokens", features)
215
+ self.assertIn("target", features)
216
+ self.assertIn("target_mask", features)
217
+
218
+ self.assertAllClose(features["input_word_ids"].shape,
219
+ (batch_size, seq_length))
220
+ self.assertAllClose(features["input_type_ids"].shape,
221
+ (batch_size, seq_length))
222
+ self.assertAllClose(features["permutation_mask"].shape,
223
+ (batch_size, seq_length, seq_length))
224
+ self.assertAllClose(features["masked_tokens"].shape,
225
+ (batch_size, seq_length,))
226
+ if max_predictions_per_seq is not None:
227
+ self.assertIn("target_mapping", features)
228
+ self.assertAllClose(features["target_mapping"].shape,
229
+ (batch_size, max_predictions_per_seq, seq_length))
230
+ self.assertAllClose(features["target_mask"].shape,
231
+ (batch_size, max_predictions_per_seq))
232
+ self.assertAllClose(features["target"].shape,
233
+ (batch_size, max_predictions_per_seq))
234
+ else:
235
+ self.assertAllClose(features["target_mask"].shape,
236
+ (batch_size, seq_length))
237
+ self.assertAllClose(features["target"].shape,
238
+ (batch_size, seq_length))
239
+
240
+
241
+ if __name__ == "__main__":
242
+ tf.test.main()
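The sampling strategies documented in `XLNetPretrainDataConfig` above can be exercised the same way; a hedged sketch with a hypothetical input path and illustrative hyperparameters (the 'word_span' strategy assumes the records carry `boundary_indices`):

from official.nlp.data import pretrain_dataloader

xlnet_config = pretrain_dataloader.XLNetPretrainDataConfig(
    input_path='/tmp/xlnet_train.tf_record',  # hypothetical path
    seq_length=512,
    max_predictions_per_seq=85,
    reuse_length=256,
    permutation_size=256,          # keep <= reuse_length to avoid leakage
    sample_strategy='word_span',   # needs `boundary_indices` in the records
    leak_ratio=0.1,
    global_batch_size=32,
    is_training=True)
xlnet_dataset = pretrain_dataloader.XLNetPretrainDataLoader(xlnet_config).load()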
pretrain_dynamic_dataloader.py ADDED
@@ -0,0 +1,223 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Dataset loader for the pre-training with dynamic sequence length."""
16
+ from typing import Optional, Tuple
17
+
18
+ import dataclasses
19
+ import tensorflow as tf, tf_keras
20
+
21
+ from official.core import config_definitions as cfg
22
+ from official.core import input_reader
23
+ from official.nlp.data import data_loader_factory
24
+ from official.nlp.data import pretrain_dataloader
25
+
26
+
27
+ @dataclasses.dataclass
28
+ class BertPretrainDataConfig(cfg.DataConfig):
29
+ """Data config for BERT pretraining task (tasks/masked_lm)."""
30
+ input_path: str = ''
31
+ global_batch_size: int = 512
32
+ is_training: bool = True
33
+ seq_bucket_lengths: Tuple[int, ...] = (128, 256, 384, 512,)
34
+ # TODO(rxsang): `seq_bucket_window_scale` is only useful when round robin
35
+ # tf.data service is disabled. Deprecate this flag once we always enable round
36
+ # robin tf.data service.
37
+ seq_bucket_window_scale: int = 8
38
+ use_next_sentence_label: bool = True
39
+ use_position_id: bool = False
40
+ deterministic: bool = False
41
+ enable_tf_data_service: bool = False
42
+ enable_round_robin_tf_data_service: bool = False
43
+ tf_data_service_job_name: str = 'bert_pretrain'
44
+ use_v2_feature_names: bool = False
45
+
46
+
47
+ @data_loader_factory.register_data_loader_cls(BertPretrainDataConfig)
48
+ class PretrainingDynamicDataLoader(pretrain_dataloader.BertPretrainDataLoader):
49
+ """Dataset loader for bert-style pretraining with dynamic sequenece length.
50
+
51
+ Bucketizes the input id features by the seq_bucket_lengths and features are
52
+ padded to the bucket boundaries. The mask features are usually shorter than
54
+ the input id features and can also be dynamic. We require that the mask
55
+ feature lengths within a bucket be the same. For example, with [128, 256]
56
+ buckets, the mask features for bucket 128 should always have some fixed
57
+ length X, and the mask features for bucket 256 some fixed length Y.
57
+
58
+ The dataloader does not filter out empty masks. Make sure to handle this
59
+ in the model.
60
+ """
61
+
62
+ def __init__(self, params):
63
+ self._params = params
64
+ if len(params.seq_bucket_lengths) < 1:
65
+ raise ValueError('The seq_bucket_lengths cannot be empty.')
66
+ self._seq_bucket_lengths = params.seq_bucket_lengths
67
+ self._seq_bucket_window_scale = params.seq_bucket_window_scale
68
+ self._global_batch_size = params.global_batch_size
69
+ self._use_next_sentence_label = params.use_next_sentence_label
70
+ self._use_position_id = params.use_position_id
71
+ self._drop_remainder = params.drop_remainder
72
+ self._enable_tf_data_service = params.enable_tf_data_service
73
+ self._enable_round_robin_tf_data_service = (
74
+ params.enable_round_robin_tf_data_service)
75
+ self._mask_keys = [
76
+ 'masked_lm_positions', 'masked_lm_ids', 'masked_lm_weights'
77
+ ]
78
+
79
+ def _decode(self, record: tf.Tensor):
80
+ """Decodes a serialized tf.Example."""
81
+ name_to_features = {
82
+ 'input_mask': tf.io.VarLenFeature(tf.int64),
83
+ 'masked_lm_positions': tf.io.VarLenFeature(tf.int64),
84
+ 'masked_lm_ids': tf.io.VarLenFeature(tf.int64),
85
+ 'masked_lm_weights': tf.io.VarLenFeature(tf.float32),
86
+ }
87
+ if self._params.use_v2_feature_names:
88
+ input_ids_key = 'input_word_ids'
89
+ segment_key = 'input_type_ids'
90
+ name_to_features.update({
91
+ input_ids_key: tf.io.VarLenFeature(tf.int64),
92
+ segment_key: tf.io.VarLenFeature(tf.int64),
93
+ })
94
+ else:
95
+ input_ids_key = 'input_ids'
96
+ segment_key = 'segment_ids'
97
+ name_to_features.update({
98
+ input_ids_key: tf.io.VarLenFeature(tf.int64),
99
+ segment_key: tf.io.VarLenFeature(tf.int64),
100
+ })
101
+ if self._use_next_sentence_label:
102
+ name_to_features['next_sentence_labels'] = tf.io.FixedLenFeature([1],
103
+ tf.int64)
104
+ dynamic_keys = [input_ids_key, 'input_mask', segment_key]
105
+ if self._use_position_id:
106
+ name_to_features['position_ids'] = tf.io.VarLenFeature(tf.int64)
107
+ dynamic_keys.append('position_ids')
108
+
109
+ example = tf.io.parse_single_example(record, name_to_features)
110
+ for key in dynamic_keys + self._mask_keys:
111
+ example[key] = tf.sparse.to_dense(example[key])
112
+
113
+ # Truncate trailing padding, i.e. everything after the last non-pad token
114
+ # along the sequence length dimension.
115
+ # Padding that appears before the last non-pad token is kept.
116
+ mask = tf.math.greater(
117
+ tf.math.cumsum(example[input_ids_key], reverse=True), 0)
118
+ for key in dynamic_keys:
119
+ example[key] = tf.boolean_mask(example[key], mask)
120
+
121
+ # masked_lm_ids arrives 0-padded. Re-pad it with -1 so that padding coming
122
+ # from the data can later be distinguished from the 0-padding introduced by
123
+ # bucketizing.
124
+ mask = tf.math.not_equal(example['masked_lm_ids'], 0)
125
+ example['masked_lm_ids'] = tf.where(
126
+ mask, example['masked_lm_ids'],
127
+ -tf.ones_like(example['masked_lm_ids']))
128
+
129
+ # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
130
+ # So cast all int64 to int32.
131
+ # tf.data service uses dataset graph fingerprint to distinguish input
132
+ # pipeline jobs, thus we sort the keys here to make sure they are generated
133
+ # in a deterministic order each time the dataset function is traced.
134
+ for name in sorted(list(example.keys())):
135
+ t = example[name]
136
+ if t.dtype == tf.int64:
137
+ t = tf.cast(t, tf.int32)
138
+ example[name] = t
139
+
140
+ return example
141
+
142
+ def _bucketize_and_batch(
143
+ self,
144
+ dataset,
145
+ input_context: Optional[tf.distribute.InputContext] = None):
146
+ """Bucketize by sequence length and batch the datasets."""
147
+ per_replica_batch_size = input_context.get_per_replica_batch_size(
148
+ self._global_batch_size) if input_context else self._global_batch_size
149
+
150
+ def element_length_func(example, seq_len_dim):
151
+ return tf.shape(example['input_word_ids'])[seq_len_dim]
152
+
153
+ bucket_boundaries = [length + 1 for length in self._seq_bucket_lengths]
154
+ bucket_batch_sizes = [per_replica_batch_size] * (len(bucket_boundaries) + 1)
155
+
156
+ # Bucketize and batch the dataset with per replica batch size first.
157
+ dataset = dataset.apply(
158
+ tf.data.experimental.bucket_by_sequence_length(
159
+ lambda example: tf.cast(element_length_func(example, 0), tf.int32),
160
+ bucket_boundaries,
161
+ bucket_batch_sizes,
162
+ pad_to_bucket_boundary=True,
163
+ drop_remainder=self._drop_remainder))
164
+ if input_context:
165
+ window_size = input_context.num_replicas_in_sync
166
+ if self._enable_tf_data_service and (
167
+ not self._enable_round_robin_tf_data_service):
168
+ # If tf.data service is enabled but round-robin behavior is not enabled,
169
+ # different TPU workers may fetch data from one tf.data service worker
170
+ # in different speed. We set the window size to be
171
+ # `seq_bucket_window_scale` larger to leave buffer if some workers are
172
+ # fetching data faster than others, so all the data within the same
173
+ # global batch can still have more chances to be in the same bucket.
174
+ window_size *= self._seq_bucket_window_scale
175
+
176
+ # Group `num_replicas_in_sync` batches from same bucket together, so all
177
+ # replicas can get the same sequence length for one global step.
178
+ dataset = dataset.apply(
179
+ tf.data.experimental.group_by_window(
180
+ key_func=lambda example: tf.cast( # pylint: disable=g-long-lambda
181
+ element_length_func(example, 1), tf.int64),
182
+ reduce_func=lambda _, x: tf.data.Dataset.from_tensors(x),
183
+ window_size=window_size))
184
+ dataset = dataset.flat_map(lambda x: x)
185
+
186
+ def _remove_pads_from_bucketize(features):
187
+ # All mask features must have the same effective length.
188
+ # The real masked ids padding token is -1 and 0 comes from
189
+ # bucket_by_sequence_length.
190
+ mask = tf.math.not_equal(features['masked_lm_ids'], 0)
191
+
192
+ mask_per_example = tf.math.reduce_sum(tf.cast(mask, tf.int32), axis=1)
193
+ normalized = tf.cast(
194
+ mask_per_example / tf.math.reduce_max(mask_per_example), tf.int32)
195
+ assert_op = tf.debugging.assert_equal(
196
+ tf.math.reduce_sum(normalized), per_replica_batch_size,
197
+ 'Number of non padded mask tokens is not the same for each example '
198
+ 'in the same sequence length.')
199
+ with tf.control_dependencies([assert_op]):
200
+ for key in self._mask_keys:
201
+ features[key] = tf.reshape(
202
+ tf.boolean_mask(
203
+ features[key], mask), [per_replica_batch_size, -1])
204
+ # Revert masked_lm_ids to be 0-padded.
205
+ mask = tf.math.not_equal(features['masked_lm_ids'], -1)
206
+ features['masked_lm_ids'] = tf.where(
207
+ mask, features['masked_lm_ids'],
208
+ tf.zeros(
209
+ tf.shape(features['masked_lm_ids']),
210
+ dtype=features['masked_lm_ids'].dtype))
211
+ return features
212
+
213
+ dataset = dataset.map(_remove_pads_from_bucketize)
214
+ return dataset
215
+
216
+ def load(self, input_context: Optional[tf.distribute.InputContext] = None):
217
+ """Returns a tf.dataset.Dataset."""
218
+ reader = input_reader.InputReader(
219
+ params=self._params,
220
+ decoder_fn=self._decode,
221
+ parser_fn=self._parse,
222
+ transform_and_batch_fn=self._bucketize_and_batch)
223
+ return reader.read(input_context)
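The dynamic loader above leans on `tf.data.experimental.bucket_by_sequence_length` with `pad_to_bucket_boundary=True`. A self-contained toy illustration of that primitive (all values here are illustrative, not taken from the module):

import tensorflow as tf

# Toy ragged-length "token id" dataset with lengths 5, 9, 20 and 30.
ds = tf.data.Dataset.from_generator(
    lambda: ([1] * n for n in (5, 9, 20, 30)),
    output_signature=tf.TensorSpec(shape=[None], dtype=tf.int32))

# Boundaries [17, 33] pad the two buckets to lengths 16 and 32, mirroring how
# the loader computes bucket_boundaries = [length + 1 for length in
# seq_bucket_lengths].
ds = ds.apply(
    tf.data.experimental.bucket_by_sequence_length(
        element_length_func=lambda x: tf.shape(x)[0],
        bucket_boundaries=[17, 33],
        bucket_batch_sizes=[2, 2, 2],
        pad_to_bucket_boundary=True,
        drop_remainder=True))

for batch in ds:
  print(batch.shape)  # (2, 16) then (2, 32)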
pretrain_dynamic_dataloader_test.py ADDED
@@ -0,0 +1,245 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for nlp.data.pretrain_dynamic_dataloader."""
16
+ import os
17
+
18
+ from absl import logging
19
+ from absl.testing import parameterized
20
+ import numpy as np
21
+ import orbit
22
+ import tensorflow as tf, tf_keras
23
+
24
+ from tensorflow.python.distribute import combinations
25
+ from tensorflow.python.distribute import strategy_combinations
26
+ from official.nlp.configs import bert
27
+ from official.nlp.configs import encoders
28
+ from official.nlp.data import pretrain_dataloader
29
+ from official.nlp.data import pretrain_dynamic_dataloader
30
+ from official.nlp.tasks import masked_lm
31
+
32
+
33
+ def _create_fake_dataset(output_path, seq_length, num_masked_tokens,
34
+ max_seq_length, num_examples):
35
+ """Creates a fake dataset."""
36
+ writer = tf.io.TFRecordWriter(output_path)
37
+
38
+ def create_int_feature(values):
39
+ f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
40
+ return f
41
+
42
+ def create_float_feature(values):
43
+ f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
44
+ return f
45
+
46
+ rng = np.random.default_rng(37)
47
+ for _ in range(num_examples):
48
+ features = {}
49
+ padding = np.zeros(shape=(max_seq_length - seq_length), dtype=np.int32)
50
+ input_ids = rng.integers(low=1, high=100, size=(seq_length))
51
+ features['input_ids'] = create_int_feature(
52
+ np.concatenate((input_ids, padding)))
53
+ features['input_mask'] = create_int_feature(
54
+ np.concatenate((np.ones_like(input_ids), padding)))
55
+ features['segment_ids'] = create_int_feature(
56
+ np.concatenate((np.ones_like(input_ids), padding)))
57
+ features['position_ids'] = create_int_feature(
58
+ np.concatenate((np.ones_like(input_ids), padding)))
59
+ features['masked_lm_positions'] = create_int_feature(
60
+ rng.integers(60, size=(num_masked_tokens), dtype=np.int64))
61
+ features['masked_lm_ids'] = create_int_feature(
62
+ rng.integers(100, size=(num_masked_tokens), dtype=np.int64))
63
+ features['masked_lm_weights'] = create_float_feature(
64
+ np.ones((num_masked_tokens,), dtype=np.float32))
65
+ features['next_sentence_labels'] = create_int_feature(np.array([0]))
66
+
67
+ tf_example = tf.train.Example(features=tf.train.Features(feature=features))
68
+ writer.write(tf_example.SerializeToString())
69
+ writer.close()
70
+
71
+
72
+ class PretrainDynamicDataLoaderTest(tf.test.TestCase, parameterized.TestCase):
73
+
74
+ @combinations.generate(
75
+ combinations.combine(
76
+ distribution_strategy=[
77
+ strategy_combinations.cloud_tpu_strategy,
78
+ ],
79
+ mode='eager'))
80
+ def test_distribution_strategy(self, distribution_strategy):
81
+ max_seq_length = 128
82
+ batch_size = 8
83
+ input_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
84
+ _create_fake_dataset(
85
+ input_path,
86
+ seq_length=60,
87
+ num_masked_tokens=20,
88
+ max_seq_length=max_seq_length,
89
+ num_examples=batch_size)
90
+ data_config = pretrain_dynamic_dataloader.BertPretrainDataConfig(
91
+ is_training=False,
92
+ input_path=input_path,
93
+ seq_bucket_lengths=[64, 128],
94
+ global_batch_size=batch_size)
95
+ dataloader = pretrain_dynamic_dataloader.PretrainingDynamicDataLoader(
96
+ data_config)
97
+ distributed_ds = orbit.utils.make_distributed_dataset(
98
+ distribution_strategy, dataloader.load)
99
+ train_iter = iter(distributed_ds)
100
+ with distribution_strategy.scope():
101
+ config = masked_lm.MaskedLMConfig(
102
+ init_checkpoint=self.get_temp_dir(),
103
+ model=bert.PretrainerConfig(
104
+ encoders.EncoderConfig(
105
+ bert=encoders.BertEncoderConfig(
106
+ vocab_size=30522, num_layers=1)),
107
+ cls_heads=[
108
+ bert.ClsHeadConfig(
109
+ inner_dim=10, num_classes=2, name='next_sentence')
110
+ ]),
111
+ train_data=data_config)
112
+ task = masked_lm.MaskedLMTask(config)
113
+ model = task.build_model()
114
+ metrics = task.build_metrics()
115
+
116
+ @tf.function
117
+ def step_fn(features):
118
+ return task.validation_step(features, model, metrics=metrics)
119
+
120
+ distributed_outputs = distribution_strategy.run(
121
+ step_fn, args=(next(train_iter),))
122
+ local_results = tf.nest.map_structure(
123
+ distribution_strategy.experimental_local_results, distributed_outputs)
124
+ logging.info('Dynamic padding: local_results= %s', str(local_results))
125
+ dynamic_metrics = {}
126
+ for metric in metrics:
127
+ dynamic_metrics[metric.name] = metric.result()
128
+
129
+ data_config = pretrain_dataloader.BertPretrainDataConfig(
130
+ is_training=False,
131
+ input_path=input_path,
132
+ seq_length=max_seq_length,
133
+ max_predictions_per_seq=20,
134
+ global_batch_size=batch_size)
135
+ dataloader = pretrain_dataloader.BertPretrainDataLoader(data_config)
136
+ distributed_ds = orbit.utils.make_distributed_dataset(
137
+ distribution_strategy, dataloader.load)
138
+ train_iter = iter(distributed_ds)
139
+ with distribution_strategy.scope():
140
+ metrics = task.build_metrics()
141
+
142
+ @tf.function
143
+ def step_fn_b(features):
144
+ return task.validation_step(features, model, metrics=metrics)
145
+
146
+ distributed_outputs = distribution_strategy.run(
147
+ step_fn_b, args=(next(train_iter),))
148
+ local_results = tf.nest.map_structure(
149
+ distribution_strategy.experimental_local_results, distributed_outputs)
150
+ logging.info('Static padding: local_results= %s', str(local_results))
151
+ static_metrics = {}
152
+ for metric in metrics:
153
+ static_metrics[metric.name] = metric.result()
154
+ for key in static_metrics:
155
+ # We need to investigate the differences in losses.
156
+ if key != 'next_sentence_loss':
157
+ self.assertEqual(dynamic_metrics[key], static_metrics[key])
158
+
159
+ def test_load_dataset(self):
160
+ tf.random.set_seed(0)
161
+ max_seq_length = 128
162
+ batch_size = 2
163
+ input_path_1 = os.path.join(self.get_temp_dir(), 'train_1.tf_record')
164
+ _create_fake_dataset(
165
+ input_path_1,
166
+ seq_length=60,
167
+ num_masked_tokens=20,
168
+ max_seq_length=max_seq_length,
169
+ num_examples=batch_size)
170
+ input_path_2 = os.path.join(self.get_temp_dir(), 'train_2.tf_record')
171
+ _create_fake_dataset(
172
+ input_path_2,
173
+ seq_length=100,
174
+ num_masked_tokens=70,
175
+ max_seq_length=max_seq_length,
176
+ num_examples=batch_size)
177
+ input_paths = ','.join([input_path_1, input_path_2])
178
+ data_config = pretrain_dynamic_dataloader.BertPretrainDataConfig(
179
+ is_training=False,
180
+ input_path=input_paths,
181
+ seq_bucket_lengths=[64, 128],
182
+ use_position_id=True,
183
+ global_batch_size=batch_size,
184
+ deterministic=True)
185
+ dataset = pretrain_dynamic_dataloader.PretrainingDynamicDataLoader(
186
+ data_config).load()
187
+ dataset_it = iter(dataset)
188
+ features = next(dataset_it)
189
+ self.assertCountEqual([
190
+ 'input_word_ids',
191
+ 'input_mask',
192
+ 'input_type_ids',
193
+ 'next_sentence_labels',
194
+ 'masked_lm_positions',
195
+ 'masked_lm_ids',
196
+ 'masked_lm_weights',
197
+ 'position_ids',
198
+ ], features.keys())
199
+ # Sequence length dimension should be bucketized and pad to 64.
200
+ self.assertEqual(features['input_word_ids'].shape, (batch_size, 64))
201
+ self.assertEqual(features['input_mask'].shape, (batch_size, 64))
202
+ self.assertEqual(features['input_type_ids'].shape, (batch_size, 64))
203
+ self.assertEqual(features['position_ids'].shape, (batch_size, 64))
204
+ self.assertEqual(features['masked_lm_positions'].shape, (batch_size, 20))
205
+ features = next(dataset_it)
206
+ self.assertEqual(features['input_word_ids'].shape, (batch_size, 128))
207
+ self.assertEqual(features['input_mask'].shape, (batch_size, 128))
208
+ self.assertEqual(features['input_type_ids'].shape, (batch_size, 128))
209
+ self.assertEqual(features['position_ids'].shape, (batch_size, 128))
210
+ self.assertEqual(features['masked_lm_positions'].shape, (batch_size, 70))
211
+
212
+ def test_load_dataset_not_same_masks(self):
213
+ max_seq_length = 128
214
+ batch_size = 2
215
+ input_path_1 = os.path.join(self.get_temp_dir(), 'train_3.tf_record')
216
+ _create_fake_dataset(
217
+ input_path_1,
218
+ seq_length=60,
219
+ num_masked_tokens=20,
220
+ max_seq_length=max_seq_length,
221
+ num_examples=batch_size)
222
+ input_path_2 = os.path.join(self.get_temp_dir(), 'train_4.tf_record')
223
+ _create_fake_dataset(
224
+ input_path_2,
225
+ seq_length=60,
226
+ num_masked_tokens=15,
227
+ max_seq_length=max_seq_length,
228
+ num_examples=batch_size)
229
+ input_paths = ','.join([input_path_1, input_path_2])
230
+ data_config = pretrain_dynamic_dataloader.BertPretrainDataConfig(
231
+ is_training=False,
232
+ input_path=input_paths,
233
+ seq_bucket_lengths=[64, 128],
234
+ use_position_id=True,
235
+ global_batch_size=batch_size * 2)
236
+ dataset = pretrain_dynamic_dataloader.PretrainingDynamicDataLoader(
237
+ data_config).load()
238
+ dataset_it = iter(dataset)
239
+ with self.assertRaisesRegex(
240
+ tf.errors.InvalidArgumentError, '.*Number of non padded mask tokens.*'):
241
+ next(dataset_it)
242
+
243
+
244
+ if __name__ == '__main__':
245
+ tf.test.main()
pretrain_text_dataloader.py ADDED
@@ -0,0 +1,226 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Loads text dataset for the BERT pretraining task."""
16
+ import dataclasses
17
+ from typing import List, Mapping, Optional, Text
18
+
19
+ import tensorflow as tf, tf_keras
20
+ import tensorflow_text as tf_text
21
+
22
+ from official.common import dataset_fn
23
+ from official.core import config_definitions as cfg
24
+ from official.core import input_reader
25
+ from official.nlp.data import data_loader
26
+ from official.nlp.data import data_loader_factory
27
+ from official.nlp.modeling.ops import segment_extractor
28
+
29
+
30
+ @dataclasses.dataclass
31
+ class BertPretrainTextDataConfig(cfg.DataConfig):
32
+ """Data config for BERT pretraining task (tasks/masked_lm) from text."""
33
+ input_path: str = ""
34
+ doc_batch_size: int = 8
35
+ global_batch_size: int = 512
36
+ is_training: bool = True
37
+ seq_length: int = 512
38
+ max_predictions_per_seq: int = 76
39
+ use_next_sentence_label: bool = True
40
+ # The name of the text feature fields. The text features will be
41
+ # concatenated in order.
42
+ # Note: More than 1 field name is not compatible with NSP.
43
+ text_field_names: Optional[List[str]] = dataclasses.field(
44
+ default_factory=lambda: ["text"])
45
+ vocab_file_path: str = ""
46
+ masking_rate: float = 0.15
47
+ use_whole_word_masking: bool = False
48
+ file_type: str = "tfrecord"
49
+
50
+
51
+ _CLS_TOKEN = b"[CLS]"
52
+ _SEP_TOKEN = b"[SEP]"
53
+ _MASK_TOKEN = b"[MASK]"
54
+ _NUM_OOV_BUCKETS = 1
55
+ # Accounts for [CLS] and 2 x [SEP] tokens
56
+ _NUM_SPECIAL_TOKENS = 3
57
+
58
+
59
+ @data_loader_factory.register_data_loader_cls(BertPretrainTextDataConfig)
60
+ class BertPretrainTextDataLoader(data_loader.DataLoader):
61
+ """A class to load text dataset for BERT pretraining task."""
62
+
63
+ def __init__(self, params):
64
+ """Inits `BertPretrainTextDataLoader` class.
65
+
66
+ Args:
67
+ params: A `BertPretrainTextDataConfig` object.
68
+ """
69
+ if len(params.text_field_names) > 1 and params.use_next_sentence_label:
70
+ raise ValueError("Currently there is no support for more than text field "
71
+ "while generating next sentence labels.")
72
+
73
+ self._params = params
74
+ self._seq_length = params.seq_length
75
+ self._max_predictions_per_seq = params.max_predictions_per_seq
76
+ self._use_next_sentence_label = params.use_next_sentence_label
77
+ self._masking_rate = params.masking_rate
78
+ self._use_whole_word_masking = params.use_whole_word_masking
79
+
80
+ lookup_table_init = tf.lookup.TextFileInitializer(
81
+ params.vocab_file_path,
82
+ key_dtype=tf.string,
83
+ key_index=tf.lookup.TextFileIndex.WHOLE_LINE,
84
+ value_dtype=tf.int64,
85
+ value_index=tf.lookup.TextFileIndex.LINE_NUMBER)
86
+ self._vocab_lookup_table = tf.lookup.StaticVocabularyTable(
87
+ lookup_table_init,
88
+ num_oov_buckets=_NUM_OOV_BUCKETS,
89
+ lookup_key_dtype=tf.string)
90
+
91
+ self._cls_token = self._vocab_lookup_table.lookup(tf.constant(_CLS_TOKEN))
92
+ self._sep_token = self._vocab_lookup_table.lookup(tf.constant(_SEP_TOKEN))
93
+ self._mask_token = self._vocab_lookup_table.lookup(tf.constant(_MASK_TOKEN))
94
+
95
+ # -_NUM_OOV_BUCKETS to offset unused OOV bucket.
96
+ self._vocab_size = self._vocab_lookup_table.size() - _NUM_OOV_BUCKETS
97
+
98
+ def _decode(self, record: tf.Tensor) -> Mapping[Text, tf.Tensor]:
99
+ """Decodes a serialized tf.Example."""
100
+ name_to_features = {}
101
+ for text_field_name in self._params.text_field_names:
102
+ name_to_features[text_field_name] = tf.io.FixedLenFeature([], tf.string)
103
+ return tf.io.parse_single_example(record, name_to_features)
104
+
105
+ def _tokenize(self, segments):
106
+ """Tokenize the input segments."""
107
+ # Tokenize segments
108
+ tokenizer = tf_text.BertTokenizer(
109
+ self._vocab_lookup_table, token_out_type=tf.int64)
110
+
111
+ if self._use_whole_word_masking:
112
+ # tokenize the segments which should have the shape:
113
+ # [num_sentence, (num_words), (num_wordpieces)]
114
+ segments = [tokenizer.tokenize(s) for s in segments]
115
+ else:
116
+ # tokenize the segments and merge out the token dimension so that each
117
+ # segment has the shape: [num_sentence, (num_wordpieces)]
118
+ segments = [tokenizer.tokenize(s).merge_dims(-2, -1) for s in segments]
119
+
120
+ # Truncate inputs
121
+ trimmer = tf_text.WaterfallTrimmer(
122
+ self._seq_length - _NUM_SPECIAL_TOKENS, axis=-1)
123
+ truncated_segments = trimmer.trim(segments)
124
+
125
+ # Combine segments, get segment ids and add special tokens
126
+ return tf_text.combine_segments(
127
+ truncated_segments,
128
+ start_of_sequence_id=self._cls_token,
129
+ end_of_segment_id=self._sep_token)
130
+
131
+ def _bert_preprocess(self, record: Mapping[str, tf.Tensor]):
132
+ """Parses raw tensors into a dict of tensors to be consumed by the model."""
133
+ if self._use_next_sentence_label:
134
+ input_text = record[self._params.text_field_names[0]]
135
+ # Split sentences
136
+ sentence_breaker = tf_text.RegexSplitter()
137
+ sentences = sentence_breaker.split(input_text)
138
+
139
+ # Extract next-sentence-prediction labels and segments
140
+ next_or_random_segment, is_next = (
141
+ segment_extractor.get_next_sentence_labels(sentences))
142
+ # merge dims to change shape from [num_docs, (num_segments)] to
143
+ # [total_num_segments]
144
+ is_next = is_next.merge_dims(-2, -1)
145
+
146
+ # construct segments with shape [(num_sentence)]
147
+ segments = [
148
+ sentences.merge_dims(-2, -1),
149
+ next_or_random_segment.merge_dims(-2, -1)
150
+ ]
151
+ else:
152
+ segments = [record[name] for name in self._params.text_field_names]
153
+
154
+ segments_combined, segment_ids = self._tokenize(segments)
155
+
156
+ # Dynamic masking
157
+ item_selector = tf_text.RandomItemSelector(
158
+ self._max_predictions_per_seq,
159
+ selection_rate=self._masking_rate,
160
+ unselectable_ids=[self._cls_token, self._sep_token],
161
+ shuffle_fn=(tf.identity if self._params.deterministic else None))
162
+ values_chooser = tf_text.MaskValuesChooser(
163
+ vocab_size=self._vocab_size, mask_token=self._mask_token)
164
+ masked_input_ids, masked_lm_positions, masked_lm_ids = (
165
+ tf_text.mask_language_model(
166
+ segments_combined,
167
+ item_selector=item_selector,
168
+ mask_values_chooser=values_chooser,
169
+ ))
170
+
171
+ # Pad out to fixed shape and get input mask.
172
+ seq_lengths = {
173
+ "input_word_ids": self._seq_length,
174
+ "input_type_ids": self._seq_length,
175
+ "masked_lm_positions": self._max_predictions_per_seq,
176
+ "masked_lm_ids": self._max_predictions_per_seq,
177
+ }
178
+ model_inputs = {
179
+ "input_word_ids": masked_input_ids,
180
+ "input_type_ids": segment_ids,
181
+ "masked_lm_positions": masked_lm_positions,
182
+ "masked_lm_ids": masked_lm_ids,
183
+ }
184
+ padded_inputs_and_mask = tf.nest.map_structure(tf_text.pad_model_inputs,
185
+ model_inputs, seq_lengths)
186
+ model_inputs = {
187
+ k: padded_inputs_and_mask[k][0] for k in padded_inputs_and_mask
188
+ }
189
+ model_inputs["masked_lm_weights"] = tf.cast(
190
+ padded_inputs_and_mask["masked_lm_ids"][1], tf.float32)
191
+ model_inputs["input_mask"] = padded_inputs_and_mask["input_word_ids"][1]
192
+
193
+ if self._use_next_sentence_label:
194
+ model_inputs["next_sentence_labels"] = is_next
195
+
196
+ for name in model_inputs:
197
+ t = model_inputs[name]
198
+ if t.dtype == tf.int64:
199
+ t = tf.cast(t, tf.int32)
200
+ model_inputs[name] = t
201
+
202
+ return model_inputs
203
+
204
+ def load(self, input_context: Optional[tf.distribute.InputContext] = None):
205
+ """Returns a tf.dataset.Dataset."""
206
+
207
+ def _batch_docs(dataset, input_context):
208
+ per_core_doc_batch_size = (
209
+ input_context.get_per_replica_batch_size(self._params.doc_batch_size)
210
+ if input_context else self._params.doc_batch_size)
211
+ return dataset.batch(per_core_doc_batch_size)
212
+
213
+ reader = input_reader.InputReader(
214
+ params=self._params,
215
+ dataset_fn=dataset_fn.pick_dataset_fn(self._params.file_type),
216
+ decoder_fn=self._decode if self._params.input_path else None,
217
+ transform_and_batch_fn=_batch_docs
218
+ if self._use_next_sentence_label else None,
219
+ postprocess_fn=self._bert_preprocess)
220
+ transformed_inputs = reader.read(input_context)
221
+ per_core_example_batch_size = (
222
+ input_context.get_per_replica_batch_size(self._params.global_batch_size)
223
+ if input_context else self._params.global_batch_size)
224
+ batched_inputs = transformed_inputs.unbatch().batch(
225
+ per_core_example_batch_size, self._params.drop_remainder)
226
+ return batched_inputs.prefetch(tf.data.experimental.AUTOTUNE)
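Note: a minimal usage sketch of the text-based pretraining loader defined above. The raw-text TFRecord and WordPiece vocab paths are placeholders; the remaining values simply echo the dataclass defaults.

from official.nlp.data import pretrain_text_dataloader

data_config = pretrain_text_dataloader.BertPretrainTextDataConfig(
    input_path='/tmp/wiki_text.tfrecord',  # placeholder TFRecord of raw text
    vocab_file_path='/tmp/vocab.txt',      # placeholder WordPiece vocab
    seq_length=512,
    max_predictions_per_seq=76,
    use_next_sentence_label=True,
    global_batch_size=512)
dataset = pretrain_text_dataloader.BertPretrainTextDataLoader(
    data_config).load()
# Each batch contains input_word_ids, input_mask, input_type_ids,
# masked_lm_positions/ids/weights and next_sentence_labels.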
question_answering_dataloader.py ADDED
@@ -0,0 +1,115 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Loads dataset for the question answering (e.g, SQuAD) task."""
16
+ import dataclasses
17
+ from typing import Mapping, Optional
18
+
19
+ import tensorflow as tf, tf_keras
20
+ from official.common import dataset_fn
21
+ from official.core import config_definitions as cfg
22
+ from official.core import input_reader
23
+ from official.nlp.data import data_loader
24
+ from official.nlp.data import data_loader_factory
25
+
26
+
27
+ @dataclasses.dataclass
28
+ class QADataConfig(cfg.DataConfig):
29
+ """Data config for question answering task (tasks/question_answering)."""
30
+ # For training, `input_path` is expected to be a pre-processed TFRecord file,
31
+ # while for evaluation, it is expected to be a raw JSON file (b/173814590).
32
+ input_path: str = ''
33
+ global_batch_size: int = 48
34
+ is_training: bool = True
35
+ seq_length: int = 384
36
+ # Settings below are question answering specific.
37
+ version_2_with_negative: bool = False
38
+ # Settings below are only used for eval mode.
39
+ input_preprocessed_data_path: str = ''
40
+ doc_stride: int = 128
41
+ query_length: int = 64
42
+ # The path to the vocab file of word piece tokenizer or the
43
+ # model of the sentence piece tokenizer.
44
+ vocab_file: str = ''
45
+ tokenization: str = 'WordPiece' # WordPiece or SentencePiece
46
+ do_lower_case: bool = True
47
+ xlnet_format: bool = False
48
+ file_type: str = 'tfrecord'
49
+
50
+
51
+ @data_loader_factory.register_data_loader_cls(QADataConfig)
52
+ class QuestionAnsweringDataLoader(data_loader.DataLoader):
53
+ """A class to load dataset for sentence prediction (classification) task."""
54
+
55
+ def __init__(self, params):
56
+ self._params = params
57
+ self._seq_length = params.seq_length
58
+ self._is_training = params.is_training
59
+ self._xlnet_format = params.xlnet_format
60
+
61
+ def _decode(self, record: tf.Tensor):
62
+ """Decodes a serialized tf.Example."""
63
+ name_to_features = {
64
+ 'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
65
+ 'input_mask': tf.io.FixedLenFeature([self._seq_length], tf.int64),
66
+ 'segment_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
67
+ }
68
+ if self._xlnet_format:
69
+ name_to_features['class_index'] = tf.io.FixedLenFeature([], tf.int64)
70
+ name_to_features['paragraph_mask'] = tf.io.FixedLenFeature(
71
+ [self._seq_length], tf.int64)
72
+ if self._is_training:
73
+ name_to_features['is_impossible'] = tf.io.FixedLenFeature([], tf.int64)
74
+
75
+ if self._is_training:
76
+ name_to_features['start_positions'] = tf.io.FixedLenFeature([], tf.int64)
77
+ name_to_features['end_positions'] = tf.io.FixedLenFeature([], tf.int64)
78
+ else:
79
+ name_to_features['unique_ids'] = tf.io.FixedLenFeature([], tf.int64)
80
+ example = tf.io.parse_single_example(record, name_to_features)
81
+
82
+ # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
83
+ # So cast all int64 to int32.
84
+ for name in example:
85
+ t = example[name]
86
+ if t.dtype == tf.int64:
87
+ t = tf.cast(t, tf.int32)
88
+ example[name] = t
89
+
90
+ return example
91
+
92
+ def _parse(self, record: Mapping[str, tf.Tensor]):
93
+ """Parses raw tensors into a dict of tensors to be consumed by the model."""
94
+ x, y = {}, {}
95
+ for name, tensor in record.items():
96
+ if name in ('start_positions', 'end_positions', 'is_impossible'):
97
+ y[name] = tensor
98
+ elif name == 'input_ids':
99
+ x['input_word_ids'] = tensor
100
+ elif name == 'segment_ids':
101
+ x['input_type_ids'] = tensor
102
+ else:
103
+ x[name] = tensor
104
+ if name == 'start_positions' and self._xlnet_format:
105
+ x[name] = tensor
106
+ return (x, y)
107
+
108
+ def load(self, input_context: Optional[tf.distribute.InputContext] = None):
109
+ """Returns a tf.dataset.Dataset."""
110
+ reader = input_reader.InputReader(
111
+ params=self._params,
112
+ dataset_fn=dataset_fn.pick_dataset_fn(self._params.file_type),
113
+ decoder_fn=self._decode,
114
+ parser_fn=self._parse)
115
+ return reader.read(input_context)
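Note: the test file that follows exercises exactly this loader; as a stand-alone sketch with a placeholder TFRecord path:

from official.nlp.data import question_answering_dataloader

data_config = question_answering_dataloader.QADataConfig(
    is_training=True,
    input_path='/tmp/squad_train.tf_record',  # placeholder pre-processed TFRecord
    seq_length=384,
    global_batch_size=48)
dataset = question_answering_dataloader.QuestionAnsweringDataLoader(
    data_config).load()
features, labels = next(iter(dataset))
# features: input_word_ids, input_mask, input_type_ids
# labels: start_positions, end_positions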
question_answering_dataloader_test.py ADDED
@@ -0,0 +1,74 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for official.nlp.data.question_answering_dataloader."""
16
+ import os
17
+
18
+ import numpy as np
19
+ import tensorflow as tf, tf_keras
20
+
21
+ from official.nlp.data import question_answering_dataloader
22
+
23
+
24
+ def _create_fake_dataset(output_path, seq_length):
25
+ """Creates a fake dataset."""
26
+ writer = tf.io.TFRecordWriter(output_path)
27
+
28
+ def create_int_feature(values):
29
+ f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
30
+ return f
31
+
32
+ for _ in range(100):
33
+ features = {}
34
+ input_ids = np.random.randint(100, size=(seq_length))
35
+ features['input_ids'] = create_int_feature(input_ids)
36
+ features['input_mask'] = create_int_feature(np.ones_like(input_ids))
37
+ features['segment_ids'] = create_int_feature(np.ones_like(input_ids))
38
+ features['start_positions'] = create_int_feature(np.array([0]))
39
+ features['end_positions'] = create_int_feature(np.array([10]))
40
+
41
+ tf_example = tf.train.Example(features=tf.train.Features(feature=features))
42
+ writer.write(tf_example.SerializeToString())
43
+ writer.close()
44
+
45
+
46
+ class QuestionAnsweringDataTest(tf.test.TestCase):
47
+
48
+ def test_load_dataset(self):
49
+ seq_length = 128
50
+ batch_size = 10
51
+ input_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
52
+ _create_fake_dataset(input_path, seq_length)
53
+ data_config = question_answering_dataloader.QADataConfig(
54
+ is_training=True,
55
+ input_path=input_path,
56
+ seq_length=seq_length,
57
+ global_batch_size=batch_size)
58
+ dataset = question_answering_dataloader.QuestionAnsweringDataLoader(
59
+ data_config).load()
60
+ features, labels = next(iter(dataset))
61
+
62
+ self.assertCountEqual(['input_word_ids', 'input_mask', 'input_type_ids'],
63
+ features.keys())
64
+ self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
65
+ self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
66
+ self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
67
+
68
+ self.assertCountEqual(['start_positions', 'end_positions'], labels.keys())
69
+ self.assertEqual(labels['start_positions'].shape, (batch_size,))
70
+ self.assertEqual(labels['end_positions'].shape, (batch_size,))
71
+
72
+
73
+ if __name__ == '__main__':
74
+ tf.test.main()
sentence_prediction_dataloader.py ADDED
@@ -0,0 +1,267 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Loads dataset for the sentence prediction (classification) task."""
16
+ import dataclasses
17
+ import functools
18
+ from typing import List, Mapping, Optional, Tuple
19
+
20
+ import tensorflow as tf, tf_keras
21
+ import tensorflow_hub as hub
22
+
23
+ from official.common import dataset_fn
24
+ from official.core import config_definitions as cfg
25
+ from official.core import input_reader
26
+ from official.nlp import modeling
27
+ from official.nlp.data import data_loader
28
+ from official.nlp.data import data_loader_factory
29
+
30
+ LABEL_TYPES_MAP = {'int': tf.int64, 'float': tf.float32}
31
+
32
+
33
+ @dataclasses.dataclass
34
+ class SentencePredictionDataConfig(cfg.DataConfig):
35
+ """Data config for sentence prediction task (tasks/sentence_prediction)."""
36
+ input_path: str = ''
37
+ global_batch_size: int = 32
38
+ is_training: bool = True
39
+ seq_length: int = 128
40
+ label_type: str = 'int'
41
+ # Whether to include the example id number.
42
+ include_example_id: bool = False
43
+ label_field: str = 'label_ids'
44
+ # Maps the key in TfExample to feature name.
45
+ # E.g 'label_ids' to 'next_sentence_labels'
46
+ label_name: Optional[Tuple[str, str]] = None
47
+ # Either tfrecord, sstable, or recordio.
48
+ file_type: str = 'tfrecord'
49
+
50
+
51
+ @data_loader_factory.register_data_loader_cls(SentencePredictionDataConfig)
52
+ class SentencePredictionDataLoader(data_loader.DataLoader):
53
+ """A class to load dataset for sentence prediction (classification) task."""
54
+
55
+ def __init__(self, params):
56
+ self._params = params
57
+ self._seq_length = params.seq_length
58
+ self._include_example_id = params.include_example_id
59
+ self._label_field = params.label_field
60
+ if params.label_name:
61
+ self._label_name_mapping = dict([params.label_name])
62
+ else:
63
+ self._label_name_mapping = dict()
64
+
65
+ def name_to_features_spec(self):
66
+ """Defines features to decode. Subclass may override to append features."""
67
+ label_type = LABEL_TYPES_MAP[self._params.label_type]
68
+ name_to_features = {
69
+ 'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
70
+ 'input_mask': tf.io.FixedLenFeature([self._seq_length], tf.int64),
71
+ 'segment_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
72
+ self._label_field: tf.io.FixedLenFeature([], label_type),
73
+ }
74
+ if self._include_example_id:
75
+ name_to_features['example_id'] = tf.io.FixedLenFeature([], tf.int64)
76
+
77
+ return name_to_features
78
+
79
+ def _decode(self, record: tf.Tensor):
80
+ """Decodes a serialized tf.Example."""
81
+ example = tf.io.parse_single_example(record, self.name_to_features_spec())
82
+
83
+ # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
84
+ # So cast all int64 to int32.
85
+ for name in example:
86
+ t = example[name]
87
+ if t.dtype == tf.int64:
88
+ t = tf.cast(t, tf.int32)
89
+ example[name] = t
90
+
91
+ return example
92
+
93
+ def _parse(self, record: Mapping[str, tf.Tensor]):
94
+ """Parses raw tensors into a dict of tensors to be consumed by the model."""
95
+ key_mapping = {
96
+ 'input_ids': 'input_word_ids',
97
+ 'input_mask': 'input_mask',
98
+ 'segment_ids': 'input_type_ids'
99
+ }
100
+ ret = {}
101
+ for record_key in record:
102
+ if record_key in key_mapping:
103
+ ret[key_mapping[record_key]] = record[record_key]
104
+ else:
105
+ ret[record_key] = record[record_key]
106
+
107
+ if self._label_field in self._label_name_mapping:
108
+ ret[self._label_name_mapping[self._label_field]] = record[
109
+ self._label_field]
110
+
111
+ return ret
112
+
113
+ def load(self, input_context: Optional[tf.distribute.InputContext] = None):
114
+ """Returns a tf.dataset.Dataset."""
115
+ reader = input_reader.InputReader(
116
+ dataset_fn=dataset_fn.pick_dataset_fn(self._params.file_type),
117
+ params=self._params,
118
+ decoder_fn=self._decode,
119
+ parser_fn=self._parse)
120
+ return reader.read(input_context)
121
+
122
+
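Note: a minimal usage sketch of the pre-tokenized loader above, with a placeholder TFRecord path (the raw-text variant is covered further down).

from official.nlp.data import sentence_prediction_dataloader as loader

data_config = loader.SentencePredictionDataConfig(
    input_path='/tmp/glue_train.tf_record',  # placeholder pre-tokenized TFRecord
    seq_length=128,
    global_batch_size=32,
    label_type='int')
dataset = loader.SentencePredictionDataLoader(data_config).load()
features = next(iter(dataset))
# input_word_ids, input_mask, input_type_ids and label_ids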
123
+ @dataclasses.dataclass
124
+ class SentencePredictionTextDataConfig(cfg.DataConfig):
125
+ """Data config for sentence prediction task with raw text."""
126
+ # Either set `input_path`...
127
+ input_path: str = ''
128
+ # Either `int` or `float`.
129
+ label_type: str = 'int'
130
+ # ...or `tfds_name` and `tfds_split` to specify input.
131
+ tfds_name: str = ''
132
+ tfds_split: str = ''
133
+ # The name of the text feature fields. The text features will be
134
+ # concatenated in order.
135
+ text_fields: Optional[List[str]] = None
136
+ label_field: str = 'label'
137
+ global_batch_size: int = 32
138
+ seq_length: int = 128
139
+ is_training: bool = True
140
+ # Either build preprocessing with Python code by specifying these values
141
+ # for modeling.layers.BertTokenizer()/SentencepieceTokenizer()....
142
+ tokenization: str = 'WordPiece' # WordPiece or SentencePiece
143
+ # Text vocab file if tokenization is WordPiece, or sentencepiece.ModelProto
144
+ # file if tokenization is SentencePiece.
145
+ vocab_file: str = ''
146
+ lower_case: bool = True
147
+ # ...or load preprocessing from a SavedModel at this location.
148
+ preprocessing_hub_module_url: str = ''
149
+ # Either tfrecord or sstsable or recordio.
150
+ file_type: str = 'tfrecord'
151
+ include_example_id: bool = False
152
+
153
+
154
+ class TextProcessor(tf.Module):
155
+ """Text features processing for sentence prediction task."""
156
+
157
+ def __init__(self,
158
+ seq_length: int,
159
+ vocab_file: Optional[str] = None,
160
+ tokenization: Optional[str] = None,
161
+ lower_case: Optional[bool] = True,
162
+ preprocessing_hub_module_url: Optional[str] = None):
163
+ if preprocessing_hub_module_url:
164
+ self._preprocessing_hub_module = hub.load(preprocessing_hub_module_url)
165
+ self._tokenizer = self._preprocessing_hub_module.tokenize
166
+ self._pack_inputs = functools.partial(
167
+ self._preprocessing_hub_module.bert_pack_inputs,
168
+ seq_length=seq_length)
169
+ return
170
+
171
+ if tokenization == 'WordPiece':
172
+ self._tokenizer = modeling.layers.BertTokenizer(
173
+ vocab_file=vocab_file, lower_case=lower_case)
174
+ elif tokenization == 'SentencePiece':
175
+ self._tokenizer = modeling.layers.SentencepieceTokenizer(
176
+ model_file_path=vocab_file,
177
+ lower_case=lower_case,
178
+ strip_diacritics=True) # Strip diacritics to follow ALBERT model
179
+ else:
180
+ raise ValueError('Unsupported tokenization: %s' % tokenization)
181
+
182
+ self._pack_inputs = modeling.layers.BertPackInputs(
183
+ seq_length=seq_length,
184
+ special_tokens_dict=self._tokenizer.get_special_tokens_dict())
185
+
186
+ def __call__(self, segments):
187
+ segments = [self._tokenizer(s) for s in segments]
188
+ # BertTokenizer returns a RaggedTensor with shape [batch, word, subword],
189
+ # and SentencepieceTokenizer returns a RaggedTensor with shape
190
+ # [batch, sentencepiece].
191
+ segments = [
192
+ tf.cast(x.merge_dims(1, -1) if x.shape.rank > 2 else x, tf.int32)
193
+ for x in segments
194
+ ]
195
+ return self._pack_inputs(segments)
196
+
197
+
198
+ @data_loader_factory.register_data_loader_cls(SentencePredictionTextDataConfig)
199
+ class SentencePredictionTextDataLoader(data_loader.DataLoader):
200
+ """Loads dataset with raw text for sentence prediction task."""
201
+
202
+ def __init__(self, params):
203
+ if bool(params.tfds_name) != bool(params.tfds_split):
204
+ raise ValueError('`tfds_name` and `tfds_split` should be specified or '
205
+ 'unspecified at the same time.')
206
+ if bool(params.tfds_name) == bool(params.input_path):
207
+ raise ValueError('Must specify either `tfds_name` and `tfds_split` '
208
+ 'or `input_path`.')
209
+ if not params.text_fields:
210
+ raise ValueError('Unexpected empty text fields.')
211
+ if bool(params.vocab_file) == bool(params.preprocessing_hub_module_url):
212
+ raise ValueError('Must specify exactly one of vocab_file (with matching '
213
+ 'lower_case flag) or preprocessing_hub_module_url.')
214
+
215
+ self._params = params
216
+ self._text_fields = params.text_fields
217
+ self._label_field = params.label_field
218
+ self._label_type = params.label_type
219
+ self._include_example_id = params.include_example_id
220
+ self._text_processor = TextProcessor(
221
+ seq_length=params.seq_length,
222
+ vocab_file=params.vocab_file,
223
+ tokenization=params.tokenization,
224
+ lower_case=params.lower_case,
225
+ preprocessing_hub_module_url=params.preprocessing_hub_module_url)
226
+
227
+ def _bert_preprocess(self, record: Mapping[str, tf.Tensor]):
228
+ """Berts preprocess."""
229
+ segments = [record[x] for x in self._text_fields]
230
+ model_inputs = self._text_processor(segments)
231
+ for key in record:
232
+ if key not in self._text_fields:
233
+ model_inputs[key] = record[key]
234
+ return model_inputs
235
+
236
+ def name_to_features_spec(self):
237
+ name_to_features = {}
238
+ for text_field in self._text_fields:
239
+ name_to_features[text_field] = tf.io.FixedLenFeature([], tf.string)
240
+
241
+ label_type = LABEL_TYPES_MAP[self._label_type]
242
+ name_to_features[self._label_field] = tf.io.FixedLenFeature([], label_type)
243
+ if self._include_example_id:
244
+ name_to_features['example_id'] = tf.io.FixedLenFeature([], tf.int64)
245
+ return name_to_features
246
+
247
+ def _decode(self, record: tf.Tensor):
248
+ """Decodes a serialized tf.Example."""
249
+ example = tf.io.parse_single_example(record, self.name_to_features_spec())
250
+ # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
251
+ # So cast all int64 to int32.
252
+ for name in example:
253
+ t = example[name]
254
+ if t.dtype == tf.int64:
255
+ t = tf.cast(t, tf.int32)
256
+ example[name] = t
257
+
258
+ return example
259
+
260
+ def load(self, input_context: Optional[tf.distribute.InputContext] = None):
261
+ """Returns a tf.dataset.Dataset."""
262
+ reader = input_reader.InputReader(
263
+ dataset_fn=dataset_fn.pick_dataset_fn(self._params.file_type),
264
+ decoder_fn=self._decode if self._params.input_path else None,
265
+ params=self._params,
266
+ postprocess_fn=self._bert_preprocess)
267
+ return reader.read(input_context)
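Note: a minimal usage sketch of the raw-text loader above, using Python WordPiece preprocessing; the TFRecord and vocab paths are placeholders.

from official.nlp.data import sentence_prediction_dataloader as loader

data_config = loader.SentencePredictionTextDataConfig(
    input_path='/tmp/raw_pairs.tfrecord',  # placeholder TFRecord with raw text
    text_fields=['sentence1', 'sentence2'],
    vocab_file='/tmp/vocab.txt',           # placeholder WordPiece vocab
    tokenization='WordPiece',
    lower_case=True,
    seq_length=128,
    global_batch_size=32)
dataset = loader.SentencePredictionTextDataLoader(data_config).load()
features = next(iter(dataset))
# input_word_ids, input_mask, input_type_ids and the 'label' field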
sentence_prediction_dataloader_test.py ADDED
@@ -0,0 +1,290 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for official.nlp.data.sentence_prediction_dataloader."""
16
+ import os
17
+
18
+ from absl.testing import parameterized
19
+ import numpy as np
20
+ import tensorflow as tf, tf_keras
21
+
22
+ from sentencepiece import SentencePieceTrainer
23
+ from official.nlp.data import sentence_prediction_dataloader as loader
24
+
25
+
26
+ def _create_fake_preprocessed_dataset(output_path, seq_length, label_type):
27
+ """Creates a fake dataset."""
28
+ writer = tf.io.TFRecordWriter(output_path)
29
+
30
+ def create_int_feature(values):
31
+ f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
32
+ return f
33
+
34
+ def create_float_feature(values):
35
+ f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
36
+ return f
37
+
38
+ for _ in range(100):
39
+ features = {}
40
+ input_ids = np.random.randint(100, size=(seq_length))
41
+ features['input_ids'] = create_int_feature(input_ids)
42
+ features['input_mask'] = create_int_feature(np.ones_like(input_ids))
43
+ features['segment_ids'] = create_int_feature(np.ones_like(input_ids))
44
+
45
+ if label_type == 'int':
46
+ features['label_ids'] = create_int_feature([1])
47
+ elif label_type == 'float':
48
+ features['label_ids'] = create_float_feature([0.5])
49
+ else:
50
+ raise ValueError('Unsupported label_type: %s' % label_type)
51
+
52
+ tf_example = tf.train.Example(features=tf.train.Features(feature=features))
53
+ writer.write(tf_example.SerializeToString())
54
+ writer.close()
55
+
56
+
57
+ def _create_fake_raw_dataset(output_path, text_fields, label_type):
58
+ """Creates a fake tf record file."""
59
+ writer = tf.io.TFRecordWriter(output_path)
60
+
61
+ def create_str_feature(value):
62
+ f = tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
63
+ return f
64
+
65
+ def create_int_feature(values):
66
+ f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
67
+ return f
68
+
69
+ def create_float_feature(values):
70
+ f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
71
+ return f
72
+
73
+ for _ in range(100):
74
+ features = {}
75
+ for text_field in text_fields:
76
+ features[text_field] = create_str_feature([b'hello world'])
77
+
78
+ if label_type == 'int':
79
+ features['label'] = create_int_feature([0])
80
+ elif label_type == 'float':
81
+ features['label'] = create_float_feature([0.5])
82
+ else:
83
+ raise ValueError('Unexpected label_type: %s' % label_type)
84
+ tf_example = tf.train.Example(features=tf.train.Features(feature=features))
85
+ writer.write(tf_example.SerializeToString())
86
+ writer.close()
87
+
88
+
89
+ def _create_fake_sentencepiece_model(output_dir):
90
+ vocab = ['a', 'b', 'c', 'd', 'e', 'abc', 'def', 'ABC', 'DEF']
91
+ model_prefix = os.path.join(output_dir, 'spm_model')
92
+ input_text_file_path = os.path.join(output_dir, 'train_input.txt')
93
+ with tf.io.gfile.GFile(input_text_file_path, 'w') as f:
94
+ f.write(' '.join(vocab + ['\n']))
95
+ # Add 7 more tokens: <pad>, <unk>, [CLS], [SEP], [MASK], <s>, </s>.
96
+ full_vocab_size = len(vocab) + 7
97
+ flags = dict(
98
+ model_prefix=model_prefix,
99
+ model_type='word',
100
+ input=input_text_file_path,
101
+ pad_id=0,
102
+ unk_id=1,
103
+ control_symbols='[CLS],[SEP],[MASK]',
104
+ vocab_size=full_vocab_size,
105
+ bos_id=full_vocab_size - 2,
106
+ eos_id=full_vocab_size - 1)
107
+ SentencePieceTrainer.Train(' '.join(
108
+ ['--{}={}'.format(k, v) for k, v in flags.items()]))
109
+ return model_prefix + '.model'
110
+
111
+
112
+ def _create_fake_vocab_file(vocab_file_path):
113
+ tokens = ['[PAD]']
114
+ for i in range(1, 100):
115
+ tokens.append('[unused%d]' % i)
116
+ tokens.extend(['[UNK]', '[CLS]', '[SEP]', '[MASK]', 'hello', 'world'])
117
+ with tf.io.gfile.GFile(vocab_file_path, 'w') as outfile:
118
+ outfile.write('\n'.join(tokens))
119
+
120
+
121
+ class SentencePredictionDataTest(tf.test.TestCase, parameterized.TestCase):
122
+
123
+ @parameterized.parameters(('int', tf.int32), ('float', tf.float32))
124
+ def test_load_dataset(self, label_type, expected_label_type):
125
+ input_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
126
+ batch_size = 10
127
+ seq_length = 128
128
+ _create_fake_preprocessed_dataset(input_path, seq_length, label_type)
129
+ data_config = loader.SentencePredictionDataConfig(
130
+ input_path=input_path,
131
+ seq_length=seq_length,
132
+ global_batch_size=batch_size,
133
+ label_type=label_type)
134
+ dataset = loader.SentencePredictionDataLoader(data_config).load()
135
+ features = next(iter(dataset))
136
+ self.assertCountEqual(
137
+ ['input_word_ids', 'input_type_ids', 'input_mask', 'label_ids'],
138
+ features.keys())
139
+ self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
140
+ self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
141
+ self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
142
+ self.assertEqual(features['label_ids'].shape, (batch_size,))
143
+ self.assertEqual(features['label_ids'].dtype, expected_label_type)
144
+
145
+ def test_load_dataset_with_label_mapping(self):
146
+ input_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
147
+ batch_size = 10
148
+ seq_length = 128
149
+ _create_fake_preprocessed_dataset(input_path, seq_length, 'int')
150
+ data_config = loader.SentencePredictionDataConfig(
151
+ input_path=input_path,
152
+ seq_length=seq_length,
153
+ global_batch_size=batch_size,
154
+ label_type='int',
155
+ label_name=('label_ids', 'next_sentence_labels'))
156
+ dataset = loader.SentencePredictionDataLoader(data_config).load()
157
+ features = next(iter(dataset))
158
+ self.assertCountEqual([
159
+ 'input_word_ids', 'input_mask', 'input_type_ids',
160
+ 'next_sentence_labels', 'label_ids'
161
+ ], features.keys())
162
+ self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
163
+ self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
164
+ self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
165
+ self.assertEqual(features['label_ids'].shape, (batch_size,))
166
+ self.assertEqual(features['label_ids'].dtype, tf.int32)
167
+ self.assertEqual(features['next_sentence_labels'].shape, (batch_size,))
168
+ self.assertEqual(features['next_sentence_labels'].dtype, tf.int32)
169
+
170
+
171
+ class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase,
172
+ parameterized.TestCase):
173
+
174
+ @parameterized.parameters(True, False)
175
+ def test_python_wordpiece_preprocessing(self, use_tfds):
176
+ batch_size = 10
177
+ seq_length = 256 # Non-default value.
178
+ lower_case = True
179
+
180
+ tf_record_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
181
+ text_fields = ['sentence1', 'sentence2']
182
+ if not use_tfds:
183
+ _create_fake_raw_dataset(tf_record_path, text_fields, label_type='int')
184
+
185
+ vocab_file_path = os.path.join(self.get_temp_dir(), 'vocab.txt')
186
+ _create_fake_vocab_file(vocab_file_path)
187
+
188
+ data_config = loader.SentencePredictionTextDataConfig(
189
+ input_path='' if use_tfds else tf_record_path,
190
+ tfds_name='glue/mrpc' if use_tfds else '',
191
+ tfds_split='train' if use_tfds else '',
192
+ text_fields=text_fields,
193
+ global_batch_size=batch_size,
194
+ seq_length=seq_length,
195
+ is_training=True,
196
+ lower_case=lower_case,
197
+ vocab_file=vocab_file_path)
198
+ dataset = loader.SentencePredictionTextDataLoader(data_config).load()
199
+ features = next(iter(dataset))
200
+ label_field = data_config.label_field
201
+ expected_keys = [
202
+ 'input_word_ids', 'input_type_ids', 'input_mask', label_field
203
+ ]
204
+ if use_tfds:
205
+ expected_keys += ['idx']
206
+ self.assertCountEqual(expected_keys, features.keys())
207
+ self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
208
+ self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
209
+ self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
210
+ self.assertEqual(features[label_field].shape, (batch_size,))
211
+
212
+ @parameterized.parameters(True, False)
213
+ def test_python_sentencepiece_preprocessing(self, use_tfds):
214
+ batch_size = 10
215
+ seq_length = 256 # Non-default value.
216
+ lower_case = True
217
+
218
+ tf_record_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
219
+ text_fields = ['sentence1', 'sentence2']
220
+ if not use_tfds:
221
+ _create_fake_raw_dataset(tf_record_path, text_fields, label_type='int')
222
+
223
+ sp_model_file_path = _create_fake_sentencepiece_model(self.get_temp_dir())
224
+ data_config = loader.SentencePredictionTextDataConfig(
225
+ input_path='' if use_tfds else tf_record_path,
226
+ tfds_name='glue/mrpc' if use_tfds else '',
227
+ tfds_split='train' if use_tfds else '',
228
+ text_fields=text_fields,
229
+ global_batch_size=batch_size,
230
+ seq_length=seq_length,
231
+ is_training=True,
232
+ lower_case=lower_case,
233
+ tokenization='SentencePiece',
234
+ vocab_file=sp_model_file_path,
235
+ )
236
+ dataset = loader.SentencePredictionTextDataLoader(data_config).load()
237
+ features = next(iter(dataset))
238
+ label_field = data_config.label_field
239
+ expected_keys = [
240
+ 'input_word_ids', 'input_type_ids', 'input_mask', label_field
241
+ ]
242
+ if use_tfds:
243
+ expected_keys += ['idx']
244
+ self.assertCountEqual(expected_keys, features.keys())
245
+ self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
246
+ self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
247
+ self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
248
+ self.assertEqual(features[label_field].shape, (batch_size,))
249
+
250
+ @parameterized.parameters(True, False)
251
+ def test_saved_model_preprocessing(self, use_tfds):
252
+ batch_size = 10
253
+ seq_length = 256 # Non-default value.
254
+
255
+ tf_record_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
256
+ text_fields = ['sentence1', 'sentence2']
257
+ if not use_tfds:
258
+ _create_fake_raw_dataset(tf_record_path, text_fields, label_type='float')
259
+
260
+ vocab_file_path = os.path.join(self.get_temp_dir(), 'vocab.txt')
261
+ _create_fake_vocab_file(vocab_file_path)
262
+ data_config = loader.SentencePredictionTextDataConfig(
263
+ input_path='' if use_tfds else tf_record_path,
264
+ tfds_name='glue/mrpc' if use_tfds else '',
265
+ tfds_split='train' if use_tfds else '',
266
+ text_fields=text_fields,
267
+ global_batch_size=batch_size,
268
+ seq_length=seq_length,
269
+ is_training=True,
270
+ preprocessing_hub_module_url=(
271
+ 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'),
272
+ label_type='int' if use_tfds else 'float',
273
+ )
274
+ dataset = loader.SentencePredictionTextDataLoader(data_config).load()
275
+ features = next(iter(dataset))
276
+ label_field = data_config.label_field
277
+ expected_keys = [
278
+ 'input_word_ids', 'input_type_ids', 'input_mask', label_field
279
+ ]
280
+ if use_tfds:
281
+ expected_keys += ['idx']
282
+ self.assertCountEqual(expected_keys, features.keys())
283
+ self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
284
+ self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
285
+ self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
286
+ self.assertEqual(features[label_field].shape, (batch_size,))
287
+
288
+
289
+ if __name__ == '__main__':
290
+ tf.test.main()
sentence_retrieval_lib.py ADDED
@@ -0,0 +1,166 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """BERT library to process data for cross lingual sentence retrieval task."""
16
+
17
+ import os
18
+
19
+ from absl import logging
20
+ from official.nlp.data import classifier_data_lib
21
+ from official.nlp.tools import tokenization
22
+
23
+
24
+ class BuccProcessor(classifier_data_lib.DataProcessor):
25
+ """Procssor for Xtreme BUCC data set."""
26
+ supported_languages = ["de", "fr", "ru", "zh"]
27
+
28
+ def __init__(self, process_text_fn=tokenization.convert_to_unicode):
29
+ super(BuccProcessor, self).__init__(process_text_fn)
30
+ self.languages = BuccProcessor.supported_languages
31
+
32
+ def get_dev_examples(self, data_dir, file_pattern):
33
+ return self._create_examples(
34
+ self._read_tsv(os.path.join(data_dir, file_pattern.format("dev"))),
35
+ "sample")
36
+
37
+ def get_test_examples(self, data_dir, file_pattern):
38
+ return self._create_examples(
39
+ self._read_tsv(os.path.join(data_dir, file_pattern.format("test"))),
40
+ "test")
41
+
42
+ @staticmethod
43
+ def get_processor_name():
44
+ """See base class."""
45
+ return "BUCC"
46
+
47
+ def _create_examples(self, lines, set_type):
48
+ """Creates examples for the training and dev sets."""
49
+ examples = []
50
+ for (i, line) in enumerate(lines):
51
+ guid = "%s-%s" % (set_type, i)
52
+ example_id = int(line[0].split("-")[1])
53
+ text_a = self.process_text_fn(line[1])
54
+ examples.append(
55
+ classifier_data_lib.InputExample(
56
+ guid=guid, text_a=text_a, example_id=example_id))
57
+ return examples
58
+
59
+
60
+ class TatoebaProcessor(classifier_data_lib.DataProcessor):
61
+ """Procssor for Xtreme Tatoeba data set."""
62
+ supported_languages = [
63
+ "af", "ar", "bg", "bn", "de", "el", "es", "et", "eu", "fa", "fi", "fr",
64
+ "he", "hi", "hu", "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr",
65
+ "nl", "pt", "ru", "sw", "ta", "te", "th", "tl", "tr", "ur", "vi", "zh"
66
+ ]
67
+
68
+ def __init__(self, process_text_fn=tokenization.convert_to_unicode):
69
+ super(TatoebaProcessor, self).__init__(process_text_fn)
70
+ self.languages = TatoebaProcessor.supported_languages
71
+
72
+ def get_test_examples(self, data_dir, file_path):
73
+ return self._create_examples(
74
+ self._read_tsv(os.path.join(data_dir, file_path)), "test")
75
+
76
+ @staticmethod
77
+ def get_processor_name():
78
+ """See base class."""
79
+ return "TATOEBA"
80
+
81
+ def _create_examples(self, lines, set_type):
82
+ """Creates examples for the training and dev sets."""
83
+ examples = []
84
+ for (i, line) in enumerate(lines):
85
+ guid = "%s-%s" % (set_type, i)
86
+ text_a = self.process_text_fn(line[0])
87
+ examples.append(
88
+ classifier_data_lib.InputExample(
89
+ guid=guid, text_a=text_a, example_id=i))
90
+ return examples
91
+
92
+
93
+ def generate_sentence_retrevial_tf_record(processor,
94
+ data_dir,
95
+ tokenizer,
96
+ eval_data_output_path=None,
97
+ test_data_output_path=None,
98
+ max_seq_length=128):
99
+ """Generates the tf records for retrieval tasks.
100
+
101
+ Args:
102
+ processor: Input processor object to be used for generating data. Subclass
103
+ of `DataProcessor`.
104
+ data_dir: Directory that contains train/eval data to process. Data files
105
+ should be in from.
106
+ tokenizer: The tokenizer to be applied on the data.
107
+ eval_data_output_path: Output to which processed tf record for evaluation
108
+ will be saved.
109
+ test_data_output_path: Output to which processed tf record for testing
110
+ will be saved. Must be a pattern template with {} if processor has
111
+ language specific test data.
112
+ max_seq_length: Maximum sequence length of the to be generated
113
+ training/eval data.
114
+
115
+ Returns:
116
+ A dictionary containing input meta data.
117
+ """
118
+ assert eval_data_output_path or test_data_output_path
119
+
120
+ if processor.get_processor_name() == "BUCC":
121
+ path_pattern = "{}-en.{{}}.{}"
122
+
123
+ if processor.get_processor_name() == "TATOEBA":
124
+ path_pattern = "{}-en.{}"
125
+
126
+ meta_data = {
127
+ "processor_type": processor.get_processor_name(),
128
+ "max_seq_length": max_seq_length,
129
+ "number_eval_data": {},
130
+ "number_test_data": {},
131
+ }
132
+ logging.info("Start to process %s task data", processor.get_processor_name())
133
+
134
+ for lang_a in processor.languages:
135
+ for lang_b in [lang_a, "en"]:
136
+ if eval_data_output_path:
137
+ eval_input_data_examples = processor.get_dev_examples(
138
+ data_dir, os.path.join(path_pattern.format(lang_a, lang_b)))
139
+
140
+ num_eval_data = len(eval_input_data_examples)
141
+ logging.info("Processing %d dev examples of %s-en.%s", num_eval_data,
142
+ lang_a, lang_b)
143
+ output_file = os.path.join(
144
+ eval_data_output_path,
145
+ "{}-en-{}.{}.tfrecords".format(lang_a, lang_b, "dev"))
146
+ classifier_data_lib.file_based_convert_examples_to_features(
147
+ eval_input_data_examples, None, max_seq_length, tokenizer,
148
+ output_file, None)
149
+ meta_data["number_eval_data"][f"{lang_a}-en.{lang_b}"] = num_eval_data
150
+
151
+ if test_data_output_path:
152
+ test_input_data_examples = processor.get_test_examples(
153
+ data_dir, os.path.join(path_pattern.format(lang_a, lang_b)))
154
+
155
+ num_test_data = len(test_input_data_examples)
156
+ logging.info("Processing %d test examples of %s-en.%s", num_test_data,
157
+ lang_a, lang_b)
158
+ output_file = os.path.join(
159
+ test_data_output_path,
160
+ "{}-en-{}.{}.tfrecords".format(lang_a, lang_b, "test"))
161
+ classifier_data_lib.file_based_convert_examples_to_features(
162
+ test_input_data_examples, None, max_seq_length, tokenizer,
163
+ output_file, None)
164
+ meta_data["number_test_data"][f"{lang_a}-en.{lang_b}"] = num_test_data
165
+
166
+ return meta_data
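Note: a minimal usage sketch of the retrieval-data helper above for the Tatoeba task. The data directory, output path and vocab file are placeholders, and the FullTokenizer construction is an assumption about the tokenization module rather than something shown in this commit.

from official.nlp.data import sentence_retrieval_lib
from official.nlp.tools import tokenization

processor = sentence_retrieval_lib.TatoebaProcessor()
tokenizer = tokenization.FullTokenizer(
    vocab_file='/tmp/vocab.txt', do_lower_case=True)  # placeholder vocab
meta_data = sentence_retrieval_lib.generate_sentence_retrevial_tf_record(
    processor,
    data_dir='/tmp/tatoeba',                    # placeholder XTREME data dir
    tokenizer=tokenizer,
    test_data_output_path='/tmp/tatoeba_tfrecords',
    max_seq_length=128)
# meta_data records the number of test examples written per language pair.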
squad_lib.py ADDED
@@ -0,0 +1,975 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Library to process data for SQuAD 1.1 and SQuAD 2.0."""
16
+ # pylint: disable=g-bad-import-order
17
+ import collections
18
+ import copy
19
+ import json
20
+ import math
21
+ import os
22
+
23
+ import six
24
+
25
+ from absl import logging
26
+ import tensorflow as tf, tf_keras
27
+
28
+ from official.nlp.tools import tokenization
29
+
30
+
31
+ class SquadExample(object):
32
+ """A single training/test example for simple sequence classification.
33
+
34
+ For examples without an answer, the start and end position are -1.
35
+
36
+ Attributes:
37
+ qas_id: ID of the question-answer pair.
38
+ question_text: Original text for the question.
39
+ doc_tokens: The list of tokens in the context obtained by splitting on
40
+ whitespace only.
41
+ orig_answer_text: Original text for the answer.
42
+ start_position: Starting index of the answer in `doc_tokens`.
43
+ end_position: Ending index of the answer in `doc_tokens`.
44
+ is_impossible: Whether the question is impossible to answer given the
45
+ context. Only used in SQuAD 2.0.
46
+ """
47
+
48
+ def __init__(self,
49
+ qas_id,
50
+ question_text,
51
+ doc_tokens,
52
+ orig_answer_text=None,
53
+ start_position=None,
54
+ end_position=None,
55
+ is_impossible=False):
56
+ self.qas_id = qas_id
57
+ self.question_text = question_text
58
+ self.doc_tokens = doc_tokens
59
+ self.orig_answer_text = orig_answer_text
60
+ self.start_position = start_position
61
+ self.end_position = end_position
62
+ self.is_impossible = is_impossible
63
+
64
+ def __str__(self):
65
+ return self.__repr__()
66
+
67
+ def __repr__(self):
68
+ s = ""
69
+ s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
70
+ s += ", question_text: %s" % (
71
+ tokenization.printable_text(self.question_text))
72
+ s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
73
+ if self.start_position:
74
+ s += ", start_position: %d" % (self.start_position)
75
+ if self.start_position:
76
+ s += ", end_position: %d" % (self.end_position)
77
+ if self.start_position:
78
+ s += ", is_impossible: %r" % (self.is_impossible)
79
+ return s
80
+
81
+
82
+ class InputFeatures(object):
83
+ """A single set of features of data."""
84
+
85
+ def __init__(self,
86
+ unique_id,
87
+ example_index,
88
+ doc_span_index,
89
+ tokens,
90
+ token_to_orig_map,
91
+ token_is_max_context,
92
+ input_ids,
93
+ input_mask,
94
+ segment_ids,
95
+ paragraph_mask=None,
96
+ class_index=None,
97
+ start_position=None,
98
+ end_position=None,
99
+ is_impossible=None):
100
+ self.unique_id = unique_id
101
+ self.example_index = example_index
102
+ self.doc_span_index = doc_span_index
103
+ self.tokens = tokens
104
+ self.token_to_orig_map = token_to_orig_map
105
+ self.token_is_max_context = token_is_max_context
106
+ self.input_ids = input_ids
107
+ self.input_mask = input_mask
108
+ self.segment_ids = segment_ids
109
+ self.start_position = start_position
110
+ self.end_position = end_position
111
+ self.is_impossible = is_impossible
112
+ self.paragraph_mask = paragraph_mask
113
+ self.class_index = class_index
114
+
115
+
116
+ class FeatureWriter(object):
117
+ """Writes InputFeature to TF example file."""
118
+
119
+ def __init__(self, filename, is_training):
120
+ self.filename = filename
121
+ self.is_training = is_training
122
+ self.num_features = 0
123
+ tf.io.gfile.makedirs(os.path.dirname(filename))
124
+ self._writer = tf.io.TFRecordWriter(filename)
125
+
126
+ def process_feature(self, feature):
127
+ """Write a InputFeature to the TFRecordWriter as a tf.train.Example."""
128
+ self.num_features += 1
129
+
130
+ def create_int_feature(values):
131
+ feature = tf.train.Feature(
132
+ int64_list=tf.train.Int64List(value=list(values)))
133
+ return feature
134
+
135
+ features = collections.OrderedDict()
136
+ features["unique_ids"] = create_int_feature([feature.unique_id])
137
+ features["input_ids"] = create_int_feature(feature.input_ids)
138
+ features["input_mask"] = create_int_feature(feature.input_mask)
139
+ features["segment_ids"] = create_int_feature(feature.segment_ids)
140
+
141
+ if feature.paragraph_mask is not None:
142
+ features["paragraph_mask"] = create_int_feature(feature.paragraph_mask)
143
+ if feature.class_index is not None:
144
+ features["class_index"] = create_int_feature([feature.class_index])
145
+
146
+ if self.is_training:
147
+ features["start_positions"] = create_int_feature([feature.start_position])
148
+ features["end_positions"] = create_int_feature([feature.end_position])
149
+ impossible = 0
150
+ if feature.is_impossible:
151
+ impossible = 1
152
+ features["is_impossible"] = create_int_feature([impossible])
153
+
154
+ tf_example = tf.train.Example(features=tf.train.Features(feature=features))
155
+ self._writer.write(tf_example.SerializeToString())
156
+
157
+ def close(self):
158
+ self._writer.close()
159
+
160
+
161
+ def read_squad_examples(input_file, is_training,
162
+ version_2_with_negative,
163
+ translated_input_folder=None):
164
+ """Read a SQuAD json file into a list of SquadExample."""
165
+ with tf.io.gfile.GFile(input_file, "r") as reader:
166
+ input_data = json.load(reader)["data"]
167
+
168
+ if translated_input_folder is not None:
169
+ translated_files = tf.io.gfile.glob(
170
+ os.path.join(translated_input_folder, "*.json"))
171
+ for file in translated_files:
172
+ with tf.io.gfile.GFile(file, "r") as reader:
173
+ input_data.extend(json.load(reader)["data"])
174
+
175
+ def is_whitespace(c):
176
+ if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
177
+ return True
178
+ return False
179
+
180
+ examples = []
181
+ for entry in input_data:
182
+ for paragraph in entry["paragraphs"]:
183
+ paragraph_text = paragraph["context"]
184
+ doc_tokens = []
185
+ char_to_word_offset = []
186
+ prev_is_whitespace = True
187
+ for c in paragraph_text:
188
+ if is_whitespace(c):
189
+ prev_is_whitespace = True
190
+ else:
191
+ if prev_is_whitespace:
192
+ doc_tokens.append(c)
193
+ else:
194
+ doc_tokens[-1] += c
195
+ prev_is_whitespace = False
196
+ char_to_word_offset.append(len(doc_tokens) - 1)
197
+
198
+ for qa in paragraph["qas"]:
199
+ qas_id = qa["id"]
200
+ question_text = qa["question"]
201
+ start_position = None
202
+ end_position = None
203
+ orig_answer_text = None
204
+ is_impossible = False
205
+ if is_training:
206
+
207
+ if version_2_with_negative:
208
+ is_impossible = qa["is_impossible"]
209
+ if (len(qa["answers"]) != 1) and (not is_impossible):
210
+ raise ValueError(
211
+ "For training, each question should have exactly 1 answer.")
212
+ if not is_impossible:
213
+ answer = qa["answers"][0]
214
+ orig_answer_text = answer["text"]
215
+ answer_offset = answer["answer_start"]
216
+ answer_length = len(orig_answer_text)
217
+ start_position = char_to_word_offset[answer_offset]
218
+ end_position = char_to_word_offset[answer_offset + answer_length -
219
+ 1]
220
+ # Only add answers where the text can be exactly recovered from the
221
+ # document. If this CAN'T happen it's likely due to weird Unicode
222
+ # stuff so we will just skip the example.
223
+ #
224
+ # Note that this means for training mode, every example is NOT
225
+ # guaranteed to be preserved.
226
+ actual_text = " ".join(doc_tokens[start_position:(end_position +
227
+ 1)])
228
+ cleaned_answer_text = " ".join(
229
+ tokenization.whitespace_tokenize(orig_answer_text))
230
+ if actual_text.find(cleaned_answer_text) == -1:
231
+ logging.warning("Could not find answer: '%s' vs. '%s'",
232
+ actual_text, cleaned_answer_text)
233
+ continue
234
+ else:
235
+ start_position = -1
236
+ end_position = -1
237
+ orig_answer_text = ""
238
+
239
+ example = SquadExample(
240
+ qas_id=qas_id,
241
+ question_text=question_text,
242
+ doc_tokens=doc_tokens,
243
+ orig_answer_text=orig_answer_text,
244
+ start_position=start_position,
245
+ end_position=end_position,
246
+ is_impossible=is_impossible)
247
+ examples.append(example)
248
+
249
+ return examples
250
+
251
+
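The answer alignment in read_squad_examples hinges on char_to_word_offset: every character of the context is mapped to the index of the whitespace-delimited word it falls in, so a character-level answer_start becomes a word-level start/end. A small standalone sketch of that mapping:

def char_to_word_positions(context, answer_start, answer_text):
  # Mirrors the whitespace split and the offset table built above.
  doc_tokens, char_to_word_offset = [], []
  prev_is_whitespace = True
  for c in context:
    if c in " \t\r\n" or ord(c) == 0x202F:
      prev_is_whitespace = True
    else:
      if prev_is_whitespace:
        doc_tokens.append(c)
      else:
        doc_tokens[-1] += c
      prev_is_whitespace = False
    char_to_word_offset.append(len(doc_tokens) - 1)
  start = char_to_word_offset[answer_start]
  end = char_to_word_offset[answer_start + len(answer_text) - 1]
  return doc_tokens, start, end

tokens, s, e = char_to_word_positions("The cat sat on the mat.", 8, "sat")
print(tokens[s:e + 1])  # -> ['sat']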
252
+ def convert_examples_to_features(examples,
253
+ tokenizer,
254
+ max_seq_length,
255
+ doc_stride,
256
+ max_query_length,
257
+ is_training,
258
+ output_fn,
259
+ xlnet_format=False,
260
+ batch_size=None):
261
+ """Loads a data file into a list of `InputBatch`s."""
262
+
263
+ base_id = 1000000000
264
+ unique_id = base_id
265
+ feature = None
266
+ for (example_index, example) in enumerate(examples):
267
+ query_tokens = tokenizer.tokenize(example.question_text)
268
+
269
+ if len(query_tokens) > max_query_length:
270
+ query_tokens = query_tokens[0:max_query_length]
271
+
272
+ tok_to_orig_index = []
273
+ orig_to_tok_index = []
274
+ all_doc_tokens = []
275
+ for (i, token) in enumerate(example.doc_tokens):
276
+ orig_to_tok_index.append(len(all_doc_tokens))
277
+ sub_tokens = tokenizer.tokenize(token)
278
+ for sub_token in sub_tokens:
279
+ tok_to_orig_index.append(i)
280
+ all_doc_tokens.append(sub_token)
281
+
282
+ tok_start_position = None
283
+ tok_end_position = None
284
+ if is_training and example.is_impossible:
285
+ tok_start_position = -1
286
+ tok_end_position = -1
287
+ if is_training and not example.is_impossible:
288
+ tok_start_position = orig_to_tok_index[example.start_position]
289
+ if example.end_position < len(example.doc_tokens) - 1:
290
+ tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
291
+ else:
292
+ tok_end_position = len(all_doc_tokens) - 1
293
+ (tok_start_position, tok_end_position) = _improve_answer_span(
294
+ all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
295
+ example.orig_answer_text)
296
+
297
+ # The -3 accounts for [CLS], [SEP] and [SEP]
298
+ max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
299
+
300
+ # We can have documents that are longer than the maximum sequence length.
301
+ # To deal with this we do a sliding window approach, where we take chunks
302
+ # of up to our max length with a stride of `doc_stride`.
303
+ _DocSpan = collections.namedtuple( # pylint: disable=invalid-name
304
+ "DocSpan", ["start", "length"])
305
+ doc_spans = []
306
+ start_offset = 0
307
+ while start_offset < len(all_doc_tokens):
308
+ length = len(all_doc_tokens) - start_offset
309
+ if length > max_tokens_for_doc:
310
+ length = max_tokens_for_doc
311
+ doc_spans.append(_DocSpan(start=start_offset, length=length))
312
+ if start_offset + length == len(all_doc_tokens):
313
+ break
314
+ start_offset += min(length, doc_stride)
315
+
316
+ for (doc_span_index, doc_span) in enumerate(doc_spans):
317
+ tokens = []
318
+ token_to_orig_map = {}
319
+ token_is_max_context = {}
320
+ segment_ids = []
321
+
322
+ # Paragraph mask used in XLNet.
323
+ # 1 represents paragraph and class tokens.
324
+ # 0 represents query and other special tokens.
325
+ paragraph_mask = []
326
+
327
+ # pylint: disable=cell-var-from-loop
328
+ def process_query(seg_q):
329
+ for token in query_tokens:
330
+ tokens.append(token)
331
+ segment_ids.append(seg_q)
332
+ paragraph_mask.append(0)
333
+ tokens.append("[SEP]")
334
+ segment_ids.append(seg_q)
335
+ paragraph_mask.append(0)
336
+
337
+ def process_paragraph(seg_p):
338
+ for i in range(doc_span.length):
339
+ split_token_index = doc_span.start + i
340
+ token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
341
+
342
+ is_max_context = _check_is_max_context(doc_spans, doc_span_index,
343
+ split_token_index)
344
+ token_is_max_context[len(tokens)] = is_max_context
345
+ tokens.append(all_doc_tokens[split_token_index])
346
+ segment_ids.append(seg_p)
347
+ paragraph_mask.append(1)
348
+ tokens.append("[SEP]")
349
+ segment_ids.append(seg_p)
350
+ paragraph_mask.append(0)
351
+
352
+ def process_class(seg_class):
353
+ class_index = len(segment_ids)
354
+ tokens.append("[CLS]")
355
+ segment_ids.append(seg_class)
356
+ paragraph_mask.append(1)
357
+ return class_index
358
+
359
+ if xlnet_format:
360
+ seg_p, seg_q, seg_class, seg_pad = 0, 1, 2, 3
361
+ process_paragraph(seg_p)
362
+ process_query(seg_q)
363
+ class_index = process_class(seg_class)
364
+ else:
365
+ seg_p, seg_q, seg_class, seg_pad = 1, 0, 0, 0
366
+ class_index = process_class(seg_class)
367
+ process_query(seg_q)
368
+ process_paragraph(seg_p)
369
+
370
+ input_ids = tokenizer.convert_tokens_to_ids(tokens)
371
+
372
+ # The mask has 1 for real tokens and 0 for padding tokens. Only real
373
+ # tokens are attended to.
374
+ input_mask = [1] * len(input_ids)
375
+
376
+ # Zero-pad up to the sequence length.
377
+ while len(input_ids) < max_seq_length:
378
+ input_ids.append(0)
379
+ input_mask.append(0)
380
+ segment_ids.append(seg_pad)
381
+ paragraph_mask.append(0)
382
+
383
+ assert len(input_ids) == max_seq_length
384
+ assert len(input_mask) == max_seq_length
385
+ assert len(segment_ids) == max_seq_length
386
+ assert len(paragraph_mask) == max_seq_length
387
+
388
+ start_position = 0
389
+ end_position = 0
390
+ span_contains_answer = False
391
+
392
+ if is_training and not example.is_impossible:
393
+ # For training, if our document chunk does not contain an annotation
394
+ # we throw it out, since there is nothing to predict.
395
+ doc_start = doc_span.start
396
+ doc_end = doc_span.start + doc_span.length - 1
397
+ span_contains_answer = (tok_start_position >= doc_start and
398
+ tok_end_position <= doc_end)
399
+ if span_contains_answer:
400
+ doc_offset = 0 if xlnet_format else len(query_tokens) + 2
401
+ start_position = tok_start_position - doc_start + doc_offset
402
+ end_position = tok_end_position - doc_start + doc_offset
403
+
404
+ if example_index < 20:
405
+ logging.info("*** Example ***")
406
+ logging.info("unique_id: %s", (unique_id))
407
+ logging.info("example_index: %s", (example_index))
408
+ logging.info("doc_span_index: %s", (doc_span_index))
409
+ logging.info("tokens: %s",
410
+ " ".join([tokenization.printable_text(x) for x in tokens]))
411
+ logging.info(
412
+ "token_to_orig_map: %s", " ".join([
413
+ "%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)
414
+ ]))
415
+ logging.info(
416
+ "token_is_max_context: %s", " ".join([
417
+ "%d:%s" % (x, y)
418
+ for (x, y) in six.iteritems(token_is_max_context)
419
+ ]))
420
+ logging.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
421
+ logging.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
422
+ logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
423
+ logging.info("paragraph_mask: %s", " ".join(
424
+ [str(x) for x in paragraph_mask]))
425
+ logging.info("class_index: %d", class_index)
426
+ if is_training:
427
+ if span_contains_answer:
428
+ answer_text = " ".join(tokens[start_position:(end_position + 1)])
429
+ logging.info("start_position: %d", (start_position))
430
+ logging.info("end_position: %d", (end_position))
431
+ logging.info("answer: %s", tokenization.printable_text(answer_text))
432
+ else:
433
+ logging.info("document span doesn't contain answer")
434
+
435
+ feature = InputFeatures(
436
+ unique_id=unique_id,
437
+ example_index=example_index,
438
+ doc_span_index=doc_span_index,
439
+ tokens=tokens,
440
+ paragraph_mask=paragraph_mask,
441
+ class_index=class_index,
442
+ token_to_orig_map=token_to_orig_map,
443
+ token_is_max_context=token_is_max_context,
444
+ input_ids=input_ids,
445
+ input_mask=input_mask,
446
+ segment_ids=segment_ids,
447
+ start_position=start_position,
448
+ end_position=end_position,
449
+ is_impossible=not span_contains_answer)
450
+
451
+ # Run callback
452
+ if is_training:
453
+ output_fn(feature)
454
+ else:
455
+ output_fn(feature, is_padding=False)
456
+
457
+ unique_id += 1
458
+
459
+ if not is_training and feature:
460
+ assert batch_size
461
+ num_padding = 0
462
+ num_examples = unique_id - base_id
463
+ if unique_id % batch_size != 0:
464
+ num_padding = batch_size - (num_examples % batch_size)
465
+ logging.info("Adding padding examples to make sure no partial batch.")
466
+ logging.info("Adds %d padding examples for inference.", num_padding)
467
+ dummy_feature = copy.deepcopy(feature)
468
+ for _ in range(num_padding):
469
+ dummy_feature.unique_id = unique_id
470
+
471
+ # Run callback
472
+ output_fn(feature, is_padding=True)
473
+ unique_id += 1
474
+ return unique_id - base_id
475
+
476
+
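The sliding-window logic in convert_examples_to_features is easiest to see in isolation; a minimal sketch with illustrative numbers showing how max_tokens_for_doc and doc_stride produce overlapping spans:

import collections

DocSpan = collections.namedtuple("DocSpan", ["start", "length"])

def make_doc_spans(num_doc_tokens, max_tokens_for_doc, doc_stride):
  # Same loop as above, with each chunk capped at max_tokens_for_doc.
  spans, start_offset = [], 0
  while start_offset < num_doc_tokens:
    length = min(num_doc_tokens - start_offset, max_tokens_for_doc)
    spans.append(DocSpan(start=start_offset, length=length))
    if start_offset + length == num_doc_tokens:
      break
    start_offset += min(length, doc_stride)
  return spans

print(make_doc_spans(num_doc_tokens=10, max_tokens_for_doc=4, doc_stride=2))
# -> [DocSpan(start=0, length=4), DocSpan(start=2, length=4),
#     DocSpan(start=4, length=4), DocSpan(start=6, length=4)]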
477
+ def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
478
+ orig_answer_text):
479
+ """Returns tokenized answer spans that better match the annotated answer."""
480
+
481
+ # The SQuAD annotations are character based. We first project them to
482
+ # whitespace-tokenized words. But then after WordPiece tokenization, we can
483
+ # often find a "better match". For example:
484
+ #
485
+ # Question: What year was John Smith born?
486
+ # Context: The leader was John Smith (1895-1943).
487
+ # Answer: 1895
488
+ #
489
+ # The original whitespace-tokenized answer will be "(1895-1943).". However
490
+ # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
491
+ # the exact answer, 1895.
492
+ #
493
+ # However, this is not always possible. Consider the following:
494
+ #
495
+ # Question: What country is the top exporter of electronics?
496
+ # Context: The Japanese electronics industry is the largest in the world.
497
+ # Answer: Japan
498
+ #
499
+ # In this case, the annotator chose "Japan" as a character sub-span of
500
+ # the word "Japanese". Since our WordPiece tokenizer does not split
501
+ # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
502
+ # in SQuAD, but does happen.
503
+ tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
504
+
505
+ for new_start in range(input_start, input_end + 1):
506
+ for new_end in range(input_end, new_start - 1, -1):
507
+ text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
508
+ if text_span == tok_answer_text:
509
+ return (new_start, new_end)
510
+
511
+ return (input_start, input_end)
512
+
513
+
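The "(1895-1943)." case from the comment in _improve_answer_span can be reproduced with a stub standing in for the WordPiece tokenizer (the real code passes the project's FullTokenizer); the search narrows the whitespace-level span [5, 10] to the single sub-token "1895":

class StubTokenizer:
  # Hypothetical sub-word splits, just enough for this demo.
  def tokenize(self, text):
    return {"1895": ["1895"]}[text]

def improve_answer_span(doc_tokens, start, end, tokenizer, orig_answer_text):
  tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
  for new_start in range(start, end + 1):
    for new_end in range(end, new_start - 1, -1):
      if " ".join(doc_tokens[new_start:new_end + 1]) == tok_answer_text:
        return new_start, new_end
  return start, end

doc_tokens = ["The", "leader", "was", "John", "Smith",
              "(", "1895", "-", "1943", ")", "."]
print(improve_answer_span(doc_tokens, 5, 10, StubTokenizer(), "1895"))  # -> (6, 6)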
514
+ def _check_is_max_context(doc_spans, cur_span_index, position):
515
+ """Check if this is the 'max context' doc span for the token."""
516
+
517
+ # Because of the sliding window approach taken to scoring documents, a single
518
+ # token can appear in multiple documents. E.g.
519
+ # Doc: the man went to the store and bought a gallon of milk
520
+ # Span A: the man went to the
521
+ # Span B: to the store and bought
522
+ # Span C: and bought a gallon of
523
+ # ...
524
+ #
525
+ # Now the word 'bought' will have two scores from spans B and C. We only
526
+ # want to consider the score with "maximum context", which we define as
527
+ # the *minimum* of its left and right context (the *sum* of left and
528
+ # right context will always be the same, of course).
529
+ #
530
+ # In the example the maximum context for 'bought' would be span C since
531
+ # it has 1 left context and 3 right context, while span B has 4 left context
532
+ # and 0 right context.
533
+ best_score = None
534
+ best_span_index = None
535
+ for (span_index, doc_span) in enumerate(doc_spans):
536
+ end = doc_span.start + doc_span.length - 1
537
+ if position < doc_span.start:
538
+ continue
539
+ if position > end:
540
+ continue
541
+ num_left_context = position - doc_span.start
542
+ num_right_context = end - position
543
+ score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
544
+ if best_score is None or score > best_score:
545
+ best_score = score
546
+ best_span_index = span_index
547
+
548
+ return cur_span_index == best_span_index
549
+
550
+
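Applying the min(left, right) + 0.01 * length rule to the 'bought' example in the comment shows why span C wins; a short sketch with hand-built spans:

import collections

DocSpan = collections.namedtuple("DocSpan", ["start", "length"])

def max_context_score(doc_span, position):
  # Score used by _check_is_max_context for a token at `position`.
  end = doc_span.start + doc_span.length - 1
  if not doc_span.start <= position <= end:
    return None
  left = position - doc_span.start
  right = end - position
  return min(left, right) + 0.01 * doc_span.length

# Token index 7 is 'bought' in "the man went to the store and bought a gallon of milk".
span_b = DocSpan(start=3, length=5)  # "to the store and bought"
span_c = DocSpan(start=6, length=5)  # "and bought a gallon of"
print(max_context_score(span_b, 7), max_context_score(span_c, 7))  # -> 0.05 1.05, so C wins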
551
+ def write_predictions(all_examples,
552
+ all_features,
553
+ all_results,
554
+ n_best_size,
555
+ max_answer_length,
556
+ do_lower_case,
557
+ output_prediction_file,
558
+ output_nbest_file,
559
+ output_null_log_odds_file,
560
+ version_2_with_negative=False,
561
+ null_score_diff_threshold=0.0,
562
+ verbose=False):
563
+ """Write final predictions to the json file and log-odds of null if needed."""
564
+ logging.info("Writing predictions to: %s", (output_prediction_file))
565
+ logging.info("Writing nbest to: %s", (output_nbest_file))
566
+
567
+ all_predictions, all_nbest_json, scores_diff_json = (
568
+ postprocess_output(
569
+ all_examples=all_examples,
570
+ all_features=all_features,
571
+ all_results=all_results,
572
+ n_best_size=n_best_size,
573
+ max_answer_length=max_answer_length,
574
+ do_lower_case=do_lower_case,
575
+ version_2_with_negative=version_2_with_negative,
576
+ null_score_diff_threshold=null_score_diff_threshold,
577
+ verbose=verbose))
578
+
579
+ write_to_json_files(all_predictions, output_prediction_file)
580
+ write_to_json_files(all_nbest_json, output_nbest_file)
581
+ if version_2_with_negative:
582
+ write_to_json_files(scores_diff_json, output_null_log_odds_file)
583
+
584
+
585
+ def postprocess_output(all_examples,
586
+ all_features,
587
+ all_results,
588
+ n_best_size,
589
+ max_answer_length,
590
+ do_lower_case,
591
+ version_2_with_negative=False,
592
+ null_score_diff_threshold=0.0,
593
+ xlnet_format=False,
594
+ verbose=False):
595
+ """Postprocess model output, to form predicton results."""
596
+
597
+ example_index_to_features = collections.defaultdict(list)
598
+ for feature in all_features:
599
+ example_index_to_features[feature.example_index].append(feature)
600
+ unique_id_to_result = {}
601
+ for result in all_results:
602
+ unique_id_to_result[result.unique_id] = result
603
+
604
+ _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
605
+ "PrelimPrediction",
606
+ ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])
607
+
608
+ all_predictions = collections.OrderedDict()
609
+ all_nbest_json = collections.OrderedDict()
610
+ scores_diff_json = collections.OrderedDict()
611
+
612
+ for (example_index, example) in enumerate(all_examples):
613
+ features = example_index_to_features[example_index]
614
+
615
+ prelim_predictions = []
616
+ # keep track of the minimum score of null start+end of position 0
617
+ score_null = 1000000 # large and positive
618
+ min_null_feature_index = 0 # the paragraph slice with min null score
619
+ null_start_logit = 0 # the start logit at the slice with min null score
620
+ null_end_logit = 0 # the end logit at the slice with min null score
621
+ for (feature_index, feature) in enumerate(features):
622
+ if feature.unique_id not in unique_id_to_result:
623
+ logging.info("Skip eval example %s, not in pred.", feature.unique_id)
624
+ continue
625
+ result = unique_id_to_result[feature.unique_id]
626
+
627
+ # if we could have irrelevant answers, get the min score of irrelevant
628
+ if version_2_with_negative:
629
+ if xlnet_format:
630
+ feature_null_score = result.class_logits
631
+ else:
632
+ feature_null_score = result.start_logits[0] + result.end_logits[0]
633
+ if feature_null_score < score_null:
634
+ score_null = feature_null_score
635
+ min_null_feature_index = feature_index
636
+ null_start_logit = result.start_logits[0]
637
+ null_end_logit = result.end_logits[0]
638
+ for (start_index, start_logit,
639
+ end_index, end_logit) in _get_best_indexes_and_logits(
640
+ result=result,
641
+ n_best_size=n_best_size,
642
+ xlnet_format=xlnet_format):
643
+ # We could hypothetically create invalid predictions, e.g., predict
644
+ # that the start of the span is in the question. We throw out all
645
+ # invalid predictions.
646
+ if start_index >= len(feature.tokens):
647
+ continue
648
+ if end_index >= len(feature.tokens):
649
+ continue
650
+ if start_index not in feature.token_to_orig_map:
651
+ continue
652
+ if end_index not in feature.token_to_orig_map:
653
+ continue
654
+ if not feature.token_is_max_context.get(start_index, False):
655
+ continue
656
+ if end_index < start_index:
657
+ continue
658
+ length = end_index - start_index + 1
659
+ if length > max_answer_length:
660
+ continue
661
+ prelim_predictions.append(
662
+ _PrelimPrediction(
663
+ feature_index=feature_index,
664
+ start_index=start_index,
665
+ end_index=end_index,
666
+ start_logit=start_logit,
667
+ end_logit=end_logit))
668
+
669
+ if version_2_with_negative and not xlnet_format:
670
+ prelim_predictions.append(
671
+ _PrelimPrediction(
672
+ feature_index=min_null_feature_index,
673
+ start_index=0,
674
+ end_index=0,
675
+ start_logit=null_start_logit,
676
+ end_logit=null_end_logit))
677
+ prelim_predictions = sorted(
678
+ prelim_predictions,
679
+ key=lambda x: (x.start_logit + x.end_logit),
680
+ reverse=True)
681
+
682
+ _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
683
+ "NbestPrediction", ["text", "start_logit", "end_logit"])
684
+
685
+ seen_predictions = {}
686
+ nbest = []
687
+ for pred in prelim_predictions:
688
+ if len(nbest) >= n_best_size:
689
+ break
690
+ feature = features[pred.feature_index]
691
+ if pred.start_index > 0 or xlnet_format: # this is a non-null prediction
692
+ tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
693
+ orig_doc_start = feature.token_to_orig_map[pred.start_index]
694
+ orig_doc_end = feature.token_to_orig_map[pred.end_index]
695
+ orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
696
+ tok_text = " ".join(tok_tokens)
697
+
698
+ # De-tokenize WordPieces that have been split off.
699
+ tok_text = tok_text.replace(" ##", "")
700
+ tok_text = tok_text.replace("##", "")
701
+
702
+ # Clean whitespace
703
+ tok_text = tok_text.strip()
704
+ tok_text = " ".join(tok_text.split())
705
+ orig_text = " ".join(orig_tokens)
706
+
707
+ final_text = get_final_text(
708
+ tok_text, orig_text, do_lower_case, verbose=verbose)
709
+ if final_text in seen_predictions:
710
+ continue
711
+
712
+ seen_predictions[final_text] = True
713
+ else:
714
+ final_text = ""
715
+ seen_predictions[final_text] = True
716
+
717
+ nbest.append(
718
+ _NbestPrediction(
719
+ text=final_text,
720
+ start_logit=pred.start_logit,
721
+ end_logit=pred.end_logit))
722
+
723
+ # if we didn't include the empty option in the n-best, include it
724
+ if version_2_with_negative and not xlnet_format:
725
+ if "" not in seen_predictions:
726
+ nbest.append(
727
+ _NbestPrediction(
728
+ text="", start_logit=null_start_logit,
729
+ end_logit=null_end_logit))
730
+ # In very rare edge cases we could have no valid predictions. So we
731
+ # just create a nonce prediction in this case to avoid failure.
732
+ if not nbest:
733
+ nbest.append(
734
+ _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
735
+
736
+ assert len(nbest) >= 1
737
+
738
+ total_scores = []
739
+ best_non_null_entry = None
740
+ for entry in nbest:
741
+ total_scores.append(entry.start_logit + entry.end_logit)
742
+ if not best_non_null_entry:
743
+ if entry.text:
744
+ best_non_null_entry = entry
745
+
746
+ probs = _compute_softmax(total_scores)
747
+
748
+ nbest_json = []
749
+ for (i, entry) in enumerate(nbest):
750
+ output = collections.OrderedDict()
751
+ output["text"] = entry.text
752
+ output["probability"] = probs[i]
753
+ output["start_logit"] = entry.start_logit
754
+ output["end_logit"] = entry.end_logit
755
+ nbest_json.append(output)
756
+
757
+ assert len(nbest_json) >= 1
758
+
759
+ if not version_2_with_negative:
760
+ all_predictions[example.qas_id] = nbest_json[0]["text"]
761
+ else:
762
+ # pytype: disable=attribute-error
763
+ # predict "" iff the null score - the score of best non-null > threshold
764
+ if best_non_null_entry is not None:
765
+ if xlnet_format:
766
+ score_diff = score_null
767
+ scores_diff_json[example.qas_id] = score_diff
768
+ all_predictions[example.qas_id] = best_non_null_entry.text
769
+ else:
770
+ score_diff = score_null - best_non_null_entry.start_logit - (
771
+ best_non_null_entry.end_logit)
772
+ scores_diff_json[example.qas_id] = score_diff
773
+ if score_diff > null_score_diff_threshold:
774
+ all_predictions[example.qas_id] = ""
775
+ else:
776
+ all_predictions[example.qas_id] = best_non_null_entry.text
777
+ else:
778
+ logging.warning("best_non_null_entry is None")
779
+ scores_diff_json[example.qas_id] = score_null
780
+ all_predictions[example.qas_id] = ""
781
+ # pytype: enable=attribute-error
782
+
783
+ all_nbest_json[example.qas_id] = nbest_json
784
+
785
+ return all_predictions, all_nbest_json, scores_diff_json
786
+
787
+
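For SQuAD 2.0, the last step of postprocess_output reduces to comparing the null score against the best non-null span; a minimal sketch of that decision with made-up logits:

def pick_answer(score_null, best_start_logit, best_end_logit, best_text,
                null_score_diff_threshold=0.0):
  # Predict "" iff null score minus the best non-null score exceeds the threshold.
  score_diff = score_null - best_start_logit - best_end_logit
  if score_diff > null_score_diff_threshold:
    return "", score_diff
  return best_text, score_diff

print(pick_answer(1.2, 3.0, 2.5, "Steve Smith"))  # keeps "Steve Smith" (diff below threshold)
print(pick_answer(7.0, 3.0, 2.5, "Steve Smith"))  # returns "" (diff above threshold)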
788
+ def write_to_json_files(json_records, json_file):
789
+ with tf.io.gfile.GFile(json_file, "w") as writer:
790
+ writer.write(json.dumps(json_records, indent=4) + "\n")
791
+
792
+
793
+ def get_final_text(pred_text, orig_text, do_lower_case, verbose=False):
794
+ """Project the tokenized prediction back to the original text."""
795
+
796
+ # When we created the data, we kept track of the alignment between original
797
+ # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
798
+ # now `orig_text` contains the span of our original text corresponding to the
799
+ # span that we predicted.
800
+ #
801
+ # However, `orig_text` may contain extra characters that we don't want in
802
+ # our prediction.
803
+ #
804
+ # For example, let's say:
805
+ # pred_text = steve smith
806
+ # orig_text = Steve Smith's
807
+ #
808
+ # We don't want to return `orig_text` because it contains the extra "'s".
809
+ #
810
+ # We don't want to return `pred_text` because it's already been normalized
811
+ # (the SQuAD eval script also does punctuation stripping/lower casing but
812
+ # our tokenizer does additional normalization like stripping accent
813
+ # characters).
814
+ #
815
+ # What we really want to return is "Steve Smith".
816
+ #
817
+ # Therefore, we have to apply a semi-complicated alignment heuristic between
818
+ # `pred_text` and `orig_text` to get a character-to-character alignment. This
819
+ # can fail in certain cases in which case we just return `orig_text`.
820
+
821
+ def _strip_spaces(text):
822
+ ns_chars = []
823
+ ns_to_s_map = collections.OrderedDict()
824
+ for (i, c) in enumerate(text):
825
+ if c == " ":
826
+ continue
827
+ ns_to_s_map[len(ns_chars)] = i
828
+ ns_chars.append(c)
829
+ ns_text = "".join(ns_chars)
830
+ return (ns_text, ns_to_s_map)
831
+
832
+ # We first tokenize `orig_text`, strip whitespace from the result
833
+ # and `pred_text`, and check if they are the same length. If they are
834
+ # NOT the same length, the heuristic has failed. If they are the same
835
+ # length, we assume the characters are one-to-one aligned.
836
+ tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
837
+
838
+ tok_text = " ".join(tokenizer.tokenize(orig_text))
839
+
840
+ start_position = tok_text.find(pred_text)
841
+ if start_position == -1:
842
+ if verbose:
843
+ logging.info("Unable to find text: '%s' in '%s'", pred_text, orig_text)
844
+ return orig_text
845
+ end_position = start_position + len(pred_text) - 1
846
+
847
+ (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
848
+ (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
849
+
850
+ if len(orig_ns_text) != len(tok_ns_text):
851
+ if verbose:
852
+ logging.info("Length not equal after stripping spaces: '%s' vs '%s'",
853
+ orig_ns_text, tok_ns_text)
854
+ return orig_text
855
+
856
+ # We then project the characters in `pred_text` back to `orig_text` using
857
+ # the character-to-character alignment.
858
+ tok_s_to_ns_map = {}
859
+ for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
860
+ tok_s_to_ns_map[tok_index] = i
861
+
862
+ orig_start_position = None
863
+ if start_position in tok_s_to_ns_map:
864
+ ns_start_position = tok_s_to_ns_map[start_position]
865
+ if ns_start_position in orig_ns_to_s_map:
866
+ orig_start_position = orig_ns_to_s_map[ns_start_position]
867
+
868
+ if orig_start_position is None:
869
+ if verbose:
870
+ logging.info("Couldn't map start position")
871
+ return orig_text
872
+
873
+ orig_end_position = None
874
+ if end_position in tok_s_to_ns_map:
875
+ ns_end_position = tok_s_to_ns_map[end_position]
876
+ if ns_end_position in orig_ns_to_s_map:
877
+ orig_end_position = orig_ns_to_s_map[ns_end_position]
878
+
879
+ if orig_end_position is None:
880
+ if verbose:
881
+ logging.info("Couldn't map end position")
882
+ return orig_text
883
+
884
+ output_text = orig_text[orig_start_position:(orig_end_position + 1)]
885
+ return output_text
886
+
887
+
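The heart of get_final_text is the non-space character alignment; a self-contained sketch of the "Steve Smith's" example from the comment, with the lower-cased tok_text written by hand in place of BasicTokenizer output:

import collections

def strip_spaces(text):
  # Same helper as _strip_spaces above: non-space chars plus an index map.
  ns_chars, ns_to_s_map = [], collections.OrderedDict()
  for i, c in enumerate(text):
    if c == " ":
      continue
    ns_to_s_map[len(ns_chars)] = i
    ns_chars.append(c)
  return "".join(ns_chars), ns_to_s_map

pred_text = "steve smith"
orig_text = "Steve Smith's"
tok_text = "steve smith ' s"      # hand-written stand-in for the tokenized orig_text

start = tok_text.find(pred_text)             # 0
end = start + len(pred_text) - 1             # 10
orig_ns, orig_map = strip_spaces(orig_text)  # "SteveSmith's"
tok_ns, tok_map = strip_spaces(tok_text)     # "stevesmith's"
assert len(orig_ns) == len(tok_ns)           # alignment is one-to-one

tok_s_to_ns = {s: ns for ns, s in tok_map.items()}
orig_start = orig_map[tok_s_to_ns[start]]
orig_end = orig_map[tok_s_to_ns[end]]
print(orig_text[orig_start:orig_end + 1])    # -> Steve Smith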
888
+ def _get_best_indexes_and_logits(result,
889
+ n_best_size,
890
+ xlnet_format=False):
891
+ """Generates the n-best indexes and logits from a list."""
892
+ if xlnet_format:
893
+ for i in range(n_best_size):
894
+ for j in range(n_best_size):
895
+ j_index = i * n_best_size + j
896
+ yield (result.start_indexes[i], result.start_logits[i],
897
+ result.end_indexes[j_index], result.end_logits[j_index])
898
+ else:
899
+ start_index_and_score = sorted(enumerate(result.start_logits),
900
+ key=lambda x: x[1], reverse=True)
901
+ end_index_and_score = sorted(enumerate(result.end_logits),
902
+ key=lambda x: x[1], reverse=True)
903
+ for i in range(len(start_index_and_score)):
904
+ if i >= n_best_size:
905
+ break
906
+ for j in range(len(end_index_and_score)):
907
+ if j >= n_best_size:
908
+ break
909
+ yield (start_index_and_score[i][0], start_index_and_score[i][1],
910
+ end_index_and_score[j][0], end_index_and_score[j][1])
911
+
912
+
913
+ def _compute_softmax(scores):
914
+ """Compute softmax probability over raw logits."""
915
+ if not scores:
916
+ return []
917
+
918
+ max_score = None
919
+ for score in scores:
920
+ if max_score is None or score > max_score:
921
+ max_score = score
922
+
923
+ exp_scores = []
924
+ total_sum = 0.0
925
+ for score in scores:
926
+ x = math.exp(score - max_score)
927
+ exp_scores.append(x)
928
+ total_sum += x
929
+
930
+ probs = []
931
+ for score in exp_scores:
932
+ probs.append(score / total_sum)
933
+ return probs
934
+
935
+
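_compute_softmax subtracts the maximum logit before exponentiating, which leaves the probabilities unchanged but keeps math.exp from overflowing on large logits; a quick numeric check of the same formula:

import math

def softmax(scores):
  m = max(scores)
  exps = [math.exp(s - m) for s in scores]  # shifted logits stay in a safe range
  total = sum(exps)
  return [e / total for e in exps]

print([round(p, 3) for p in softmax([1000.0, 999.0, 998.0])])
# -> [0.665, 0.245, 0.09]; math.exp(1000.0) on its own raises OverflowError.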
936
+ def generate_tf_record_from_json_file(input_file_path,
937
+ vocab_file_path,
938
+ output_path,
939
+ translated_input_folder=None,
940
+ max_seq_length=384,
941
+ do_lower_case=True,
942
+ max_query_length=64,
943
+ doc_stride=128,
944
+ version_2_with_negative=False,
945
+ xlnet_format=False):
946
+ """Generates and saves training data into a tf record file."""
947
+ train_examples = read_squad_examples(
948
+ input_file=input_file_path,
949
+ is_training=True,
950
+ version_2_with_negative=version_2_with_negative,
951
+ translated_input_folder=translated_input_folder)
952
+ tokenizer = tokenization.FullTokenizer(
953
+ vocab_file=vocab_file_path, do_lower_case=do_lower_case)
954
+ train_writer = FeatureWriter(filename=output_path, is_training=True)
955
+ number_of_examples = convert_examples_to_features(
956
+ examples=train_examples,
957
+ tokenizer=tokenizer,
958
+ max_seq_length=max_seq_length,
959
+ doc_stride=doc_stride,
960
+ max_query_length=max_query_length,
961
+ is_training=True,
962
+ output_fn=train_writer.process_feature,
963
+ xlnet_format=xlnet_format)
964
+ train_writer.close()
965
+
966
+ meta_data = {
967
+ "task_type": "bert_squad",
968
+ "train_data_size": number_of_examples,
969
+ "max_seq_length": max_seq_length,
970
+ "max_query_length": max_query_length,
971
+ "doc_stride": doc_stride,
972
+ "version_2_with_negative": version_2_with_negative,
973
+ }
974
+
975
+ return meta_data
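A typical call to generate_tf_record_from_json_file looks like the following; the paths are hypothetical and the import assumes this module is used as official.nlp.data.squad_lib:

from official.nlp.data import squad_lib

meta_data = squad_lib.generate_tf_record_from_json_file(
    input_file_path="/tmp/train-v1.1.json",  # hypothetical SQuAD json
    vocab_file_path="/tmp/vocab.txt",        # hypothetical BERT vocab
    output_path="/tmp/train.tf_record",
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    version_2_with_negative=False)
print(meta_data["train_data_size"])  # number of features written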
squad_lib_sp.py ADDED
@@ -0,0 +1,976 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Run ALBERT on SQuAD 1.1 and SQuAD 2.0 using sentence piece tokenization.
16
+
17
+ The file is forked from:
18
+
19
+ https://github.com/google-research/ALBERT/blob/master/run_squad_sp.py
20
+ """
21
+ import collections
22
+ import copy
23
+ import json
24
+ import math
25
+ import os
26
+
27
+ from absl import logging
28
+ import numpy as np
29
+ import tensorflow as tf, tf_keras
30
+
31
+ from official.nlp.tools import tokenization
32
+
33
+
34
+ class SquadExample(object):
35
+ """A single training/test example for simple sequence classification.
36
+
37
+ For examples without an answer, the start and end position are -1.
38
+ """
39
+
40
+ def __init__(self,
41
+ qas_id,
42
+ question_text,
43
+ paragraph_text,
44
+ orig_answer_text=None,
45
+ start_position=None,
46
+ end_position=None,
47
+ is_impossible=False):
48
+ self.qas_id = qas_id
49
+ self.question_text = question_text
50
+ self.paragraph_text = paragraph_text
51
+ self.orig_answer_text = orig_answer_text
52
+ self.start_position = start_position
53
+ self.end_position = end_position
54
+ self.is_impossible = is_impossible
55
+
56
+ def __str__(self):
57
+ return self.__repr__()
58
+
59
+ def __repr__(self):
60
+ s = ""
61
+ s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
62
+ s += ", question_text: %s" % (
63
+ tokenization.printable_text(self.question_text))
64
+ s += ", paragraph_text: [%s]" % (" ".join(self.paragraph_text))
65
+ if self.start_position:
66
+ s += ", start_position: %d" % (self.start_position,)
67
+ if self.start_position:
68
+ s += ", end_position: %d" % (self.end_position)
69
+ if self.start_position:
70
+ s += ", is_impossible: %r" % (self.is_impossible)
71
+ return s
72
+
73
+
74
+ class InputFeatures(object):
75
+ """A single set of features of data."""
76
+
77
+ def __init__(self,
78
+ unique_id,
79
+ example_index,
80
+ doc_span_index,
81
+ tok_start_to_orig_index,
82
+ tok_end_to_orig_index,
83
+ token_is_max_context,
84
+ tokens,
85
+ input_ids,
86
+ input_mask,
87
+ segment_ids,
88
+ paragraph_len,
89
+ class_index=None,
90
+ paragraph_mask=None,
91
+ start_position=None,
92
+ end_position=None,
93
+ is_impossible=None):
94
+ self.unique_id = unique_id
95
+ self.example_index = example_index
96
+ self.doc_span_index = doc_span_index
97
+ self.tok_start_to_orig_index = tok_start_to_orig_index
98
+ self.tok_end_to_orig_index = tok_end_to_orig_index
99
+ self.token_is_max_context = token_is_max_context
100
+ self.tokens = tokens
101
+ self.input_ids = input_ids
102
+ self.input_mask = input_mask
103
+ self.paragraph_mask = paragraph_mask
104
+ self.segment_ids = segment_ids
105
+ self.paragraph_len = paragraph_len
106
+ self.class_index = class_index
107
+ self.start_position = start_position
108
+ self.end_position = end_position
109
+ self.is_impossible = is_impossible
110
+
111
+
112
+ def read_squad_examples(input_file,
113
+ is_training,
114
+ version_2_with_negative,
115
+ translated_input_folder=None):
116
+ """Read a SQuAD json file into a list of SquadExample."""
117
+ del version_2_with_negative
118
+ with tf.io.gfile.GFile(input_file, "r") as reader:
119
+ input_data = json.load(reader)["data"]
120
+
121
+ if translated_input_folder is not None:
122
+ translated_files = tf.io.gfile.glob(
123
+ os.path.join(translated_input_folder, "*.json"))
124
+ for file in translated_files:
125
+ with tf.io.gfile.GFile(file, "r") as reader:
126
+ input_data.extend(json.load(reader)["data"])
127
+
128
+ examples = []
129
+ for entry in input_data:
130
+ for paragraph in entry["paragraphs"]:
131
+ paragraph_text = paragraph["context"]
132
+
133
+ for qa in paragraph["qas"]:
134
+ qas_id = qa["id"]
135
+ question_text = qa["question"]
136
+ start_position = None
137
+ orig_answer_text = None
138
+ is_impossible = False
139
+
140
+ if is_training:
141
+ is_impossible = qa.get("is_impossible", False)
142
+ if (len(qa["answers"]) != 1) and (not is_impossible):
143
+ raise ValueError(
144
+ "For training, each question should have exactly 1 answer.")
145
+ if not is_impossible:
146
+ answer = qa["answers"][0]
147
+ orig_answer_text = answer["text"]
148
+ start_position = answer["answer_start"]
149
+ else:
150
+ start_position = -1
151
+ orig_answer_text = ""
152
+
153
+ example = SquadExample(
154
+ qas_id=qas_id,
155
+ question_text=question_text,
156
+ paragraph_text=paragraph_text,
157
+ orig_answer_text=orig_answer_text,
158
+ start_position=start_position,
159
+ is_impossible=is_impossible)
160
+ examples.append(example)
161
+
162
+ return examples
163
+
164
+
165
+ def _convert_index(index, pos, m=None, is_start=True):
166
+ """Converts index."""
167
+ if index[pos] is not None:
168
+ return index[pos]
169
+ n = len(index)
170
+ rear = pos
171
+ while rear < n - 1 and index[rear] is None:
172
+ rear += 1
173
+ front = pos
174
+ while front > 0 and index[front] is None:
175
+ front -= 1
176
+ assert index[front] is not None or index[rear] is not None
177
+ if index[front] is None:
178
+ if index[rear] >= 1: # pytype: disable=unsupported-operands
179
+ if is_start:
180
+ return 0
181
+ else:
182
+ return index[rear] - 1
183
+ return index[rear]
184
+ if index[rear] is None:
185
+ if m is not None and index[front] < m - 1:
186
+ if is_start:
187
+ return index[front] + 1
188
+ else:
189
+ return m - 1
190
+ return index[front]
191
+ if is_start:
192
+ if index[rear] > index[front] + 1:
193
+ return index[front] + 1
194
+ else:
195
+ return index[rear]
196
+ else:
197
+ if index[rear] > index[front] + 1:
198
+ return index[rear] - 1
199
+ else:
200
+ return index[front]
201
+
202
+
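_convert_index is what keeps the character alignment usable when the LCS match further down leaves gaps (None entries): it falls back to the nearest mapped neighbour on the appropriate side. A small example, calling the function above on a hand-made alignment:

chartok_to_orig = [0, None, None, 3, 4]  # hypothetical alignment with a gap

print(_convert_index(chartok_to_orig, 0, is_start=True))   # -> 0 (already mapped)
print(_convert_index(chartok_to_orig, 1, is_start=True))   # -> 1 (front neighbour + 1)
print(_convert_index(chartok_to_orig, 2, is_start=False))  # -> 2 (rear neighbour - 1)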
203
+ def convert_examples_to_features(examples,
204
+ tokenizer,
205
+ max_seq_length,
206
+ doc_stride,
207
+ max_query_length,
208
+ is_training,
209
+ output_fn,
210
+ do_lower_case,
211
+ xlnet_format=False,
212
+ batch_size=None):
213
+ """Loads a data file into a list of `InputBatch`s."""
214
+ cnt_pos, cnt_neg = 0, 0
215
+ base_id = 1000000000
216
+ unique_id = base_id
217
+ max_n, max_m = 1024, 1024
218
+ f = np.zeros((max_n, max_m), dtype=np.float32)
219
+
220
+ for (example_index, example) in enumerate(examples):
221
+
222
+ if example_index % 100 == 0:
223
+ logging.info("Converting %d/%d pos %d neg %d", example_index,
224
+ len(examples), cnt_pos, cnt_neg)
225
+
226
+ query_tokens = tokenization.encode_ids(
227
+ tokenizer.sp_model,
228
+ tokenization.preprocess_text(
229
+ example.question_text, lower=do_lower_case))
230
+
231
+ if len(query_tokens) > max_query_length:
232
+ query_tokens = query_tokens[0:max_query_length]
233
+
234
+ paragraph_text = example.paragraph_text
235
+ para_tokens = tokenization.encode_pieces(
236
+ tokenizer.sp_model,
237
+ tokenization.preprocess_text(
238
+ example.paragraph_text, lower=do_lower_case))
239
+
240
+ chartok_to_tok_index = []
241
+ tok_start_to_chartok_index = []
242
+ tok_end_to_chartok_index = []
243
+ char_cnt = 0
244
+ for i, token in enumerate(para_tokens):
245
+ new_token = token.replace(tokenization.SPIECE_UNDERLINE, " ")
246
+ chartok_to_tok_index.extend([i] * len(new_token))
247
+ tok_start_to_chartok_index.append(char_cnt)
248
+ char_cnt += len(new_token)
249
+ tok_end_to_chartok_index.append(char_cnt - 1)
250
+
251
+ tok_cat_text = "".join(para_tokens).replace(tokenization.SPIECE_UNDERLINE,
252
+ " ")
253
+ n, m = len(paragraph_text), len(tok_cat_text)
254
+
255
+ if n > max_n or m > max_m:
256
+ max_n = max(n, max_n)
257
+ max_m = max(m, max_m)
258
+ f = np.zeros((max_n, max_m), dtype=np.float32)
259
+
260
+ g = {}
261
+
262
+ # pylint: disable=cell-var-from-loop
263
+ def _lcs_match(max_dist, n=n, m=m):
264
+ """Longest-common-substring algorithm."""
265
+ f.fill(0)
266
+ g.clear()
267
+
268
+ ### longest common sub sequence
269
+ # f[i, j] = max(f[i - 1, j], f[i, j - 1], f[i - 1, j - 1] + match(i, j))
270
+ for i in range(n):
271
+
272
+ # unlike standard LCS, this is specifically optimized for the setting
273
+ # because the mismatch between sentence pieces and original text will
274
+ # be small
275
+ for j in range(i - max_dist, i + max_dist):
276
+ if j >= m or j < 0:
277
+ continue
278
+
279
+ if i > 0:
280
+ g[(i, j)] = 0
281
+ f[i, j] = f[i - 1, j]
282
+
283
+ if j > 0 and f[i, j - 1] > f[i, j]:
284
+ g[(i, j)] = 1
285
+ f[i, j] = f[i, j - 1]
286
+
287
+ f_prev = f[i - 1, j - 1] if i > 0 and j > 0 else 0
288
+ if (tokenization.preprocess_text(
289
+ paragraph_text[i], lower=do_lower_case,
290
+ remove_space=False) == tok_cat_text[j] and f_prev + 1 > f[i, j]):
291
+ g[(i, j)] = 2
292
+ f[i, j] = f_prev + 1
293
+
294
+ # pylint: enable=cell-var-from-loop
295
+
296
+ max_dist = abs(n - m) + 5
297
+ for _ in range(2):
298
+ _lcs_match(max_dist)
299
+ if f[n - 1, m - 1] > 0.8 * n:
300
+ break
301
+ max_dist *= 2
302
+
303
+ orig_to_chartok_index = [None] * n
304
+ chartok_to_orig_index = [None] * m
305
+ i, j = n - 1, m - 1
306
+ while i >= 0 and j >= 0:
307
+ if (i, j) not in g:
308
+ break
309
+ if g[(i, j)] == 2:
310
+ orig_to_chartok_index[i] = j
311
+ chartok_to_orig_index[j] = i
312
+ i, j = i - 1, j - 1
313
+ elif g[(i, j)] == 1:
314
+ j = j - 1
315
+ else:
316
+ i = i - 1
317
+
318
+ if (all(v is None for v in orig_to_chartok_index) or
319
+ f[n - 1, m - 1] < 0.8 * n):
320
+ logging.info("MISMATCH DETECTED!")
321
+ continue
322
+
323
+ tok_start_to_orig_index = []
324
+ tok_end_to_orig_index = []
325
+ for i in range(len(para_tokens)):
326
+ start_chartok_pos = tok_start_to_chartok_index[i]
327
+ end_chartok_pos = tok_end_to_chartok_index[i]
328
+ start_orig_pos = _convert_index(
329
+ chartok_to_orig_index, start_chartok_pos, n, is_start=True)
330
+ end_orig_pos = _convert_index(
331
+ chartok_to_orig_index, end_chartok_pos, n, is_start=False)
332
+
333
+ tok_start_to_orig_index.append(start_orig_pos)
334
+ tok_end_to_orig_index.append(end_orig_pos)
335
+
336
+ if not is_training:
337
+ tok_start_position = tok_end_position = None
338
+
339
+ if is_training and example.is_impossible:
340
+ tok_start_position = 0
341
+ tok_end_position = 0
342
+
343
+ if is_training and not example.is_impossible:
344
+ start_position = example.start_position
345
+ end_position = start_position + len(example.orig_answer_text) - 1
346
+
347
+ start_chartok_pos = _convert_index(
348
+ orig_to_chartok_index, start_position, is_start=True)
349
+ tok_start_position = chartok_to_tok_index[start_chartok_pos]
350
+
351
+ end_chartok_pos = _convert_index(
352
+ orig_to_chartok_index, end_position, is_start=False)
353
+ tok_end_position = chartok_to_tok_index[end_chartok_pos]
354
+ assert tok_start_position <= tok_end_position
355
+
356
+ def _piece_to_id(x):
357
+ return tokenizer.sp_model.PieceToId(x)
358
+
359
+ all_doc_tokens = list(map(_piece_to_id, para_tokens))
360
+
361
+ # The -3 accounts for [CLS], [SEP] and [SEP]
362
+ max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
363
+
364
+ # We can have documents that are longer than the maximum sequence length.
365
+ # To deal with this we do a sliding window approach, where we take chunks
366
+ # of up to our max length with a stride of `doc_stride`.
367
+ _DocSpan = collections.namedtuple( # pylint: disable=invalid-name
368
+ "DocSpan", ["start", "length"])
369
+ doc_spans = []
370
+ start_offset = 0
371
+
372
+ while start_offset < len(all_doc_tokens):
373
+ length = len(all_doc_tokens) - start_offset
374
+ if length > max_tokens_for_doc:
375
+ length = max_tokens_for_doc
376
+ doc_spans.append(_DocSpan(start=start_offset, length=length))
377
+ if start_offset + length == len(all_doc_tokens):
378
+ break
379
+ start_offset += min(length, doc_stride)
380
+
381
+ for (doc_span_index, doc_span) in enumerate(doc_spans):
382
+ tokens = []
383
+ token_is_max_context = {}
384
+ segment_ids = []
385
+
386
+ # Paragraph mask used in XLNet.
387
+ # 1 represents paragraph and class tokens.
388
+ # 0 represents query and other special tokens.
389
+ paragraph_mask = []
390
+
391
+ cur_tok_start_to_orig_index = []
392
+ cur_tok_end_to_orig_index = []
393
+
394
+ # pylint: disable=cell-var-from-loop
395
+ def process_query(seg_q):
396
+ for token in query_tokens:
397
+ tokens.append(token)
398
+ segment_ids.append(seg_q)
399
+ paragraph_mask.append(0)
400
+ tokens.append(tokenizer.sp_model.PieceToId("[SEP]"))
401
+ segment_ids.append(seg_q)
402
+ paragraph_mask.append(0)
403
+
404
+ def process_paragraph(seg_p):
405
+ for i in range(doc_span.length):
406
+ split_token_index = doc_span.start + i
407
+
408
+ cur_tok_start_to_orig_index.append(
409
+ tok_start_to_orig_index[split_token_index])
410
+ cur_tok_end_to_orig_index.append(
411
+ tok_end_to_orig_index[split_token_index])
412
+
413
+ is_max_context = _check_is_max_context(doc_spans, doc_span_index,
414
+ split_token_index)
415
+ token_is_max_context[len(tokens)] = is_max_context
416
+ tokens.append(all_doc_tokens[split_token_index])
417
+ segment_ids.append(seg_p)
418
+ paragraph_mask.append(1)
419
+ tokens.append(tokenizer.sp_model.PieceToId("[SEP]"))
420
+ segment_ids.append(seg_p)
421
+ paragraph_mask.append(0)
422
+ return len(tokens)
423
+
424
+ def process_class(seg_class):
425
+ class_index = len(segment_ids)
426
+ tokens.append(tokenizer.sp_model.PieceToId("[CLS]"))
427
+ segment_ids.append(seg_class)
428
+ paragraph_mask.append(1)
429
+ return class_index
430
+
431
+ if xlnet_format:
432
+ seg_p, seg_q, seg_class, seg_pad = 0, 1, 2, 3
433
+ paragraph_len = process_paragraph(seg_p)
434
+ process_query(seg_q)
435
+ class_index = process_class(seg_class)
436
+ else:
437
+ seg_p, seg_q, seg_class, seg_pad = 1, 0, 0, 0
438
+ class_index = process_class(seg_class)
439
+ process_query(seg_q)
440
+ paragraph_len = process_paragraph(seg_p)
441
+
442
+ input_ids = tokens
443
+
444
+ # The mask has 1 for real tokens and 0 for padding tokens. Only real
445
+ # tokens are attended to.
446
+ input_mask = [1] * len(input_ids)
447
+
448
+ # Zero-pad up to the sequence length.
449
+ while len(input_ids) < max_seq_length:
450
+ input_ids.append(0)
451
+ input_mask.append(0)
452
+ segment_ids.append(seg_pad)
453
+ paragraph_mask.append(0)
454
+
455
+ assert len(input_ids) == max_seq_length
456
+ assert len(input_mask) == max_seq_length
457
+ assert len(segment_ids) == max_seq_length
458
+ assert len(paragraph_mask) == max_seq_length
459
+
460
+ span_is_impossible = example.is_impossible
461
+ start_position = None
462
+ end_position = None
463
+ if is_training and not span_is_impossible:
464
+ # For training, if our document chunk does not contain an annotation
465
+ # we throw it out, since there is nothing to predict.
466
+ doc_start = doc_span.start
467
+ doc_end = doc_span.start + doc_span.length - 1
468
+ out_of_span = False
469
+ if not (tok_start_position >= doc_start and
470
+ tok_end_position <= doc_end):
471
+ out_of_span = True
472
+ if out_of_span:
473
+ # continue
474
+ start_position = 0
475
+ end_position = 0
476
+ span_is_impossible = True
477
+ else:
478
+ doc_offset = 0 if xlnet_format else len(query_tokens) + 2
479
+ start_position = tok_start_position - doc_start + doc_offset
480
+ end_position = tok_end_position - doc_start + doc_offset
481
+
482
+ if is_training and span_is_impossible:
483
+ start_position = class_index
484
+ end_position = class_index
485
+
486
+ if example_index < 20:
487
+ logging.info("*** Example ***")
488
+ logging.info("unique_id: %s", (unique_id))
489
+ logging.info("example_index: %s", (example_index))
490
+ logging.info("doc_span_index: %s", (doc_span_index))
491
+ logging.info("tok_start_to_orig_index: %s",
492
+ " ".join([str(x) for x in cur_tok_start_to_orig_index]))
493
+ logging.info("tok_end_to_orig_index: %s",
494
+ " ".join([str(x) for x in cur_tok_end_to_orig_index]))
495
+ logging.info(
496
+ "token_is_max_context: %s", " ".join(
497
+ ["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()]))
498
+ logging.info(
499
+ "input_pieces: %s",
500
+ " ".join([tokenizer.sp_model.IdToPiece(x) for x in tokens]))
501
+ logging.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
502
+ logging.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
503
+ logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
504
+ logging.info("paragraph_mask: %s", " ".join(
505
+ [str(x) for x in paragraph_mask]))
506
+ logging.info("class_index: %d", class_index)
507
+
508
+ if is_training and span_is_impossible:
509
+ logging.info("impossible example span")
510
+
511
+ if is_training and not span_is_impossible:
512
+ pieces = [
513
+ tokenizer.sp_model.IdToPiece(token)
514
+ for token in tokens[start_position:(end_position + 1)]
515
+ ]
516
+ answer_text = tokenizer.sp_model.DecodePieces(pieces)
517
+ logging.info("start_position: %d", (start_position))
518
+ logging.info("end_position: %d", (end_position))
519
+ logging.info("answer: %s", (tokenization.printable_text(answer_text)))
520
+
521
+ # With multi processing, the example_index is actually the index
522
+ # within the current process therefore we use example_index=None
523
+ # to avoid being used in the future.
524
+ # The current code does not use example_index of training data.
525
+ if is_training:
526
+ feat_example_index = None
527
+ else:
528
+ feat_example_index = example_index
529
+
530
+ feature = InputFeatures(
531
+ unique_id=unique_id,
532
+ example_index=feat_example_index,
533
+ doc_span_index=doc_span_index,
534
+ tok_start_to_orig_index=cur_tok_start_to_orig_index,
535
+ tok_end_to_orig_index=cur_tok_end_to_orig_index,
536
+ token_is_max_context=token_is_max_context,
537
+ tokens=[tokenizer.sp_model.IdToPiece(x) for x in tokens],
538
+ input_ids=input_ids,
539
+ input_mask=input_mask,
540
+ paragraph_mask=paragraph_mask,
541
+ segment_ids=segment_ids,
542
+ paragraph_len=paragraph_len,
543
+ class_index=class_index,
544
+ start_position=start_position,
545
+ end_position=end_position,
546
+ is_impossible=span_is_impossible)
547
+
548
+ # Run callback
549
+ if is_training:
550
+ output_fn(feature)
551
+ else:
552
+ output_fn(feature, is_padding=False)
553
+
554
+ unique_id += 1
555
+ if span_is_impossible:
556
+ cnt_neg += 1
557
+ else:
558
+ cnt_pos += 1
559
+
560
+ if not is_training and feature:
561
+ assert batch_size
562
+ num_padding = 0
563
+ num_examples = unique_id - base_id
564
+ if unique_id % batch_size != 0:
565
+ num_padding = batch_size - (num_examples % batch_size)
566
+ dummy_feature = copy.deepcopy(feature)
567
+ for _ in range(num_padding):
568
+ dummy_feature.unique_id = unique_id
569
+
570
+ # Run callback
571
+ output_fn(feature, is_padding=True)
572
+ unique_id += 1
573
+
574
+ logging.info("Total number of instances: %d = pos %d neg %d",
575
+ cnt_pos + cnt_neg, cnt_pos, cnt_neg)
576
+ return unique_id - base_id
577
+
578
+
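The piece-to-character bookkeeping at the top of convert_examples_to_features is what later lets a predicted token span be projected back onto the raw paragraph text; a hand-made example with three sentencepiece pieces ("▁" marks a word boundary, matching the marker the sentencepiece model uses):

SPIECE_UNDERLINE = "▁"

para_tokens = ["▁the", "▁cat", "▁sat"]
chartok_to_tok_index = []
tok_start_to_chartok_index = []
tok_end_to_chartok_index = []
char_cnt = 0
for i, token in enumerate(para_tokens):
  new_token = token.replace(SPIECE_UNDERLINE, " ")
  chartok_to_tok_index.extend([i] * len(new_token))
  tok_start_to_chartok_index.append(char_cnt)
  char_cnt += len(new_token)
  tok_end_to_chartok_index.append(char_cnt - 1)

print("".join(para_tokens).replace(SPIECE_UNDERLINE, " "))  # -> " the cat sat"
print(tok_start_to_chartok_index)                           # -> [0, 4, 8]
print(tok_end_to_chartok_index)                             # -> [3, 7, 11]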
579
+ def _check_is_max_context(doc_spans, cur_span_index, position):
580
+ """Check if this is the 'max context' doc span for the token."""
581
+
582
+ # Because of the sliding window approach taken to scoring documents, a single
583
+ # token can appear in multiple documents. E.g.
584
+ # Doc: the man went to the store and bought a gallon of milk
585
+ # Span A: the man went to the
586
+ # Span B: to the store and bought
587
+ # Span C: and bought a gallon of
588
+ # ...
589
+ #
590
+ # Now the word 'bought' will have two scores from spans B and C. We only
591
+ # want to consider the score with "maximum context", which we define as
592
+ # the *minimum* of its left and right context (the *sum* of left and
593
+ # right context will always be the same, of course).
594
+ #
595
+ # In the example the maximum context for 'bought' would be span C since
596
+ # it has 1 left context and 3 right context, while span B has 4 left context
597
+ # and 0 right context.
598
+ best_score = None
599
+ best_span_index = None
600
+ for (span_index, doc_span) in enumerate(doc_spans):
601
+ end = doc_span.start + doc_span.length - 1
602
+ if position < doc_span.start:
603
+ continue
604
+ if position > end:
605
+ continue
606
+ num_left_context = position - doc_span.start
607
+ num_right_context = end - position
608
+ score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
609
+ if best_score is None or score > best_score:
610
+ best_score = score
611
+ best_span_index = span_index
612
+
613
+ return cur_span_index == best_span_index
614
+
615
+
616
+ def write_predictions(all_examples,
617
+ all_features,
618
+ all_results,
619
+ n_best_size,
620
+ max_answer_length,
621
+ do_lower_case,
622
+ output_prediction_file,
623
+ output_nbest_file,
624
+ output_null_log_odds_file,
625
+ version_2_with_negative=False,
626
+ null_score_diff_threshold=0.0,
627
+ verbose=False):
628
+ """Write final predictions to the json file and log-odds of null if needed."""
629
+ logging.info("Writing predictions to: %s", (output_prediction_file))
630
+ logging.info("Writing nbest to: %s", (output_nbest_file))
631
+
632
+ all_predictions, all_nbest_json, scores_diff_json = (
633
+ postprocess_output(
634
+ all_examples=all_examples,
635
+ all_features=all_features,
636
+ all_results=all_results,
637
+ n_best_size=n_best_size,
638
+ max_answer_length=max_answer_length,
639
+ do_lower_case=do_lower_case,
640
+ version_2_with_negative=version_2_with_negative,
641
+ null_score_diff_threshold=null_score_diff_threshold,
642
+ verbose=verbose))
643
+
644
+ write_to_json_files(all_predictions, output_prediction_file)
645
+ write_to_json_files(all_nbest_json, output_nbest_file)
646
+ if version_2_with_negative:
647
+ write_to_json_files(scores_diff_json, output_null_log_odds_file)
648
+
649
+
650
+ def postprocess_output(all_examples,
651
+ all_features,
652
+ all_results,
653
+ n_best_size,
654
+ max_answer_length,
655
+ do_lower_case,
656
+ version_2_with_negative=False,
657
+ null_score_diff_threshold=0.0,
658
+ xlnet_format=False,
659
+ verbose=False):
660
+ """Postprocess model output, to form predicton results."""
661
+
662
+ del do_lower_case, verbose
663
+ example_index_to_features = collections.defaultdict(list)
664
+ for feature in all_features:
665
+ example_index_to_features[feature.example_index].append(feature)
666
+
667
+ unique_id_to_result = {}
668
+ for result in all_results:
669
+ unique_id_to_result[result.unique_id] = result
670
+
671
+ _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
672
+ "PrelimPrediction",
673
+ ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])
674
+
675
+ all_predictions = collections.OrderedDict()
676
+ all_nbest_json = collections.OrderedDict()
677
+ scores_diff_json = collections.OrderedDict()
678
+
679
+ for (example_index, example) in enumerate(all_examples):
680
+ features = example_index_to_features[example_index]
681
+
682
+ prelim_predictions = []
683
+ # keep track of the minimum score of null start+end of position 0
684
+ score_null = 1000000 # large and positive
685
+ min_null_feature_index = 0 # the paragraph slice with min null score
686
+ null_start_logit = 0 # the start logit at the slice with min null score
687
+ null_end_logit = 0 # the end logit at the slice with min null score
688
+ for (feature_index, feature) in enumerate(features):
689
+ if feature.unique_id not in unique_id_to_result:
690
+ logging.info("Skip eval example %s, not in pred.", feature.unique_id)
691
+ continue
692
+ result = unique_id_to_result[feature.unique_id]
693
+
694
+ # if we could have irrelevant answers, get the min score of irrelevant
695
+ if version_2_with_negative:
696
+ if xlnet_format:
697
+ feature_null_score = result.class_logits
698
+ else:
699
+ feature_null_score = result.start_logits[0] + result.end_logits[0]
700
+ if feature_null_score < score_null:
701
+ score_null = feature_null_score
702
+ min_null_feature_index = feature_index
703
+ null_start_logit = result.start_logits[0]
704
+ null_end_logit = result.end_logits[0]
705
+
706
+ doc_offset = 0 if xlnet_format else feature.tokens.index("[SEP]") + 1
707
+
708
+ for (start_index, start_logit,
709
+ end_index, end_logit) in _get_best_indexes_and_logits(
710
+ result=result,
711
+ n_best_size=n_best_size,
712
+ xlnet_format=xlnet_format):
713
+ # We could hypothetically create invalid predictions, e.g., predict
714
+ # that the start of the span is in the question. We throw out all
715
+ # invalid predictions.
716
+ if start_index - doc_offset >= len(feature.tok_start_to_orig_index):
717
+ continue
718
+ if end_index - doc_offset >= len(feature.tok_end_to_orig_index):
719
+ continue
720
+ if not feature.token_is_max_context.get(start_index, False):
721
+ continue
722
+ if end_index < start_index:
723
+ continue
724
+ length = end_index - start_index + 1
725
+ if length > max_answer_length:
726
+ continue
727
+ prelim_predictions.append(
728
+ _PrelimPrediction(
729
+ feature_index=feature_index,
730
+ start_index=start_index - doc_offset,
731
+ end_index=end_index - doc_offset,
732
+ start_logit=start_logit,
733
+ end_logit=end_logit))
734
+
735
+ if version_2_with_negative and not xlnet_format:
736
+ prelim_predictions.append(
737
+ _PrelimPrediction(
738
+ feature_index=min_null_feature_index,
739
+ start_index=-1,
740
+ end_index=-1,
741
+ start_logit=null_start_logit,
742
+ end_logit=null_end_logit))
743
+ prelim_predictions = sorted(
744
+ prelim_predictions,
745
+ key=lambda x: (x.start_logit + x.end_logit),
746
+ reverse=True)
747
+
748
+ _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
749
+ "NbestPrediction", ["text", "start_logit", "end_logit"])
750
+
751
+ seen_predictions = {}
752
+ nbest = []
753
+ for pred in prelim_predictions:
754
+ if len(nbest) >= n_best_size:
755
+ break
756
+ feature = features[pred.feature_index]
757
+ if pred.start_index >= 0 or xlnet_format: # this is a non-null prediction
758
+ tok_start_to_orig_index = feature.tok_start_to_orig_index
759
+ tok_end_to_orig_index = feature.tok_end_to_orig_index
760
+ start_orig_pos = tok_start_to_orig_index[pred.start_index]
761
+ end_orig_pos = tok_end_to_orig_index[pred.end_index]
762
+
763
+ paragraph_text = example.paragraph_text
764
+ final_text = paragraph_text[start_orig_pos:end_orig_pos + 1].strip()
765
+ if final_text in seen_predictions:
766
+ continue
767
+
768
+ seen_predictions[final_text] = True
769
+ else:
770
+ final_text = ""
771
+ seen_predictions[final_text] = True
772
+
773
+ nbest.append(
774
+ _NbestPrediction(
775
+ text=final_text,
776
+ start_logit=pred.start_logit,
777
+ end_logit=pred.end_logit))
778
+
779
+ # if we didn't include the empty option in the n-best, include it
780
+ if version_2_with_negative and not xlnet_format:
781
+ if "" not in seen_predictions:
782
+ nbest.append(
783
+ _NbestPrediction(
784
+ text="", start_logit=null_start_logit,
785
+ end_logit=null_end_logit))
786
+ # In very rare edge cases we could have no valid predictions. So we
787
+ # just create a nonce prediction in this case to avoid failure.
788
+ if not nbest:
789
+ nbest.append(
790
+ _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
791
+
792
+ assert len(nbest) >= 1
793
+
794
+ total_scores = []
795
+ best_non_null_entry = None
796
+ for entry in nbest:
797
+ total_scores.append(entry.start_logit + entry.end_logit)
798
+ if not best_non_null_entry:
799
+ if entry.text:
800
+ best_non_null_entry = entry
801
+
802
+ probs = _compute_softmax(total_scores)
803
+
804
+ nbest_json = []
805
+ for (i, entry) in enumerate(nbest):
806
+ output = collections.OrderedDict()
807
+ output["text"] = entry.text
808
+ output["probability"] = probs[i]
809
+ output["start_logit"] = entry.start_logit
810
+ output["end_logit"] = entry.end_logit
811
+ nbest_json.append(output)
812
+
813
+ assert len(nbest_json) >= 1
814
+
815
+ if not version_2_with_negative:
816
+ all_predictions[example.qas_id] = nbest_json[0]["text"]
817
+ else:
818
+ assert best_non_null_entry is not None
819
+ if xlnet_format:
820
+ score_diff = score_null
821
+ scores_diff_json[example.qas_id] = score_diff
822
+ all_predictions[example.qas_id] = best_non_null_entry.text
823
+ else:
824
+ # predict "" iff the null score - the score of best non-null > threshold
825
+ score_diff = score_null - best_non_null_entry.start_logit - (
826
+ best_non_null_entry.end_logit)
827
+ scores_diff_json[example.qas_id] = score_diff
828
+ if score_diff > null_score_diff_threshold:
829
+ all_predictions[example.qas_id] = ""
830
+ else:
831
+ all_predictions[example.qas_id] = best_non_null_entry.text
832
+
833
+ all_nbest_json[example.qas_id] = nbest_json
834
+
835
+ return all_predictions, all_nbest_json, scores_diff_json
836
+
837
+
838
+ def write_to_json_files(json_records, json_file):
839
+ with tf.io.gfile.GFile(json_file, "w") as writer:
840
+ writer.write(json.dumps(json_records, indent=4) + "\n")
841
+
842
+
843
+ def _get_best_indexes_and_logits(result,
844
+ n_best_size,
845
+ xlnet_format=False):
846
+ """Generates the n-best indexes and logits from a list."""
847
+ if xlnet_format:
848
+ for i in range(n_best_size):
849
+ for j in range(n_best_size):
850
+ j_index = i * n_best_size + j
851
+ yield (result.start_indexes[i], result.start_logits[i],
852
+ result.end_indexes[j_index], result.end_logits[j_index])
853
+ else:
854
+ start_index_and_score = sorted(enumerate(result.start_logits),
855
+ key=lambda x: x[1], reverse=True)
856
+ end_index_and_score = sorted(enumerate(result.end_logits),
857
+ key=lambda x: x[1], reverse=True)
858
+ for i in range(len(start_index_and_score)):
859
+ if i >= n_best_size:
860
+ break
861
+ for j in range(len(end_index_and_score)):
862
+ if j >= n_best_size:
863
+ break
864
+ yield (start_index_and_score[i][0], start_index_and_score[i][1],
865
+ end_index_and_score[j][0], end_index_and_score[j][1])
866
+
867
+
868
+ def _compute_softmax(scores):
869
+ """Compute softmax probability over raw logits."""
870
+ if not scores:
871
+ return []
872
+
873
+ max_score = None
874
+ for score in scores:
875
+ if max_score is None or score > max_score:
876
+ max_score = score
877
+
878
+ exp_scores = []
879
+ total_sum = 0.0
880
+ for score in scores:
881
+ x = math.exp(score - max_score)
882
+ exp_scores.append(x)
883
+ total_sum += x
884
+
885
+ probs = []
886
+ for score in exp_scores:
887
+ probs.append(score / total_sum)
888
+ return probs
889
+
890
+
891
+ class FeatureWriter(object):
892
+ """Writes InputFeature to TF example file."""
893
+
894
+ def __init__(self, filename, is_training):
895
+ self.filename = filename
896
+ self.is_training = is_training
897
+ self.num_features = 0
898
+ tf.io.gfile.makedirs(os.path.dirname(filename))
899
+ self._writer = tf.io.TFRecordWriter(filename)
900
+
901
+ def process_feature(self, feature):
902
+ """Write a InputFeature to the TFRecordWriter as a tf.train.Example."""
903
+ self.num_features += 1
904
+
905
+ def create_int_feature(values):
906
+ feature = tf.train.Feature(
907
+ int64_list=tf.train.Int64List(value=list(values)))
908
+ return feature
909
+
910
+ features = collections.OrderedDict()
911
+ features["unique_ids"] = create_int_feature([feature.unique_id])
912
+ features["input_ids"] = create_int_feature(feature.input_ids)
913
+ features["input_mask"] = create_int_feature(feature.input_mask)
914
+ features["segment_ids"] = create_int_feature(feature.segment_ids)
915
+ if feature.paragraph_mask is not None:
916
+ features["paragraph_mask"] = create_int_feature(feature.paragraph_mask)
917
+ if feature.class_index is not None:
918
+ features["class_index"] = create_int_feature([feature.class_index])
919
+
920
+ if self.is_training:
921
+ features["start_positions"] = create_int_feature([feature.start_position])
922
+ features["end_positions"] = create_int_feature([feature.end_position])
923
+ impossible = 0
924
+ if feature.is_impossible:
925
+ impossible = 1
926
+ features["is_impossible"] = create_int_feature([impossible])
927
+
928
+ tf_example = tf.train.Example(features=tf.train.Features(feature=features))
929
+ self._writer.write(tf_example.SerializeToString())
930
+
931
+ def close(self):
932
+ self._writer.close()
933
+
934
+
935
+ def generate_tf_record_from_json_file(input_file_path,
936
+ sp_model_file,
937
+ output_path,
938
+ translated_input_folder=None,
939
+ max_seq_length=384,
940
+ do_lower_case=True,
941
+ max_query_length=64,
942
+ doc_stride=128,
943
+ xlnet_format=False,
944
+ version_2_with_negative=False):
945
+ """Generates and saves training data into a tf record file."""
946
+ train_examples = read_squad_examples(
947
+ input_file=input_file_path,
948
+ is_training=True,
949
+ version_2_with_negative=version_2_with_negative,
950
+ translated_input_folder=translated_input_folder)
951
+ tokenizer = tokenization.FullSentencePieceTokenizer(
952
+ sp_model_file=sp_model_file)
953
+ train_writer = FeatureWriter(
954
+ filename=output_path, is_training=True)
955
+ number_of_examples = convert_examples_to_features(
956
+ examples=train_examples,
957
+ tokenizer=tokenizer,
958
+ max_seq_length=max_seq_length,
959
+ doc_stride=doc_stride,
960
+ max_query_length=max_query_length,
961
+ is_training=True,
962
+ output_fn=train_writer.process_feature,
963
+ xlnet_format=xlnet_format,
964
+ do_lower_case=do_lower_case)
965
+ train_writer.close()
966
+
967
+ meta_data = {
968
+ "task_type": "bert_squad",
969
+ "train_data_size": number_of_examples,
970
+ "max_seq_length": max_seq_length,
971
+ "max_query_length": max_query_length,
972
+ "doc_stride": doc_stride,
973
+ "version_2_with_negative": version_2_with_negative,
974
+ }
975
+
976
+ return meta_data
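
A minimal usage sketch of the SQuAD preprocessing entry point above. The import path (`official.nlp.data.squad_lib_sp`) and every file path below are assumptions for illustration only; they are not part of this upload.

```python
# Hypothetical usage sketch: build SQuAD v2 training TFRecords with the
# SentencePiece-based pipeline above. All paths and the module name are
# placeholders (assumptions), not values taken from this commit.
from official.nlp.data import squad_lib_sp  # assumed import path for this file

meta_data = squad_lib_sp.generate_tf_record_from_json_file(
    input_file_path="/tmp/squad/train-v2.0.json",   # hypothetical SQuAD v2 JSON
    sp_model_file="/tmp/albert/30k-clean.model",    # hypothetical SentencePiece model
    output_path="/tmp/squad/train.tf_record",
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    version_2_with_negative=True)
print(meta_data["train_data_size"])  # number of features written to the TFRecord
```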
tagging_data_lib.py ADDED
@@ -0,0 +1,426 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Library to process data for tagging task such as NER/POS."""
16
+ import collections
17
+ import os
18
+
19
+ from absl import logging
20
+ import tensorflow as tf, tf_keras
21
+
22
+ from official.nlp.data import classifier_data_lib
23
+ from official.nlp.tools import tokenization
24
+
25
+ # A negative label id for the padding label, which will not contribute
26
+ # to loss/metrics in training.
27
+ _PADDING_LABEL_ID = -1
28
+
29
+ # The special unknown token, used to substitute a word which has too many
30
+ # subwords after tokenization.
31
+ _UNK_TOKEN = "[UNK]"
32
+
33
+
34
+ class InputExample(object):
35
+ """A single training/test example for token classification."""
36
+
37
+ def __init__(self,
38
+ sentence_id,
39
+ sub_sentence_id=0,
40
+ words=None,
41
+ label_ids=None):
42
+ """Constructs an InputExample."""
43
+ self.sentence_id = sentence_id
44
+ self.sub_sentence_id = sub_sentence_id
45
+ self.words = words if words else []
46
+ self.label_ids = label_ids if label_ids else []
47
+
48
+ def add_word_and_label_id(self, word, label_id):
49
+ """Adds word and label_id pair in the example."""
50
+ self.words.append(word)
51
+ self.label_ids.append(label_id)
52
+
53
+
54
+ def _read_one_file(file_name, label_list):
55
+ """Reads one file and returns a list of `InputExample` instances."""
56
+ lines = tf.io.gfile.GFile(file_name, "r").readlines()
57
+ examples = []
58
+ label_id_map = {label: i for i, label in enumerate(label_list)}
59
+ sentence_id = 0
60
+ example = InputExample(sentence_id=0)
61
+ for line in lines:
62
+ line = line.strip("\n")
63
+ if line:
64
+ # The format is: <token>\t<label> for train/dev set and <token> for test.
65
+ items = line.split("\t")
66
+ assert len(items) == 2 or len(items) == 1
67
+ token = items[0].strip()
68
+
69
+ # Assign a dummy label_id for test set
70
+ label_id = label_id_map[items[1].strip()] if len(items) == 2 else 0
71
+ example.add_word_and_label_id(token, label_id)
72
+ else:
73
+ # Empty line indicates a new sentence.
74
+ if example.words:
75
+ examples.append(example)
76
+ sentence_id += 1
77
+ example = InputExample(sentence_id=sentence_id)
78
+
79
+ if example.words:
80
+ examples.append(example)
81
+ return examples
82
+
83
+
84
+ class PanxProcessor(classifier_data_lib.DataProcessor):
85
+ """Processor for the Panx data set."""
86
+ supported_languages = [
87
+ "ar", "he", "vi", "id", "jv", "ms", "tl", "eu", "ml", "ta", "te", "af",
88
+ "nl", "en", "de", "el", "bn", "hi", "mr", "ur", "fa", "fr", "it", "pt",
89
+ "es", "bg", "ru", "ja", "ka", "ko", "th", "sw", "yo", "my", "zh", "kk",
90
+ "tr", "et", "fi", "hu"
91
+ ]
92
+
93
+ def __init__(self,
94
+ process_text_fn=tokenization.convert_to_unicode,
95
+ only_use_en_train=True,
96
+ only_use_en_dev=True):
97
+ """See base class.
98
+
99
+ Args:
100
+ process_text_fn: See base class.
101
+ only_use_en_train: If True, only use english training data. Otherwise, use
102
+ training data from all languages.
103
+ only_use_en_dev: If True, only use english dev data. Otherwise, use dev
104
+ data from all languages.
105
+ """
106
+ super(PanxProcessor, self).__init__(process_text_fn)
107
+ self.only_use_en_train = only_use_en_train
108
+ self.only_use_en_dev = only_use_en_dev
109
+
110
+ def get_train_examples(self, data_dir):
111
+ examples = _read_one_file(
112
+ os.path.join(data_dir, "train-en.tsv"), self.get_labels())
113
+ if not self.only_use_en_train:
114
+ for language in self.supported_languages:
115
+ if language == "en":
116
+ continue
117
+ examples.extend(
118
+ _read_one_file(
119
+ os.path.join(data_dir, f"train-{language}.tsv"),
120
+ self.get_labels()))
121
+ return examples
122
+
123
+ def get_dev_examples(self, data_dir):
124
+ examples = _read_one_file(
125
+ os.path.join(data_dir, "dev-en.tsv"), self.get_labels())
126
+ if not self.only_use_en_dev:
127
+ for language in self.supported_languages:
128
+ if language == "en":
129
+ continue
130
+ examples.extend(
131
+ _read_one_file(
132
+ os.path.join(data_dir, f"dev-{language}.tsv"),
133
+ self.get_labels()))
134
+ return examples
135
+
136
+ def get_test_examples(self, data_dir):
137
+ examples_dict = {}
138
+ for language in self.supported_languages:
139
+ examples_dict[language] = _read_one_file(
140
+ os.path.join(data_dir, "test-%s.tsv" % language), self.get_labels())
141
+ return examples_dict
142
+
143
+ def get_labels(self):
144
+ return ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]
145
+
146
+ @staticmethod
147
+ def get_processor_name():
148
+ return "panx"
149
+
150
+
151
+ class UdposProcessor(classifier_data_lib.DataProcessor):
152
+ """Processor for the Udpos data set."""
153
+ supported_languages = [
154
+ "af", "ar", "bg", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr",
155
+ "he", "hi", "hu", "id", "it", "ja", "kk", "ko", "mr", "nl", "pt", "ru",
156
+ "ta", "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"
157
+ ]
158
+
159
+ def __init__(self,
160
+ process_text_fn=tokenization.convert_to_unicode,
161
+ only_use_en_train=True,
162
+ only_use_en_dev=True):
163
+ """See base class.
164
+
165
+ Args:
166
+ process_text_fn: See base class.
167
+ only_use_en_train: If True, only use english training data. Otherwise, use
168
+ training data from all languages.
169
+ only_use_en_dev: If True, only use english dev data. Otherwise, use dev
170
+ data from all languages.
171
+ """
172
+ super(UdposProcessor, self).__init__(process_text_fn)
173
+ self.only_use_en_train = only_use_en_train
174
+ self.only_use_en_dev = only_use_en_dev
175
+
176
+ def get_train_examples(self, data_dir):
177
+ if self.only_use_en_train:
178
+ examples = _read_one_file(
179
+ os.path.join(data_dir, "train-en.tsv"), self.get_labels())
180
+ else:
181
+ examples = []
182
+ # Uses glob because some languages are missing in train.
183
+ for filepath in tf.io.gfile.glob(os.path.join(data_dir, "train-*.tsv")):
184
+ examples.extend(
185
+ _read_one_file(
186
+ filepath,
187
+ self.get_labels()))
188
+ return examples
189
+
190
+ def get_dev_examples(self, data_dir):
191
+ if self.only_use_en_dev:
192
+ examples = _read_one_file(
193
+ os.path.join(data_dir, "dev-en.tsv"), self.get_labels())
194
+ else:
195
+ examples = []
196
+ for filepath in tf.io.gfile.glob(os.path.join(data_dir, "dev-*.tsv")):
197
+ examples.extend(
198
+ _read_one_file(
199
+ filepath,
200
+ self.get_labels()))
201
+ return examples
202
+
203
+ def get_test_examples(self, data_dir):
204
+ examples_dict = {}
205
+ for language in self.supported_languages:
206
+ examples_dict[language] = _read_one_file(
207
+ os.path.join(data_dir, "test-%s.tsv" % language), self.get_labels())
208
+ return examples_dict
209
+
210
+ def get_labels(self):
211
+ return [
212
+ "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM",
213
+ "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"
214
+ ]
215
+
216
+ @staticmethod
217
+ def get_processor_name():
218
+ return "udpos"
219
+
220
+
221
+ def _tokenize_example(example, max_length, tokenizer, text_preprocessing=None):
222
+ """Tokenizes words and breaks long example into short ones."""
223
+ # Needs additional [CLS] and [SEP] tokens.
224
+ max_length = max_length - 2
225
+ new_examples = []
226
+ new_example = InputExample(sentence_id=example.sentence_id, sub_sentence_id=0)
227
+ if any([x < 0 for x in example.label_ids]):
228
+ raise ValueError("Unexpected negative label_id: %s" % example.label_ids)
229
+
230
+ for i, word in enumerate(example.words):
231
+ if text_preprocessing:
232
+ word = text_preprocessing(word)
233
+ subwords = tokenizer.tokenize(word)
234
+ if (not subwords or len(subwords) > max_length) and word:
235
+ subwords = [_UNK_TOKEN]
236
+
237
+ if len(subwords) + len(new_example.words) > max_length:
238
+ # Start a new example.
239
+ new_examples.append(new_example)
240
+ last_sub_sentence_id = new_example.sub_sentence_id
241
+ new_example = InputExample(
242
+ sentence_id=example.sentence_id,
243
+ sub_sentence_id=last_sub_sentence_id + 1)
244
+
245
+ for j, subword in enumerate(subwords):
246
+ # Use the real label for the first subword, and pad label for
247
+ # the remainings.
248
+ subword_label = example.label_ids[i] if j == 0 else _PADDING_LABEL_ID
249
+ new_example.add_word_and_label_id(subword, subword_label)
250
+
251
+ if new_example.words:
252
+ new_examples.append(new_example)
253
+
254
+ return new_examples
255
+
256
+
257
+ def _convert_single_example(example, max_seq_length, tokenizer):
258
+ """Converts an `InputExample` instance to a `tf.train.Example` instance."""
259
+ tokens = ["[CLS]"]
260
+ tokens.extend(example.words)
261
+ tokens.append("[SEP]")
262
+ input_ids = tokenizer.convert_tokens_to_ids(tokens)
263
+ label_ids = [_PADDING_LABEL_ID]
264
+ label_ids.extend(example.label_ids)
265
+ label_ids.append(_PADDING_LABEL_ID)
266
+
267
+ segment_ids = [0] * len(input_ids)
268
+ input_mask = [1] * len(input_ids)
269
+
270
+ # Pad up to the sequence length.
271
+ while len(input_ids) < max_seq_length:
272
+ input_ids.append(0)
273
+ input_mask.append(0)
274
+ segment_ids.append(0)
275
+ label_ids.append(_PADDING_LABEL_ID)
276
+
277
+ def create_int_feature(values):
278
+ return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
279
+
280
+ features = collections.OrderedDict()
281
+ features["input_ids"] = create_int_feature(input_ids)
282
+ features["input_mask"] = create_int_feature(input_mask)
283
+ features["segment_ids"] = create_int_feature(segment_ids)
284
+ features["label_ids"] = create_int_feature(label_ids)
285
+ features["sentence_id"] = create_int_feature([example.sentence_id])
286
+ features["sub_sentence_id"] = create_int_feature([example.sub_sentence_id])
287
+
288
+ tf_example = tf.train.Example(features=tf.train.Features(feature=features))
289
+ return tf_example
290
+
291
+
292
+ def write_example_to_file(examples,
293
+ tokenizer,
294
+ max_seq_length,
295
+ output_file,
296
+ text_preprocessing=None):
297
+ """Writes `InputExample`s into a tfrecord file with `tf.train.Example` protos.
298
+
299
+ Note that the words inside each example will be tokenized and be applied by
300
+ `text_preprocessing` if available. Also, if the length of sentence (plus
301
+ special [CLS] and [SEP] tokens) exceeds `max_seq_length`, the long sentence
302
+ will be broken into multiple short examples. For example:
303
+
304
+ Example (text_preprocessing=lowercase, max_seq_length=5)
305
+ words: ["What", "a", "great", "weekend"]
306
+ labels: [ 7, 5, 9, 10]
307
+ sentence_id: 0
308
+ preprocessed: ["what", "a", "great", "weekend"]
309
+ tokenized: ["what", "a", "great", "week", "##end"]
310
+
311
+ will result in two tf.example protos:
312
+
313
+ tokens: ["[CLS]", "what", "a", "great", "[SEP]"]
314
+ label_ids: [-1, 7, 5, 9, -1]
315
+ input_mask: [ 1, 1, 1, 1, 1]
316
+ segment_ids: [ 0, 0, 0, 0, 0]
317
+ input_ids: [ tokenizer.convert_tokens_to_ids(tokens) ]
318
+ sentence_id: 0
319
+
320
+ tokens: ["[CLS]", "week", "##end", "[SEP]", "[PAD]"]
321
+ label_ids: [-1, 10, -1, -1, -1]
322
+ input_mask: [ 1, 1, 1, 0, 0]
323
+ segment_ids: [ 0, 0, 0, 0, 0]
324
+ input_ids: [ tokenizer.convert_tokens_to_ids(tokens) ]
325
+ sentence_id: 0
326
+
327
+ Note the use of -1 in `label_ids` to indicate that a token should not be
328
+ considered for classification (e.g., trailing ## wordpieces or special
329
+ token). Token classification models should accordingly ignore these when
330
+ calculating loss, metrics, etc...
331
+
332
+ Args:
333
+ examples: A list of `InputExample` instances.
334
+ tokenizer: The tokenizer to be applied on the data.
335
+ max_seq_length: Maximum length of generated sequences.
336
+ output_file: The name of the output tfrecord file.
337
+ text_preprocessing: optional preprocessing run on each word prior to
338
+ tokenization.
339
+
340
+ Returns:
341
+ The total number of tf.train.Example proto written to file.
342
+ """
343
+ tf.io.gfile.makedirs(os.path.dirname(output_file))
344
+ writer = tf.io.TFRecordWriter(output_file)
345
+ num_tokenized_examples = 0
346
+ for (ex_index, example) in enumerate(examples):
347
+ if ex_index % 10000 == 0:
348
+ logging.info("Writing example %d of %d to %s", ex_index, len(examples),
349
+ output_file)
350
+
351
+ tokenized_examples = _tokenize_example(example, max_seq_length, tokenizer,
352
+ text_preprocessing)
353
+ num_tokenized_examples += len(tokenized_examples)
354
+ for per_tokenized_example in tokenized_examples:
355
+ tf_example = _convert_single_example(per_tokenized_example,
356
+ max_seq_length, tokenizer)
357
+ writer.write(tf_example.SerializeToString())
358
+
359
+ writer.close()
360
+ return num_tokenized_examples
361
+
362
+
363
+ def token_classification_meta_data(train_data_size,
364
+ max_seq_length,
365
+ num_labels,
366
+ eval_data_size=None,
367
+ test_data_size=None,
368
+ label_list=None,
369
+ processor_type=None):
370
+ """Creates metadata for tagging (token classification) datasets."""
371
+ meta_data = {
372
+ "train_data_size": train_data_size,
373
+ "max_seq_length": max_seq_length,
374
+ "num_labels": num_labels,
375
+ "task_type": "tagging",
376
+ "label_type": "int",
377
+ "label_shape": [max_seq_length],
378
+ }
379
+ if eval_data_size:
380
+ meta_data["eval_data_size"] = eval_data_size
381
+ if test_data_size:
382
+ meta_data["test_data_size"] = test_data_size
383
+ if label_list:
384
+ meta_data["label_list"] = label_list
385
+ if processor_type:
386
+ meta_data["processor_type"] = processor_type
387
+
388
+ return meta_data
389
+
390
+
391
+ def generate_tf_record_from_data_file(processor, data_dir, tokenizer,
392
+ max_seq_length, train_data_output_path,
393
+ eval_data_output_path,
394
+ test_data_output_path,
395
+ text_preprocessing):
396
+ """Generates tfrecord files from the raw data."""
397
+ common_kwargs = dict(
398
+ tokenizer=tokenizer,
399
+ max_seq_length=max_seq_length,
400
+ text_preprocessing=text_preprocessing)
401
+ train_examples = processor.get_train_examples(data_dir)
402
+ train_data_size = write_example_to_file(
403
+ train_examples, output_file=train_data_output_path, **common_kwargs)
404
+
405
+ eval_examples = processor.get_dev_examples(data_dir)
406
+ eval_data_size = write_example_to_file(
407
+ eval_examples, output_file=eval_data_output_path, **common_kwargs)
408
+
409
+ test_input_data_examples = processor.get_test_examples(data_dir)
410
+ test_data_size = {}
411
+ for language, examples in test_input_data_examples.items():
412
+ test_data_size[language] = write_example_to_file(
413
+ examples,
414
+ output_file=test_data_output_path.format(language),
415
+ **common_kwargs)
416
+
417
+ labels = processor.get_labels()
418
+ meta_data = token_classification_meta_data(
419
+ train_data_size,
420
+ max_seq_length,
421
+ len(labels),
422
+ eval_data_size,
423
+ test_data_size,
424
+ label_list=labels,
425
+ processor_type=processor.get_processor_name())
426
+ return meta_data
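
A short sketch of how the processors, tokenizer, and writer above fit together for the PANX task. The data directory, vocab file, and output paths are hypothetical placeholders.

```python
# Hypothetical usage sketch -- all paths below are placeholders.
from official.nlp.data import tagging_data_lib
from official.nlp.tools import tokenization

processor = tagging_data_lib.PanxProcessor(only_use_en_train=True)
tokenizer = tokenization.FullTokenizer(
    vocab_file="/tmp/bert/vocab.txt", do_lower_case=True)
meta_data = tagging_data_lib.generate_tf_record_from_data_file(
    processor,
    data_dir="/tmp/panx",
    tokenizer=tokenizer,
    max_seq_length=128,
    train_data_output_path="/tmp/panx/train.tfrecord",
    eval_data_output_path="/tmp/panx/eval.tfrecord",
    test_data_output_path="/tmp/panx/test_{}.tfrecord",  # formatted per language
    text_preprocessing=tokenization.convert_to_unicode)
# meta_data["num_labels"] is 7, matching the PANX NER label set above.
```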
tagging_data_lib_test.py ADDED
@@ -0,0 +1,108 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for official.nlp.data.tagging_data_lib."""
16
+ import os
17
+ import random
18
+
19
+ from absl.testing import parameterized
20
+ import tensorflow as tf, tf_keras
21
+
22
+ from official.nlp.data import tagging_data_lib
23
+ from official.nlp.tools import tokenization
24
+
25
+
26
+ def _create_fake_file(filename, labels, is_test):
27
+
28
+ def write_one_sentence(writer, length):
29
+ for _ in range(length):
30
+ line = "hiworld"
31
+ if not is_test:
32
+ line += "\t%s" % (labels[random.randint(0, len(labels) - 1)])
33
+ writer.write(line + "\n")
34
+
35
+ # Writes two sentences with lengths of 3 and 12 words, respectively.
36
+ with tf.io.gfile.GFile(filename, "w") as writer:
37
+ write_one_sentence(writer, 3)
38
+ writer.write("\n")
39
+ write_one_sentence(writer, 12)
40
+
41
+
42
+ class TaggingDataLibTest(tf.test.TestCase, parameterized.TestCase):
43
+
44
+ def setUp(self):
45
+ super(TaggingDataLibTest, self).setUp()
46
+
47
+ self.processors = {
48
+ "panx": tagging_data_lib.PanxProcessor,
49
+ "udpos": tagging_data_lib.UdposProcessor,
50
+ }
51
+ self.vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
52
+ with tf.io.gfile.GFile(self.vocab_file, "w") as writer:
53
+ writer.write("\n".join(["[CLS]", "[SEP]", "hi", "##world", "[UNK]"]))
54
+
55
+ @parameterized.parameters(
56
+ {"task_type": "panx"},
57
+ {"task_type": "udpos"},
58
+ )
59
+ def test_generate_tf_record(self, task_type):
60
+ processor = self.processors[task_type]()
61
+ input_data_dir = os.path.join(self.get_temp_dir(), task_type)
62
+ tf.io.gfile.mkdir(input_data_dir)
63
+ # Write fake train file.
64
+ _create_fake_file(
65
+ os.path.join(input_data_dir, "train-en.tsv"),
66
+ processor.get_labels(),
67
+ is_test=False)
68
+
69
+ # Write fake dev file.
70
+ _create_fake_file(
71
+ os.path.join(input_data_dir, "dev-en.tsv"),
72
+ processor.get_labels(),
73
+ is_test=False)
74
+
75
+ # Write fake test files.
76
+ for lang in processor.supported_languages:
77
+ _create_fake_file(
78
+ os.path.join(input_data_dir, "test-%s.tsv" % lang),
79
+ processor.get_labels(),
80
+ is_test=True)
81
+
82
+ output_path = os.path.join(self.get_temp_dir(), task_type, "output")
83
+ tokenizer = tokenization.FullTokenizer(
84
+ vocab_file=self.vocab_file, do_lower_case=True)
85
+ metadata = tagging_data_lib.generate_tf_record_from_data_file(
86
+ processor,
87
+ input_data_dir,
88
+ tokenizer,
89
+ max_seq_length=8,
90
+ train_data_output_path=os.path.join(output_path, "train.tfrecord"),
91
+ eval_data_output_path=os.path.join(output_path, "eval.tfrecord"),
92
+ test_data_output_path=os.path.join(output_path, "test_{}.tfrecord"),
93
+ text_preprocessing=tokenization.convert_to_unicode)
94
+
95
+ self.assertEqual(metadata["train_data_size"], 5)
96
+ files = tf.io.gfile.glob(output_path + "/*")
97
+ expected_files = []
98
+ expected_files.append(os.path.join(output_path, "train.tfrecord"))
99
+ expected_files.append(os.path.join(output_path, "eval.tfrecord"))
100
+ for lang in processor.supported_languages:
101
+ expected_files.append(
102
+ os.path.join(output_path, "test_%s.tfrecord" % lang))
103
+
104
+ self.assertCountEqual(files, expected_files)
105
+
106
+
107
+ if __name__ == "__main__":
108
+ tf.test.main()
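
A note on the expected `train_data_size` of 5 in the test above: each fake word "hiworld" tokenizes into two subwords ("hi", "##world"), and with `max_seq_length=8` only three words fit per sub-example once [CLS] and [SEP] are reserved, so the 3-word sentence yields one example and the 12-word sentence is split into four.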
tagging_dataloader.py ADDED
@@ -0,0 +1,90 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Loads dataset for the tagging (e.g., NER/POS) task."""
16
+ import dataclasses
17
+ from typing import Mapping, Optional
18
+
19
+ import tensorflow as tf, tf_keras
20
+ from official.common import dataset_fn
21
+ from official.core import config_definitions as cfg
22
+ from official.core import input_reader
23
+ from official.nlp.data import data_loader
24
+ from official.nlp.data import data_loader_factory
25
+
26
+
27
+ @dataclasses.dataclass
28
+ class TaggingDataConfig(cfg.DataConfig):
29
+ """Data config for tagging (tasks/tagging)."""
30
+ is_training: bool = True
31
+ seq_length: int = 128
32
+ include_sentence_id: bool = False
33
+ file_type: str = 'tfrecord'
34
+
35
+
36
+ @data_loader_factory.register_data_loader_cls(TaggingDataConfig)
37
+ class TaggingDataLoader(data_loader.DataLoader):
38
+ """A class to load dataset for tagging (e.g., NER and POS) task."""
39
+
40
+ def __init__(self, params: TaggingDataConfig):
41
+ self._params = params
42
+ self._seq_length = params.seq_length
43
+ self._include_sentence_id = params.include_sentence_id
44
+
45
+ def _decode(self, record: tf.Tensor):
46
+ """Decodes a serialized tf.Example."""
47
+ name_to_features = {
48
+ 'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
49
+ 'input_mask': tf.io.FixedLenFeature([self._seq_length], tf.int64),
50
+ 'segment_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
51
+ 'label_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
52
+ }
53
+ if self._include_sentence_id:
54
+ name_to_features['sentence_id'] = tf.io.FixedLenFeature([], tf.int64)
55
+ name_to_features['sub_sentence_id'] = tf.io.FixedLenFeature([], tf.int64)
56
+
57
+ example = tf.io.parse_single_example(record, name_to_features)
58
+
59
+ # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
60
+ # So cast all int64 to int32.
61
+ for name in example:
62
+ t = example[name]
63
+ if t.dtype == tf.int64:
64
+ t = tf.cast(t, tf.int32)
65
+ example[name] = t
66
+
67
+ return example
68
+
69
+ def _parse(self, record: Mapping[str, tf.Tensor]):
70
+ """Parses raw tensors into a dict of tensors to be consumed by the model."""
71
+ x = {
72
+ 'input_word_ids': record['input_ids'],
73
+ 'input_mask': record['input_mask'],
74
+ 'input_type_ids': record['segment_ids']
75
+ }
76
+ if self._include_sentence_id:
77
+ x['sentence_id'] = record['sentence_id']
78
+ x['sub_sentence_id'] = record['sub_sentence_id']
79
+
80
+ y = record['label_ids']
81
+ return (x, y)
82
+
83
+ def load(self, input_context: Optional[tf.distribute.InputContext] = None):
84
+ """Returns a tf.dataset.Dataset."""
85
+ reader = input_reader.InputReader(
86
+ params=self._params,
87
+ dataset_fn=dataset_fn.pick_dataset_fn(self._params.file_type),
88
+ decoder_fn=self._decode,
89
+ parser_fn=self._parse)
90
+ return reader.read(input_context)
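
A minimal sketch of loading a tagging TFRecord with the config and loader defined above; the input path and batch size are placeholders.

```python
# Hypothetical usage sketch -- the tfrecord path is a placeholder.
from official.nlp.data import tagging_dataloader

data_config = tagging_dataloader.TaggingDataConfig(
    input_path='/tmp/panx/train.tfrecord',
    seq_length=128,
    global_batch_size=32,
    is_training=True,
    include_sentence_id=False)
dataset = tagging_dataloader.TaggingDataLoader(data_config).load()
features, labels = next(iter(dataset))
# features holds input_word_ids / input_mask / input_type_ids, each [32, 128];
# labels is label_ids of shape [32, 128], with -1 marking padded positions.
```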
tagging_dataloader_test.py ADDED
@@ -0,0 +1,82 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for official.nlp.data.tagging_data_loader."""
16
+ import os
17
+
18
+ from absl.testing import parameterized
19
+ import numpy as np
20
+ import tensorflow as tf, tf_keras
21
+
22
+ from official.nlp.data import tagging_dataloader
23
+
24
+
25
+ def _create_fake_dataset(output_path, seq_length, include_sentence_id):
26
+ """Creates a fake dataset."""
27
+ writer = tf.io.TFRecordWriter(output_path)
28
+
29
+ def create_int_feature(values):
30
+ f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
31
+ return f
32
+
33
+ for i in range(100):
34
+ features = {}
35
+ input_ids = np.random.randint(100, size=(seq_length))
36
+ features['input_ids'] = create_int_feature(input_ids)
37
+ features['input_mask'] = create_int_feature(np.ones_like(input_ids))
38
+ features['segment_ids'] = create_int_feature(np.ones_like(input_ids))
39
+ features['label_ids'] = create_int_feature(
40
+ np.random.randint(10, size=(seq_length)))
41
+ if include_sentence_id:
42
+ features['sentence_id'] = create_int_feature([i])
43
+ features['sub_sentence_id'] = create_int_feature([0])
44
+
45
+ tf_example = tf.train.Example(features=tf.train.Features(feature=features))
46
+ writer.write(tf_example.SerializeToString())
47
+ writer.close()
48
+
49
+
50
+ class TaggingDataLoaderTest(tf.test.TestCase, parameterized.TestCase):
51
+
52
+ @parameterized.parameters(True, False)
53
+ def test_load_dataset(self, include_sentence_id):
54
+ seq_length = 16
55
+ batch_size = 10
56
+ train_data_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
57
+ _create_fake_dataset(train_data_path, seq_length, include_sentence_id)
58
+ data_config = tagging_dataloader.TaggingDataConfig(
59
+ input_path=train_data_path,
60
+ seq_length=seq_length,
61
+ global_batch_size=batch_size,
62
+ include_sentence_id=include_sentence_id)
63
+
64
+ dataset = tagging_dataloader.TaggingDataLoader(data_config).load()
65
+ features, labels = next(iter(dataset))
66
+
67
+ expected_keys = ['input_word_ids', 'input_mask', 'input_type_ids']
68
+ if include_sentence_id:
69
+ expected_keys.extend(['sentence_id', 'sub_sentence_id'])
70
+ self.assertCountEqual(expected_keys, features.keys())
71
+
72
+ self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
73
+ self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
74
+ self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
75
+ self.assertEqual(labels.shape, (batch_size, seq_length))
76
+ if include_sentence_id:
77
+ self.assertEqual(features['sentence_id'].shape, (batch_size,))
78
+ self.assertEqual(features['sub_sentence_id'].shape, (batch_size,))
79
+
80
+
81
+ if __name__ == '__main__':
82
+ tf.test.main()
train_sentencepiece.py ADDED
@@ -0,0 +1,133 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """A script to train sentencepiece model from tensorflow datasets.
16
+
17
+ Reserved tokens:
18
+ pad: 0,
19
+ eos: 1,
20
+ unk: 2
21
+ (bos is not reserved)
22
+ """
23
+
24
+ import os
25
+ import tempfile
26
+ from typing import List, Tuple
27
+
28
+ from absl import app
29
+ from absl import flags
30
+ from absl import logging
31
+ import tensorflow as tf, tf_keras
32
+ import tensorflow_datasets as tfds
33
+
34
+ from sentencepiece import SentencePieceTrainer
35
+
36
+
37
+ FLAGS = flags.FLAGS
38
+ flags.DEFINE_string("output_model_path", None,
39
+ "Path to save the sentencepiece model.")
40
+ flags.mark_flag_as_required("output_model_path")
41
+
42
+ flags.DEFINE_string("tfds_dir", None, "Directory of the tfds.")
43
+ flags.DEFINE_string("tfds_name", "wmt14_translate/de-en",
44
+ "Name of the dataset we generate vacabulay from.")
45
+ flags.DEFINE_string("tfds_split", "train", "Split of the dataset.")
46
+ flags.DEFINE_integer("vocab_size", 32000, "Size of vocabulary.")
47
+ flags.DEFINE_integer(
48
+ "max_char", -1,
49
+ "Maximum number of characters to use. "
50
+ "If a non-positive number is provided, all sentences are used.")
51
+ flags.DEFINE_string("model_type", "bpe",
52
+ "Model algorithm: unigram, bpe, word or char.")
53
+ flags.DEFINE_float("character_coverage", 0.9995,
54
+ "Character coverage to determine the minimum symbols")
55
+ flags.DEFINE_list(
56
+ "data_keys", ["en", "de"],
57
+ "Comma-separated list of keys to use for training the vocabulary.")
58
+
59
+
60
+ def dump_chars_to_textfile(dataset: tf.data.Dataset,
61
+ data_keys: Tuple[str],
62
+ max_char: int = -1):
63
+ """Write part of a TFDS sentence dataset to lines in a text file.
64
+
65
+ Args:
66
+ dataset: tf.dataset containing string-data.
67
+ data_keys: what keys in dataset to dump from.
68
+ max_char: max character to dump to text file.
69
+
70
+ Returns:
71
+ Name of the temp file containing the dumped text.
72
+ """
73
+ ds_iter = dataset.as_numpy_iterator()
74
+ with tempfile.NamedTemporaryFile(delete=False) as outfp:
75
+ char_count = 0
76
+ while True:
77
+ example = next(ds_iter, None)
78
+ if example is None or (
79
+ max_char > 0 and char_count > max_char):
80
+ break
81
+ for k in data_keys:
82
+ line = example[k] + b"\n"
83
+ char_count += len(line)
84
+ outfp.write(line)
85
+ return outfp.name
86
+
87
+
88
+ def train_sentencepiece(
89
+ file_path: str,
90
+ model_path: str,
91
+ vocab_size: int,
92
+ character_coverage: float,
93
+ model_type: str):
94
+ """Train SentencePiece tokenizer from subset of tf dataset.
95
+
96
+ Args:
97
+ file_path: path of data to train sentencepiece.
98
+ model_path: path of model file to save vocab model to.
99
+ vocab_size: size of vocab tokens to train.
100
+ character_coverage: amount of characters covered by the model, good defaults
101
+ are 0.9995 for languages with rich character set like Japanese or Chinese
102
+ and 1.0 for other languages with small character set.
103
+ model_type: type of sentencepiece vocab to train.
104
+
105
+ Returns:
106
+ None. The trained model is written to `<model_path>.model` and `<model_path>.vocab`.
107
+ """
108
+ argstr = " ".join([
109
+ f"--input={file_path}", f"--vocab_size={vocab_size}",
110
+ f"--character_coverage={character_coverage}",
111
+ f"--model_prefix={model_path}", f"--model_type={model_type}",
112
+ "--bos_id=-1", "--pad_id=0", "--eos_id=1", "--unk_id=2"
113
+ ])
114
+ SentencePieceTrainer.Train(argstr)
115
+
116
+
117
+ def main(argv: List[str]):
118
+ del argv
119
+ builder = tfds.builder(FLAGS.tfds_name, data_dir=FLAGS.tfds_dir)
120
+ ds = builder.as_dataset(split=FLAGS.tfds_split)
121
+ tmp_filename = dump_chars_to_textfile(ds, FLAGS.data_keys, FLAGS.max_char)
122
+ logging.info("Sentencepiece model will be placed here: %s",
123
+ FLAGS.output_model_path)
124
+ train_sentencepiece(tmp_filename,
125
+ FLAGS.output_model_path,
126
+ FLAGS.vocab_size,
127
+ FLAGS.character_coverage,
128
+ FLAGS.model_type)
129
+ os.remove(tmp_filename)
130
+
131
+
132
+ if __name__ == "__main__":
133
+ app.run(main)
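
The flag-driven `main` above is the intended entry point, but the helper can also be called directly on a plain-text corpus. A minimal sketch, with an assumed import path and placeholder paths:

```python
# Hypothetical usage sketch -- the corpus path and model prefix are placeholders.
from train_sentencepiece import train_sentencepiece  # assumed import path

train_sentencepiece(
    file_path="/tmp/corpus.txt",   # one sentence per line
    model_path="/tmp/spm_wmt",     # SentencePiece writes /tmp/spm_wmt.model and .vocab
    vocab_size=8000,
    character_coverage=1.0,
    model_type="bpe")
```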
wmt_dataloader.py ADDED
@@ -0,0 +1,295 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Input pipeline for the transformer model to read, filter, and batch examples.
16
+
17
+ Batching scheme
18
+
19
+ Prior to batching, elements in the dataset are grouped by length (max between
20
+ 'inputs' and 'targets' length). Each group is then batched such that:
21
+ group_batch_size * length <= batch_size.
22
+
23
+ Another way to view batch_size is the maximum number of tokens in each batch.
24
+
25
+ Once batched, each element in the dataset will have the shape:
26
+ {'inputs': [group_batch_size, padded_input_length],
27
+ 'targets': [group_batch_size, padded_target_length]}
28
+ Lengths are padded to the longest 'inputs' or 'targets' sequence in the batch
29
+ (padded_input_length and padded_target_length can be different).
30
+
31
+ This batching scheme decreases the fraction of padding tokens per training
32
+ batch, thus improving the training speed significantly.
33
+ """
34
+ from typing import Dict, Optional
35
+
36
+ import dataclasses
37
+ import tensorflow as tf, tf_keras
38
+ import tensorflow_text as tftxt
39
+ from official.core import config_definitions as cfg
40
+ from official.core import input_reader
41
+ from official.nlp.data import data_loader
42
+ from official.nlp.data import data_loader_factory
43
+
44
+ # Example grouping constants. Defines length boundaries for each group.
45
+ # These values are the defaults used in Tensor2Tensor.
46
+ _MIN_BOUNDARY = 8
47
+ _BOUNDARY_SCALE = 1.1
48
+
49
+
50
+ def _get_example_length(example):
51
+ """Returns the maximum length between the example inputs and targets."""
52
+ length = tf.maximum(tf.shape(example[0])[0], tf.shape(example[1])[0])
53
+ return length
54
+
55
+
56
+ def _create_min_max_boundaries(max_length,
57
+ min_boundary=_MIN_BOUNDARY,
58
+ boundary_scale=_BOUNDARY_SCALE):
59
+ """Create min and max boundary lists up to max_length.
60
+
61
+ For example, when max_length=24, min_boundary=4 and boundary_scale=2, the
62
+ returned values will be:
63
+ buckets_min = [0, 4, 8, 16]
64
+ buckets_max = [4, 8, 16, 25]
65
+
66
+ Args:
67
+ max_length: The maximum length of example in dataset.
68
+ min_boundary: Minimum length in boundary.
69
+ boundary_scale: Amount to scale consecutive boundaries in the list.
70
+
71
+ Returns:
72
+ min and max boundary lists
73
+
74
+ """
75
+ # Create bucket boundaries list by scaling the previous boundary or adding 1
76
+ # (to ensure increasing boundary sizes).
77
+ bucket_boundaries = []
78
+ x = min_boundary
79
+ while x < max_length:
80
+ bucket_boundaries.append(x)
81
+ x = max(x + 1, int(x * boundary_scale))
82
+
83
+ # Create min and max boundary lists from the initial list.
84
+ buckets_min = [0] + bucket_boundaries
85
+ buckets_max = bucket_boundaries + [max_length + 1]
86
+ return buckets_min, buckets_max
87
+
88
+
89
+ def _batch_examples(dataset, batch_size, max_length):
90
+ """Group examples by similar lengths, and return batched dataset.
91
+
92
+ Each batch of similar-length examples are padded to the same length, and may
93
+ have different number of elements in each batch, such that:
94
+ group_batch_size * padded_length <= batch_size.
95
+
96
+ This decreases the number of padding tokens per batch, which improves the
97
+ training speed.
98
+
99
+ Args:
100
+ dataset: Dataset of unbatched examples.
101
+ batch_size: Max number of tokens per batch of examples.
102
+ max_length: Max number of tokens in an example input or target sequence.
103
+
104
+ Returns:
105
+ Dataset of batched examples with similar lengths.
106
+ """
107
+ # Get min and max boundary lists for each example. These are used to calculate
108
+ # the `bucket_id`, which is the index at which:
109
+ # buckets_min[bucket_id] <= len(example) < buckets_max[bucket_id]
110
+ # Note that using both min and max lists improves the performance.
111
+ buckets_min, buckets_max = _create_min_max_boundaries(max_length)
112
+
113
+ # Create list of batch sizes for each bucket_id, so that
114
+ # bucket_batch_size[bucket_id] * buckets_max[bucket_id] <= batch_size
115
+ bucket_batch_sizes = [int(batch_size) // x for x in buckets_max]
116
+
117
+ # Validates bucket batch sizes.
118
+ if any([batch_size <= 0 for batch_size in bucket_batch_sizes]):
119
+ raise ValueError(
120
+ 'The token budget, global batch size, is too small to yield a non-empty '
121
+ 'bucket window: %s' % str(bucket_batch_sizes))
122
+
123
+ # bucket_id will be a tensor, so convert this list to a tensor as well.
124
+ bucket_batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64)
125
+
126
+ def example_to_bucket_id(example):
127
+ """Return int64 bucket id for this example, calculated based on length."""
128
+ example_input = example['inputs']
129
+ example_target = example['targets']
130
+ seq_length = _get_example_length((example_input, example_target))
131
+
132
+ conditions_c = tf.logical_and(
133
+ tf.less_equal(buckets_min, seq_length), tf.less(seq_length,
134
+ buckets_max))
135
+ bucket_id = tf.reduce_min(tf.where(conditions_c))
136
+ return bucket_id
137
+
138
+ def window_size_fn(bucket_id):
139
+ """Return number of examples to be grouped when given a bucket id."""
140
+ return bucket_batch_sizes[bucket_id]
141
+
142
+ def batching_fn(bucket_id, grouped_dataset):
143
+ """Batch and add padding to a dataset of elements with similar lengths."""
144
+ bucket_batch_size = window_size_fn(bucket_id)
145
+
146
+ # Batch the dataset and add padding so that all input sequences in the
147
+ # examples have the same length, and all target sequences have the same
148
+ # lengths as well. Resulting lengths of inputs and targets can differ.
149
+ padded_shapes = dict([
150
+ (name, [None] * len(spec.shape))
151
+ for name, spec in grouped_dataset.element_spec.items()
152
+ ])
153
+ return grouped_dataset.padded_batch(bucket_batch_size, padded_shapes)
154
+
155
+ return dataset.apply(
156
+ tf.data.experimental.group_by_window(
157
+ key_func=example_to_bucket_id,
158
+ reduce_func=batching_fn,
159
+ window_size=None,
160
+ window_size_func=window_size_fn))
161
+
162
+
163
+ @dataclasses.dataclass
164
+ class WMTDataConfig(cfg.DataConfig):
165
+ """Data config for WMT translation."""
166
+ max_seq_length: int = 64
167
+ static_batch: bool = False
168
+ sentencepiece_model_path: str = ''
169
+ src_lang: str = ''
170
+ tgt_lang: str = ''
171
+ transform_and_batch: bool = True
172
+ has_unique_id: bool = False
173
+
174
+
175
+ @data_loader_factory.register_data_loader_cls(WMTDataConfig)
176
+ class WMTDataLoader(data_loader.DataLoader):
177
+ """A class to load dataset for WMT translation task."""
178
+
179
+ def __init__(self, params: WMTDataConfig):
180
+ self._params = params
181
+ self._max_seq_length = params.max_seq_length
182
+ self._static_batch = params.static_batch
183
+ self._global_batch_size = params.global_batch_size
184
+ if self._params.transform_and_batch:
185
+ self._tokenizer = tftxt.SentencepieceTokenizer(
186
+ model=tf.io.gfile.GFile(params.sentencepiece_model_path, 'rb').read(),
187
+ add_eos=True)
188
+
189
+ def _decode(self, record: tf.Tensor):
190
+ """Decodes a serialized tf.Example."""
191
+ name_to_features = {
192
+ self._params.src_lang: tf.io.FixedLenFeature([], tf.string),
193
+ self._params.tgt_lang: tf.io.FixedLenFeature([], tf.string),
194
+ }
195
+ if self._params.has_unique_id:
196
+ name_to_features['unique_id'] = tf.io.FixedLenFeature([], tf.int64)
197
+ example = tf.io.parse_single_example(record, name_to_features)
198
+ # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
199
+ # So cast all int64 to int32.
200
+ for name in example:
201
+ t = example[name]
202
+ if t.dtype == tf.int64:
203
+ t = tf.cast(t, tf.int32)
204
+ example[name] = t
205
+ return example
206
+
207
+ def _tokenize(self, inputs) -> Dict[str, tf.Tensor]:
208
+ tokenized_inputs = {}
209
+ for k, v in inputs.items():
210
+ if k == self._params.src_lang:
211
+ tokenized_inputs['inputs'] = self._tokenizer.tokenize(v)
212
+ elif k == self._params.tgt_lang:
213
+ tokenized_inputs['targets'] = self._tokenizer.tokenize(v)
214
+ else:
215
+ tokenized_inputs[k] = v
216
217
+ return tokenized_inputs
218
+
219
+ def _filter_max_length(self, inputs):
220
221
+ return tf.logical_and(
222
+ tf.shape(inputs['inputs'])[0] <= self._max_seq_length,
223
+ tf.shape(inputs['targets'])[0] <= self._max_seq_length)
224
+
225
+ def _maybe_truncate(self, inputs):
226
+ truncated_inputs = {}
227
+ for k, v in inputs.items():
228
+ if k == 'inputs' or k == 'targets':
229
+ truncated_inputs[k] = tf.pad(
230
+ v[:self._max_seq_length - 1], [[0, 1]],
231
+ constant_values=1) if tf.shape(v)[0] > self._max_seq_length else v
232
+ else:
233
+ truncated_inputs[k] = v
234
+ return truncated_inputs
235
+
236
+ def _tokenize_bucketize_and_batch(
237
+ self,
238
+ dataset,
239
+ input_context: Optional[tf.distribute.InputContext] = None):
240
+ dataset = dataset.map(
241
+ self._tokenize, num_parallel_calls=tf.data.experimental.AUTOTUNE)
242
+
243
+ if self._params.is_training:
244
+ dataset = dataset.filter(self._filter_max_length)
245
+ else:
246
+ dataset = dataset.map(
247
+ self._maybe_truncate,
248
+ num_parallel_calls=tf.data.experimental.AUTOTUNE)
249
+
250
+ per_replica_batch_size = input_context.get_per_replica_batch_size(
251
+ self._global_batch_size) if input_context else self._global_batch_size
252
+ if self._static_batch:
253
+ padded_shapes = {}
254
+ for name, _ in dataset.element_spec.items():
255
+ if name == 'unique_id':
256
+ padded_shapes[name] = []
257
+ else:
258
+ padded_shapes[name] = [self._max_seq_length
259
+ ] if self._static_batch else [None]
260
+ batch_size = per_replica_batch_size
261
+ if self._params.is_training:
262
+ batch_size = int(batch_size // self._max_seq_length)
263
+ dataset = dataset.padded_batch(
264
+ batch_size,
265
+ padded_shapes,
266
+ drop_remainder=True)
267
+ else:
268
+ # Group and batch such that each batch has examples of similar length.
269
+ dataset = _batch_examples(dataset, per_replica_batch_size,
270
+ self._max_seq_length)
271
+ # Prefetch the next element to improve speed of input pipeline.
272
+ dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
273
+ return dataset
274
+
275
+ def load(self, input_context: Optional[tf.distribute.InputContext] = None):
276
+ """Returns a tf.dataset.Dataset."""
277
+ decoder_fn = None
278
+ # Only decode for TFRecords.
279
+ if self._params.input_path:
280
+ decoder_fn = self._decode
281
+
282
+ def _identity(
283
+ dataset, input_context: Optional[tf.distribute.InputContext] = None):
284
+ del input_context
285
+ return dataset
286
+
287
+ transform_and_batch_fn = _identity
288
+ if self._params.transform_and_batch:
289
+ transform_and_batch_fn = self._tokenize_bucketize_and_batch
290
+
291
+ reader = input_reader.InputReader(
292
+ params=self._params,
293
+ decoder_fn=decoder_fn,
294
+ transform_and_batch_fn=transform_and_batch_fn)
295
+ return reader.read(input_context)
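
A minimal sketch of the token-budget batching in practice; the TFRecord glob, language keys, and SentencePiece path are placeholders.

```python
# Hypothetical usage sketch -- paths and language keys are placeholders.
from official.nlp.data import wmt_dataloader

data_config = wmt_dataloader.WMTDataConfig(
    input_path='/tmp/wmt/train-*.tfrecord',
    max_seq_length=64,
    global_batch_size=4096,          # token budget per batch when bucketing
    is_training=True,
    static_batch=False,
    src_lang='en',
    tgt_lang='de',
    sentencepiece_model_path='/tmp/wmt/spm_wmt.model')
dataset = wmt_dataloader.WMTDataLoader(data_config).load()
batch = next(iter(dataset))
# batch['inputs'] and batch['targets'] are padded id matrices whose row count
# varies per bucket so that rows * padded_length <= 4096.
```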
wmt_dataloader_test.py ADDED
@@ -0,0 +1,130 @@
1
+ # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for official.nlp.data.wmt_dataloader."""
16
+ import os
17
+ from absl.testing import parameterized
18
+
19
+ import tensorflow as tf, tf_keras
20
+
21
+ from sentencepiece import SentencePieceTrainer
22
+ from official.nlp.data import wmt_dataloader
23
+
24
+
25
+ def _generate_line_file(filepath, lines):
26
+ with tf.io.gfile.GFile(filepath, 'w') as f:
27
+ for l in lines:
28
+ f.write('{}\n'.format(l))
29
+
30
+
31
+ def _generate_record_file(filepath, src_lines, tgt_lines, unique_id=False):
32
+ writer = tf.io.TFRecordWriter(filepath)
33
+ for i, (src, tgt) in enumerate(zip(src_lines, tgt_lines)):
34
+ features = {
35
+ 'en': tf.train.Feature(
36
+ bytes_list=tf.train.BytesList(
37
+ value=[src.encode()])),
38
+ 'reverse_en': tf.train.Feature(
39
+ bytes_list=tf.train.BytesList(
40
+ value=[tgt.encode()])),
41
+ }
42
+ if unique_id:
43
+ features['unique_id'] = tf.train.Feature(
44
+ int64_list=tf.train.Int64List(value=[i]))
45
+ example = tf.train.Example(
46
+ features=tf.train.Features(
47
+ feature=features))
48
+ writer.write(example.SerializeToString())
49
+ writer.close()
50
+
51
+
52
+ def _train_sentencepiece(input_path, vocab_size, model_path, eos_id=1):
53
+ argstr = ' '.join([
54
+ f'--input={input_path}', f'--vocab_size={vocab_size}',
55
+ '--character_coverage=0.995',
56
+ f'--model_prefix={model_path}', '--model_type=bpe',
57
+ '--bos_id=-1', '--pad_id=0', f'--eos_id={eos_id}', '--unk_id=2'
58
+ ])
59
+ SentencePieceTrainer.Train(argstr)
60
+
61
+
62
+ class WMTDataLoaderTest(tf.test.TestCase, parameterized.TestCase):
63
+
64
+ def setUp(self):
65
+ super(WMTDataLoaderTest, self).setUp()
66
+ self._temp_dir = self.get_temp_dir()
67
+ src_lines = [
68
+ 'abc ede fg',
69
+ 'bbcd ef a g',
70
+ 'de f a a g'
71
+ ]
72
+ tgt_lines = [
73
+ 'dd cc a ef g',
74
+ 'bcd ef a g',
75
+ 'gef cd ba'
76
+ ]
77
+ self._record_train_input_path = os.path.join(self._temp_dir, 'train.record')
78
+ _generate_record_file(self._record_train_input_path, src_lines, tgt_lines)
79
+ self._record_test_input_path = os.path.join(self._temp_dir, 'test.record')
80
+ _generate_record_file(self._record_test_input_path, src_lines, tgt_lines,
81
+ unique_id=True)
82
+ self._sentencepeice_input_path = os.path.join(self._temp_dir, 'inputs.txt')
83
+ _generate_line_file(self._sentencepeice_input_path, src_lines + tgt_lines)
84
+ sentencepeice_model_prefix = os.path.join(self._temp_dir, 'sp')
85
+ _train_sentencepiece(self._sentencepeice_input_path, 20,
86
+ sentencepeice_model_prefix)
87
+ self._sentencepeice_model_path = '{}.model'.format(
88
+ sentencepeice_model_prefix)
89
+
90
+ @parameterized.named_parameters(
91
+ ('train_static', True, True, 100, (2, 35)),
92
+ ('train_non_static', True, False, 100, (12, 7)),
93
+ ('non_train_static', False, True, 3, (3, 35)),
94
+ ('non_train_non_static', False, False, 50, (2, 7)),)
95
+ def test_load_dataset(
96
+ self, is_training, static_batch, batch_size, expected_shape):
97
+ data_config = wmt_dataloader.WMTDataConfig(
98
+ input_path=self._record_train_input_path
99
+ if is_training else self._record_test_input_path,
100
+ max_seq_length=35,
101
+ global_batch_size=batch_size,
102
+ is_training=is_training,
103
+ static_batch=static_batch,
104
+ src_lang='en',
105
+ tgt_lang='reverse_en',
106
+ sentencepiece_model_path=self._sentencepeice_model_path)
107
+ dataset = wmt_dataloader.WMTDataLoader(data_config).load()
108
+ examples = next(iter(dataset))
109
+ inputs, targets = examples['inputs'], examples['targets']
110
+ self.assertEqual(inputs.shape, expected_shape)
111
+ self.assertEqual(targets.shape, expected_shape)
112
+
113
+ def test_load_dataset_raise_invalid_window(self):
114
+ batch_tokens_size = 10 # this is too small to form buckets.
115
+ data_config = wmt_dataloader.WMTDataConfig(
116
+ input_path=self._record_train_input_path,
117
+ max_seq_length=100,
118
+ global_batch_size=batch_tokens_size,
119
+ is_training=True,
120
+ static_batch=False,
121
+ src_lang='en',
122
+ tgt_lang='reverse_en',
123
+ sentencepiece_model_path=self._sentencepeice_model_path)
124
+ with self.assertRaisesRegex(
125
+ ValueError, 'The token budget, global batch size, is too small.*'):
126
+ _ = wmt_dataloader.WMTDataLoader(data_config).load()
127
+
128
+
129
+ if __name__ == '__main__':
130
+ tf.test.main()
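
As a sanity check on the static-batch expectations above: in training mode the row count is the token budget divided by `max_seq_length` (100 // 35 = 2) and every sequence is padded to 35, hence (2, 35); in the non-training static case the batch size of 3 is used directly, hence (3, 35). The non-static shapes follow from the length bucketing in `wmt_dataloader._batch_examples`.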