Pradeep Kumar committed on
Commit f891af0 · verified · 1 Parent(s): 69d32b7

Delete export_tfhub_lib_test.py

Files changed (1)
  1. export_tfhub_lib_test.py +0 -1080
export_tfhub_lib_test.py DELETED
@@ -1,1080 +0,0 @@
# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests export_tfhub_lib."""

import os
import tempfile

from absl.testing import parameterized
import numpy as np
import tensorflow as tf, tf_keras
from tensorflow import estimator as tf_estimator
import tensorflow_hub as hub
import tensorflow_text as text

from sentencepiece import SentencePieceTrainer
from official.legacy.bert import configs
from official.modeling import tf_utils
from official.nlp.configs import encoders
from official.nlp.modeling import layers
from official.nlp.modeling import models
from official.nlp.tools import export_tfhub_lib

def _get_bert_config_or_encoder_config(use_bert_config,
                                       hidden_size,
                                       num_hidden_layers,
                                       encoder_type="albert",
                                       vocab_size=100):
  """Generates config args for export_tfhub_lib._create_model().

  Args:
    use_bert_config: bool. If True, returns legacy BertConfig.
    hidden_size: int.
    num_hidden_layers: int.
    encoder_type: str. Can be ['albert', 'bert', 'bert_v2']. If use_bert_config
      == True, then encoder_type is not used.
    vocab_size: int.

  Returns:
    bert_config, encoder_config. Exactly one of them is not None. If
    `use_bert_config` == True, the first config is valid. Otherwise
    `bert_config` == None.
  """
  if use_bert_config:
    bert_config = configs.BertConfig(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        intermediate_size=32,
        max_position_embeddings=128,
        num_attention_heads=2,
        num_hidden_layers=num_hidden_layers)
    encoder_config = None
  else:
    bert_config = None
    if encoder_type == "albert":
      encoder_config = encoders.EncoderConfig(
          type="albert",
          albert=encoders.AlbertEncoderConfig(
              vocab_size=vocab_size,
              embedding_width=16,
              hidden_size=hidden_size,
              intermediate_size=32,
              max_position_embeddings=128,
              num_attention_heads=2,
              num_layers=num_hidden_layers,
              dropout_rate=0.1))
    else:
      # encoder_type can be 'bert' or 'bert_v2'.
      model_config = encoders.BertEncoderConfig(
          vocab_size=vocab_size,
          embedding_size=16,
          hidden_size=hidden_size,
          intermediate_size=32,
          max_position_embeddings=128,
          num_attention_heads=2,
          num_layers=num_hidden_layers,
          dropout_rate=0.1)
      kwargs = {"type": encoder_type, encoder_type: model_config}
      encoder_config = encoders.EncoderConfig(**kwargs)

  return bert_config, encoder_config

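# For orientation, a usage sketch of the helper above (illustrative values,
# not part of the original test suite):
#
#   # Legacy BERT path: only bert_config is set.
#   bert_config, _ = _get_bert_config_or_encoder_config(
#       True, hidden_size=16, num_hidden_layers=1)
#   # EncoderConfig path: only encoder_config is set.
#   _, encoder_config = _get_bert_config_or_encoder_config(
#       False, hidden_size=16, num_hidden_layers=1, encoder_type="bert_v2")
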
def _get_vocab_or_sp_model_dummy(temp_dir, use_sp_model):
  """Returns tokenizer asset args for export_tfhub_lib.export_model()."""
  dummy_file = os.path.join(temp_dir, "dummy_file.txt")
  with tf.io.gfile.GFile(dummy_file, "w") as f:
    f.write("dummy content")
  if use_sp_model:
    vocab_file, sp_model_file = None, dummy_file
  else:
    vocab_file, sp_model_file = dummy_file, None
  return vocab_file, sp_model_file


def _read_asset(asset: tf.saved_model.Asset):
  return tf.io.gfile.GFile(asset.asset_path.numpy()).read()


def _find_lambda_layers(layer):
  """Returns list of all Lambda layers in a Keras model."""
  if isinstance(layer, tf_keras.layers.Lambda):
    return [layer]
  elif hasattr(layer, "layers"):  # It's nested, like a Model.
    result = []
    for l in layer.layers:
      result += _find_lambda_layers(l)
    return result
  else:
    return []

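# A minimal sketch of what _find_lambda_layers detects (hypothetical model,
# not used by the tests below):
#
#   model = tf_keras.Sequential([tf_keras.layers.Lambda(lambda x: x * 2)])
#   assert len(_find_lambda_layers(model)) == 1  # Would flag the export.
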
class ExportModelTest(tf.test.TestCase, parameterized.TestCase):
  """Tests exporting a Transformer Encoder model as a SavedModel.

  This covers export from an Encoder checkpoint to a SavedModel without
  the .mlm subobject. This is no longer preferred, but still useful
  for models like Electra that are trained without the MLM task.

  The export code is generic. This test focuses on two main cases
  (the most important ones in practice when this was written in 2020):
  - BERT built from a legacy BertConfig, for use with BertTokenizer.
  - ALBERT built from an EncoderConfig (as a representative of all other
    choices beyond BERT), for use with SentencepieceTokenizer (the one
    alternative to BertTokenizer).
  """

  @parameterized.named_parameters(
      ("Bert_Legacy", True, None), ("Albert", False, "albert"),
      ("BertEncoder", False, "bert"), ("BertEncoderV2", False, "bert_v2"))
  def test_export_model(self, use_bert, encoder_type):
    # Create the encoder and export it.
    hidden_size = 16
    num_hidden_layers = 1
    bert_config, encoder_config = _get_bert_config_or_encoder_config(
        use_bert,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        encoder_type=encoder_type)
    bert_model, encoder = export_tfhub_lib._create_model(
        bert_config=bert_config, encoder_config=encoder_config, with_mlm=False)
    self.assertEmpty(
        _find_lambda_layers(bert_model),
        "Lambda layers are non-portable since they serialize Python bytecode.")
    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
    checkpoint = tf.train.Checkpoint(encoder=encoder)
    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
    model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)

    vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
        self.get_temp_dir(), use_sp_model=not use_bert)
    export_path = os.path.join(self.get_temp_dir(), "hub")
    export_tfhub_lib.export_model(
        export_path=export_path,
        bert_config=bert_config,
        encoder_config=encoder_config,
        model_checkpoint_path=model_checkpoint_path,
        with_mlm=False,
        vocab_file=vocab_file,
        sp_model_file=sp_model_file,
        do_lower_case=True)

    # Restore the exported model.
    hub_layer = hub.KerasLayer(export_path, trainable=True)

    # Check legacy tokenization data.
    if use_bert:
      self.assertTrue(hub_layer.resolved_object.do_lower_case.numpy())
      self.assertEqual("dummy content",
                       _read_asset(hub_layer.resolved_object.vocab_file))
      self.assertFalse(hasattr(hub_layer.resolved_object, "sp_model_file"))
    else:
      self.assertFalse(hasattr(hub_layer.resolved_object, "do_lower_case"))
      self.assertFalse(hasattr(hub_layer.resolved_object, "vocab_file"))
      self.assertEqual("dummy content",
                       _read_asset(hub_layer.resolved_object.sp_model_file))

    # Check restored weights.
    self.assertEqual(
        len(bert_model.trainable_weights), len(hub_layer.trainable_weights))
    for source_weight, hub_weight in zip(bert_model.trainable_weights,
                                         hub_layer.trainable_weights):
      self.assertAllClose(source_weight.numpy(), hub_weight.numpy())

    # Check computation.
    seq_length = 10
    dummy_ids = np.zeros((2, seq_length), dtype=np.int32)
    input_dict = dict(
        input_word_ids=dummy_ids,
        input_mask=dummy_ids,
        input_type_ids=dummy_ids)
    hub_output = hub_layer(input_dict)
    source_output = bert_model(input_dict)
    encoder_output = encoder(input_dict)
    self.assertEqual(hub_output["pooled_output"].shape, (2, hidden_size))
    self.assertEqual(hub_output["sequence_output"].shape,
                     (2, seq_length, hidden_size))
    self.assertLen(hub_output["encoder_outputs"], num_hidden_layers)

    for key in ("pooled_output", "sequence_output", "encoder_outputs"):
      self.assertAllClose(source_output[key], hub_output[key])
      self.assertAllClose(source_output[key], encoder_output[key])

    # The "default" output of BERT as a text representation is pooled_output.
    self.assertAllClose(hub_output["pooled_output"], hub_output["default"])

    # Test that training=True makes a difference (activates dropout).
    def _dropout_mean_stddev(training, num_runs=20):
      input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
      input_dict = dict(
          input_word_ids=input_ids,
          input_mask=np.ones_like(input_ids),
          input_type_ids=np.zeros_like(input_ids))
      outputs = np.concatenate([
          hub_layer(input_dict, training=training)["pooled_output"]
          for _ in range(num_runs)
      ])
      return np.mean(np.std(outputs, axis=0))

    self.assertLess(_dropout_mean_stddev(training=False), 1e-6)
    self.assertGreater(_dropout_mean_stddev(training=True), 1e-3)

    # Test propagation of seq_length in shape inference.
    input_word_ids = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_mask = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_type_ids = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_dict = dict(
        input_word_ids=input_word_ids,
        input_mask=input_mask,
        input_type_ids=input_type_ids)
    output_dict = hub_layer(input_dict)
    pooled_output = output_dict["pooled_output"]
    sequence_output = output_dict["sequence_output"]
    encoder_outputs = output_dict["encoder_outputs"]

    self.assertEqual(pooled_output.shape.as_list(), [None, hidden_size])
    self.assertEqual(sequence_output.shape.as_list(),
                     [None, seq_length, hidden_size])
    self.assertLen(encoder_outputs, num_hidden_layers)

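# For context, a typical consumer of such an export builds a classifier on
# pooled_output (a sketch under assumed input shapes; not part of this test):
#
#   inputs = dict(
#       input_word_ids=tf_keras.layers.Input(shape=(128,), dtype=tf.int32),
#       input_mask=tf_keras.layers.Input(shape=(128,), dtype=tf.int32),
#       input_type_ids=tf_keras.layers.Input(shape=(128,), dtype=tf.int32))
#   outputs = hub.KerasLayer(export_path, trainable=True)(inputs)
#   logits = tf_keras.layers.Dense(2)(outputs["pooled_output"])
#   model = tf_keras.Model(inputs, logits)
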
class ExportModelWithMLMTest(tf.test.TestCase, parameterized.TestCase):
  """Tests exporting a Transformer Encoder model as a SavedModel.

  This covers export from a Pretrainer checkpoint to a SavedModel including
  the .mlm subobject, which is the preferred way since 2020.

  The export code is generic. This test focuses on two main cases
  (the most important ones in practice when this was written in 2020):
  - BERT built from a legacy BertConfig, for use with BertTokenizer.
  - ALBERT built from an EncoderConfig (as a representative of all other
    choices beyond BERT), for use with SentencepieceTokenizer (the one
    alternative to BertTokenizer).
  """

  def test_copy_pooler_dense_to_encoder(self):
    encoder_config = encoders.EncoderConfig(
        type="bert",
        bert=encoders.BertEncoderConfig(
            hidden_size=24, intermediate_size=48, num_layers=2))
    cls_heads = [
        layers.ClassificationHead(
            inner_dim=24, num_classes=2, name="next_sentence")
    ]
    encoder = encoders.build_encoder(encoder_config)
    pretrainer = models.BertPretrainerV2(
        encoder_network=encoder,
        classification_heads=cls_heads,
        mlm_activation=tf_utils.get_activation(
            encoder_config.get().hidden_activation))
    # Makes sure the pretrainer variables are created.
    _ = pretrainer(pretrainer.inputs)
    checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))

    vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
        self.get_temp_dir(), use_sp_model=True)
    export_path = os.path.join(self.get_temp_dir(), "hub")
    export_tfhub_lib.export_model(
        export_path=export_path,
        encoder_config=encoder_config,
        model_checkpoint_path=tf.train.latest_checkpoint(model_checkpoint_dir),
        with_mlm=True,
        copy_pooler_dense_to_encoder=True,
        vocab_file=vocab_file,
        sp_model_file=sp_model_file,
        do_lower_case=True)
    # Restores a hub KerasLayer.
    hub_layer = hub.KerasLayer(export_path, trainable=True)
    dummy_ids = np.zeros((2, 10), dtype=np.int32)
    input_dict = dict(
        input_word_ids=dummy_ids,
        input_mask=dummy_ids,
        input_type_ids=dummy_ids)
    hub_pooled_output = hub_layer(input_dict)["pooled_output"]
    encoder_outputs = encoder(input_dict)
    # Verify that hub_layer's pooled_output is the same as the output of next
    # sentence prediction's dense layer.
    pretrained_pooled_output = cls_heads[0].dense(
        (encoder_outputs["sequence_output"][:, 0, :]))
    self.assertAllClose(hub_pooled_output, pretrained_pooled_output)
    # But the pooled_output between encoder and hub_layer are not the same.
    encoder_pooled_output = encoder_outputs["pooled_output"]
    self.assertNotAllClose(hub_pooled_output, encoder_pooled_output)

  @parameterized.named_parameters(
      ("Bert", True),
      ("Albert", False),
  )
  def test_export_model_with_mlm(self, use_bert):
    # Create the encoder and export it.
    hidden_size = 16
    num_hidden_layers = 2
    bert_config, encoder_config = _get_bert_config_or_encoder_config(
        use_bert, hidden_size, num_hidden_layers)
    bert_model, pretrainer = export_tfhub_lib._create_model(
        bert_config=bert_config, encoder_config=encoder_config, with_mlm=True)
    self.assertEmpty(
        _find_lambda_layers(bert_model),
        "Lambda layers are non-portable since they serialize Python bytecode.")
    bert_model_with_mlm = bert_model.mlm
    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")

    checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)

    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
    model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)

    vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
        self.get_temp_dir(), use_sp_model=not use_bert)
    export_path = os.path.join(self.get_temp_dir(), "hub")
    export_tfhub_lib.export_model(
        export_path=export_path,
        bert_config=bert_config,
        encoder_config=encoder_config,
        model_checkpoint_path=model_checkpoint_path,
        with_mlm=True,
        vocab_file=vocab_file,
        sp_model_file=sp_model_file,
        do_lower_case=True)

    # Restore the exported model.
    hub_layer = hub.KerasLayer(export_path, trainable=True)

    # Check legacy tokenization data.
    if use_bert:
      self.assertTrue(hub_layer.resolved_object.do_lower_case.numpy())
      self.assertEqual("dummy content",
                       _read_asset(hub_layer.resolved_object.vocab_file))
      self.assertFalse(hasattr(hub_layer.resolved_object, "sp_model_file"))
    else:
      self.assertFalse(hasattr(hub_layer.resolved_object, "do_lower_case"))
      self.assertFalse(hasattr(hub_layer.resolved_object, "vocab_file"))
      self.assertEqual("dummy content",
                       _read_asset(hub_layer.resolved_object.sp_model_file))

    # Check restored weights.
    # Note that we set `_auto_track_sub_layers` to False when exporting the
    # SavedModel, so hub_layer has the same number of weights as bert_model;
    # otherwise, hub_layer will have extra weights from its `mlm` subobject.
    self.assertEqual(
        len(bert_model.trainable_weights), len(hub_layer.trainable_weights))
    for source_weight, hub_weight in zip(bert_model.trainable_weights,
                                         hub_layer.trainable_weights):
      self.assertAllClose(source_weight, hub_weight)

    # Check computation.
    seq_length = 10
    dummy_ids = np.zeros((2, seq_length), dtype=np.int32)
    input_dict = dict(
        input_word_ids=dummy_ids,
        input_mask=dummy_ids,
        input_type_ids=dummy_ids)
    hub_outputs_dict = hub_layer(input_dict)
    source_outputs_dict = bert_model(input_dict)
    encoder_outputs_dict = pretrainer.encoder_network(
        [dummy_ids, dummy_ids, dummy_ids])
    self.assertEqual(hub_outputs_dict["pooled_output"].shape, (2, hidden_size))
    self.assertEqual(hub_outputs_dict["sequence_output"].shape,
                     (2, seq_length, hidden_size))
    for output_key in ("pooled_output", "sequence_output", "encoder_outputs"):
      self.assertAllClose(source_outputs_dict[output_key],
                          hub_outputs_dict[output_key])
      self.assertAllClose(source_outputs_dict[output_key],
                          encoder_outputs_dict[output_key])

    # The "default" output of BERT as a text representation is pooled_output.
    self.assertAllClose(hub_outputs_dict["pooled_output"],
                        hub_outputs_dict["default"])

    # Test that training=True makes a difference (activates dropout).
    def _dropout_mean_stddev(training, num_runs=20):
      input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
      input_dict = dict(
          input_word_ids=input_ids,
          input_mask=np.ones_like(input_ids),
          input_type_ids=np.zeros_like(input_ids))
      outputs = np.concatenate([
          hub_layer(input_dict, training=training)["pooled_output"]
          for _ in range(num_runs)
      ])
      return np.mean(np.std(outputs, axis=0))

    self.assertLess(_dropout_mean_stddev(training=False), 1e-6)
    self.assertGreater(_dropout_mean_stddev(training=True), 1e-3)

    # Checks sub-object `mlm`.
    self.assertTrue(hasattr(hub_layer.resolved_object, "mlm"))

    self.assertLen(hub_layer.resolved_object.mlm.trainable_variables,
                   len(bert_model_with_mlm.trainable_weights))
    self.assertLen(hub_layer.resolved_object.mlm.trainable_variables,
                   len(pretrainer.trainable_weights))
    for source_weight, hub_weight, pretrainer_weight in zip(
        bert_model_with_mlm.trainable_weights,
        hub_layer.resolved_object.mlm.trainable_variables,
        pretrainer.trainable_weights):
      self.assertAllClose(source_weight, hub_weight)
      self.assertAllClose(source_weight, pretrainer_weight)

    max_predictions_per_seq = 4
    mlm_positions = np.zeros((2, max_predictions_per_seq), dtype=np.int32)
    input_dict = dict(
        input_word_ids=dummy_ids,
        input_mask=dummy_ids,
        input_type_ids=dummy_ids,
        masked_lm_positions=mlm_positions)
    hub_mlm_outputs_dict = hub_layer.resolved_object.mlm(input_dict)
    source_mlm_outputs_dict = bert_model_with_mlm(input_dict)
    for output_key in ("pooled_output", "sequence_output", "mlm_logits",
                       "encoder_outputs"):
      self.assertAllClose(hub_mlm_outputs_dict[output_key],
                          source_mlm_outputs_dict[output_key])

    pretrainer_mlm_logits_output = pretrainer(input_dict)["mlm_logits"]
    self.assertAllClose(hub_mlm_outputs_dict["mlm_logits"],
                        pretrainer_mlm_logits_output)

    # Test that training=True makes a difference (activates dropout).
    def _dropout_mean_stddev_mlm(training, num_runs=20):
      input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
      mlm_position_ids = np.array([[1, 2, 3, 4]], np.int32)
      input_dict = dict(
          input_word_ids=input_ids,
          input_mask=np.ones_like(input_ids),
          input_type_ids=np.zeros_like(input_ids),
          masked_lm_positions=mlm_position_ids)
      outputs = np.concatenate([
          hub_layer.resolved_object.mlm(input_dict,
                                        training=training)["pooled_output"]
          for _ in range(num_runs)
      ])
      return np.mean(np.std(outputs, axis=0))

    self.assertLess(_dropout_mean_stddev_mlm(training=False), 1e-6)
    self.assertGreater(_dropout_mean_stddev_mlm(training=True), 1e-3)

    # Test propagation of seq_length in shape inference.
    input_word_ids = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_mask = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_type_ids = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_dict = dict(
        input_word_ids=input_word_ids,
        input_mask=input_mask,
        input_type_ids=input_type_ids)
    hub_outputs_dict = hub_layer(input_dict)
    self.assertEqual(hub_outputs_dict["pooled_output"].shape.as_list(),
                     [None, hidden_size])
    self.assertEqual(hub_outputs_dict["sequence_output"].shape.as_list(),
                     [None, seq_length, hidden_size])

_STRING_NOT_TO_LEAK = "private_path_component_"


class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):

  def _make_vocab_file(self, vocab, filename="vocab.txt", add_mask_token=False):
    """Creates wordpiece vocab file with given words plus special tokens.

    The tokens of the resulting model are, in this order:
        [PAD], [UNK], [CLS], [SEP], [MASK]*, ...vocab...
    *=if requested by args.

    This function also accepts wordpieces that start with the ## continuation
    marker, but avoiding those makes this function interchangeable with
    _make_sp_model_file(), up to the extra dimension returned by BertTokenizer.

    Args:
      vocab: a list of strings with the words or wordpieces to put into the
        model's vocabulary. Do not include special tokens here.
      filename: Optionally, a filename (relative to the temporary directory
        created by this function).
      add_mask_token: an optional bool, whether to include a [MASK] token.

    Returns:
      The absolute filename of the created vocab file.
    """
    full_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"
                 ] + ["[MASK]"] * add_mask_token + vocab
    path = os.path.join(
        tempfile.mkdtemp(
            dir=self.get_temp_dir(),  # New subdir each time.
            prefix=_STRING_NOT_TO_LEAK),
        filename)
    with tf.io.gfile.GFile(path, "w") as f:
      f.write("\n".join(full_vocab + [""]))
    return path

  def _make_sp_model_file(self, vocab, prefix="spm", add_mask_token=False):
    """Creates Sentencepiece word model with given words plus special tokens.

    The tokens of the resulting model are, in this order:
        <pad>, <unk>, [CLS], [SEP], [MASK]*, ...vocab..., <s>, </s>
    *=if requested by args.

    The words in the input vocab are plain text, without the whitespace marker.
    That makes this function interchangeable with _make_vocab_file().

    Args:
      vocab: a list of strings with the words to put into the model's
        vocabulary. Do not include special tokens here.
      prefix: an optional string, to change the filename prefix for the model
        (relative to the temporary directory created by this function).
      add_mask_token: an optional bool, whether to include a [MASK] token.

    Returns:
      The absolute filename of the created Sentencepiece model file.
    """
    model_prefix = os.path.join(
        tempfile.mkdtemp(dir=self.get_temp_dir()),  # New subdir each time.
        prefix)
    input_file = model_prefix + "_train_input.txt"
    # Create input text for training the sp model from the tokens provided.
    # Repeat tokens, the earlier the more, because they are sorted by frequency.
    input_text = []
    for i, token in enumerate(vocab):
      input_text.append(" ".join([token] * (len(vocab) - i)))
    with tf.io.gfile.GFile(input_file, "w") as f:
      f.write("\n".join(input_text + [""]))
    control_symbols = "[CLS],[SEP]"
    full_vocab_size = len(vocab) + 6  # <pad>, <unk>, [CLS], [SEP], <s>, </s>.
    if add_mask_token:
      control_symbols += ",[MASK]"
      full_vocab_size += 1
    flags = dict(
        model_prefix=model_prefix,
        model_type="word",
        input=input_file,
        pad_id=0,
        unk_id=1,
        control_symbols=control_symbols,
        vocab_size=full_vocab_size,
        bos_id=full_vocab_size - 2,
        eos_id=full_vocab_size - 1)
    SentencePieceTrainer.Train(" ".join(
        ["--{}={}".format(k, v) for k, v in flags.items()]))
    return model_prefix + ".model"

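  # For reference, the model file created above can be read back directly with
  # TF.text (a sketch, assuming `sp_model_path` is the returned filename):
  #
  #   tokenizer = text.SentencepieceTokenizer(
  #       model=tf.io.gfile.GFile(sp_model_path, "rb").read())
  #   token_ids = tokenizer.tokenize(["hello world"])
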
  def _do_export(self,
                 vocab,
                 do_lower_case,
                 default_seq_length=128,
                 tokenize_with_offsets=True,
                 use_sp_model=False,
                 experimental_disable_assert=False,
                 add_mask_token=False):
    """Runs SavedModel export and returns the export_path."""
    export_path = tempfile.mkdtemp(dir=self.get_temp_dir())
    vocab_file = sp_model_file = None
    if use_sp_model:
      sp_model_file = self._make_sp_model_file(
          vocab, add_mask_token=add_mask_token)
    else:
      vocab_file = self._make_vocab_file(vocab, add_mask_token=add_mask_token)
    export_tfhub_lib.export_preprocessing(
        export_path,
        vocab_file=vocab_file,
        sp_model_file=sp_model_file,
        do_lower_case=do_lower_case,
        tokenize_with_offsets=tokenize_with_offsets,
        default_seq_length=default_seq_length,
        experimental_disable_assert=experimental_disable_assert)
    # Invalidate the original filename to verify loading from the SavedModel.
    tf.io.gfile.remove(sp_model_file or vocab_file)
    return export_path

  def test_no_leaks(self):
    """Tests not leaking the path to the original vocab file."""
    path = self._do_export(["d", "ef", "abc", "xy"],
                           do_lower_case=True,
                           use_sp_model=False)
    with tf.io.gfile.GFile(os.path.join(path, "saved_model.pb"), "rb") as f:
      self.assertFalse(  # pylint: disable=g-generic-assert
          _STRING_NOT_TO_LEAK.encode("ascii") in f.read())

  @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
  def test_exported_callables(self, use_sp_model):
    preprocess = tf.saved_model.load(
        self._do_export(
            ["d", "ef", "abc", "xy"],
            do_lower_case=True,
            # TODO(b/181866850): drop this.
            tokenize_with_offsets=not use_sp_model,
            # TODO(b/175369555): drop this.
            experimental_disable_assert=True,
            use_sp_model=use_sp_model))

    def fold_dim(rt):
      """Removes the word/subword distinction of BertTokenizer."""
      return rt if use_sp_model else rt.merge_dims(1, 2)

    # .tokenize()
    inputs = tf.constant(["abc d ef", "ABC D EF d"])
    token_ids = preprocess.tokenize(inputs)
    self.assertAllEqual(
        fold_dim(token_ids), tf.ragged.constant([[6, 4, 5], [6, 4, 5, 4]]))

    special_tokens_dict = {
        k: v.numpy().item()  # Expecting eager Tensor, converting to Python.
        for k, v in preprocess.tokenize.get_special_tokens_dict().items()
    }
    self.assertDictEqual(
        special_tokens_dict,
        dict(
            padding_id=0,
            start_of_sequence_id=2,
            end_of_segment_id=3,
            vocab_size=4 + 6 if use_sp_model else 4 + 4))

    # .tokenize_with_offsets()
    if use_sp_model:
      # TODO(b/181866850): Enable tokenize_with_offsets when it works and test.
      self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))
    else:
      token_ids, start_offsets, limit_offsets = (
          preprocess.tokenize_with_offsets(inputs))
      self.assertAllEqual(
          fold_dim(token_ids), tf.ragged.constant([[6, 4, 5], [6, 4, 5, 4]]))
      self.assertAllEqual(
          fold_dim(start_offsets),
          tf.ragged.constant([[0, 4, 6], [0, 4, 6, 9]]))
      self.assertAllEqual(
          fold_dim(limit_offsets),
          tf.ragged.constant([[3, 5, 8], [3, 5, 8, 10]]))
      self.assertIs(preprocess.tokenize.get_special_tokens_dict,
                    preprocess.tokenize_with_offsets.get_special_tokens_dict)

    # Root callable.
    bert_inputs = preprocess(inputs)
    self.assertAllEqual(bert_inputs["input_word_ids"].shape.as_list(), [2, 128])
    self.assertAllEqual(
        bert_inputs["input_word_ids"][:, :10],
        tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
                     [2, 6, 4, 5, 4, 3, 0, 0, 0, 0]]))
    self.assertAllEqual(bert_inputs["input_mask"].shape.as_list(), [2, 128])
    self.assertAllEqual(
        bert_inputs["input_mask"][:, :10],
        tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
                     [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]))
    self.assertAllEqual(bert_inputs["input_type_ids"].shape.as_list(), [2, 128])
    self.assertAllEqual(
        bert_inputs["input_type_ids"][:, :10],
        tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

    # .bert_pack_inputs()
    inputs_2 = tf.constant(["d xy", "xy abc"])
    token_ids_2 = preprocess.tokenize(inputs_2)
    bert_inputs = preprocess.bert_pack_inputs([token_ids, token_ids_2],
                                              seq_length=256)
    self.assertAllEqual(bert_inputs["input_word_ids"].shape.as_list(), [2, 256])
    self.assertAllEqual(
        bert_inputs["input_word_ids"][:, :10],
        tf.constant([[2, 6, 4, 5, 3, 4, 7, 3, 0, 0],
                     [2, 6, 4, 5, 4, 3, 7, 6, 3, 0]]))
    self.assertAllEqual(bert_inputs["input_mask"].shape.as_list(), [2, 256])
    self.assertAllEqual(
        bert_inputs["input_mask"][:, :10],
        tf.constant([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
                     [1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]))
    self.assertAllEqual(bert_inputs["input_type_ids"].shape.as_list(), [2, 256])
    self.assertAllEqual(
        bert_inputs["input_type_ids"][:, :10],
        tf.constant([[0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
                     [0, 0, 0, 0, 0, 0, 1, 1, 1, 0]]))

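  # In deployed pipelines, the preprocessing export tested here is typically
  # chained with an encoder export (a sketch with assumed paths, mirroring the
  # TF Hub BERT usage pattern; not executed by this test):
  #
  #   sentences = tf_keras.layers.Input(shape=(), dtype=tf.string)
  #   encoder_inputs = hub.KerasLayer(preprocess_path)(sentences)
  #   outputs = hub.KerasLayer(encoder_path, trainable=True)(encoder_inputs)
  #   embedding = outputs["pooled_output"]
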
  # For BertTokenizer only: repeat relevant parts for do_lower_case=False,
  # default_seq_length=10, experimental_disable_assert=False,
  # tokenize_with_offsets=False, and without folding the word/subword dimension.
  def test_cased_length10(self):
    preprocess = tf.saved_model.load(
        self._do_export(["d", "##ef", "abc", "ABC"],
                        do_lower_case=False,
                        default_seq_length=10,
                        tokenize_with_offsets=False,
                        use_sp_model=False,
                        experimental_disable_assert=False))
    inputs = tf.constant(["abc def", "ABC DEF"])
    token_ids = preprocess.tokenize(inputs)
    self.assertAllEqual(token_ids,
                        tf.ragged.constant([[[6], [4, 5]], [[7], [1]]]))

    self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))

    bert_inputs = preprocess(inputs)
    self.assertAllEqual(
        bert_inputs["input_word_ids"],
        tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
                     [2, 7, 1, 3, 0, 0, 0, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_mask"],
        tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
                     [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_type_ids"],
        tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

    inputs_2 = tf.constant(["d ABC", "ABC abc"])
    token_ids_2 = preprocess.tokenize(inputs_2)
    bert_inputs = preprocess.bert_pack_inputs([token_ids, token_ids_2])
    # Test default seq_length=10.
    self.assertAllEqual(
        bert_inputs["input_word_ids"],
        tf.constant([[2, 6, 4, 5, 3, 4, 7, 3, 0, 0],
                     [2, 7, 1, 3, 7, 6, 3, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_mask"],
        tf.constant([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
                     [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_type_ids"],
        tf.constant([[0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
                     [0, 0, 0, 0, 1, 1, 1, 0, 0, 0]]))

  # XLA requires fixed shapes for tensors found in graph mode.
  # Statically known shapes in Python are a particularly firm way to
  # guarantee that, and they are generally more convenient to work with.
  # We test that the exported SavedModel plays well with TF's shape
  # inference when applied to fully or partially known input shapes.
  @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
  def test_shapes(self, use_sp_model):
    preprocess = tf.saved_model.load(
        self._do_export(
            ["abc", "def"],
            do_lower_case=True,
            # TODO(b/181866850): drop this.
            tokenize_with_offsets=not use_sp_model,
            # TODO(b/175369555): drop this.
            experimental_disable_assert=True,
            use_sp_model=use_sp_model))

    def expected_bert_input_shapes(batch_size, seq_length):
      return dict(
          input_word_ids=[batch_size, seq_length],
          input_mask=[batch_size, seq_length],
          input_type_ids=[batch_size, seq_length])

    for batch_size in [7, None]:
      if use_sp_model:
        token_out_shape = [batch_size, None]  # No word/subword distinction.
      else:
        token_out_shape = [batch_size, None, None]
      self.assertEqual(
          _result_shapes_in_tf_function(preprocess.tokenize,
                                        tf.TensorSpec([batch_size], tf.string)),
          token_out_shape, "with batch_size=%s" % batch_size)
      # TODO(b/181866850): Enable tokenize_with_offsets when it works and test.
      if use_sp_model:
        self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))
      else:
        self.assertEqual(
            _result_shapes_in_tf_function(
                preprocess.tokenize_with_offsets,
                tf.TensorSpec([batch_size], tf.string)), [token_out_shape] * 3,
            "with batch_size=%s" % batch_size)
      self.assertEqual(
          _result_shapes_in_tf_function(
              preprocess.bert_pack_inputs,
              [tf.RaggedTensorSpec([batch_size, None, None], tf.int32)] * 2,
              seq_length=256), expected_bert_input_shapes(batch_size, 256),
          "with batch_size=%s" % batch_size)
      self.assertEqual(
          _result_shapes_in_tf_function(preprocess,
                                        tf.TensorSpec([batch_size], tf.string)),
          expected_bert_input_shapes(batch_size, 128),
          "with batch_size=%s" % batch_size)

  @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
  def test_reexport(self, use_sp_model):
    """Test that preprocess keeps working after another save/load cycle."""
    path1 = self._do_export(
        ["d", "ef", "abc", "xy"],
        do_lower_case=True,
        default_seq_length=10,
        tokenize_with_offsets=False,
        experimental_disable_assert=True,  # TODO(b/175369555): drop this.
        use_sp_model=use_sp_model)
    path2 = path1.rstrip("/") + ".2"
    model1 = tf.saved_model.load(path1)
    tf.saved_model.save(model1, path2)
    # Delete the first SavedModel to test that the second one loads by itself.
    # https://github.com/tensorflow/tensorflow/issues/46456 reports such a
    # failure case for BertTokenizer.
    tf.io.gfile.rmtree(path1)
    model2 = tf.saved_model.load(path2)

    inputs = tf.constant(["abc d ef", "ABC D EF d"])
    bert_inputs = model2(inputs)
    self.assertAllEqual(
        bert_inputs["input_word_ids"],
        tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
                     [2, 6, 4, 5, 4, 3, 0, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_mask"],
        tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
                     [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_type_ids"],
        tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

  @parameterized.named_parameters(("Bert", True), ("Albert", False))
  def test_preprocessing_for_mlm(self, use_bert):
    """Combines both SavedModel types and TF.text helpers for MLM."""
    # Create the preprocessing SavedModel with a [MASK] token.
    non_special_tokens = [
        "hello", "world", "nice", "movie", "great", "actors", "quick", "fox",
        "lazy", "dog"
    ]

    preprocess = tf.saved_model.load(
        self._do_export(
            non_special_tokens,
            do_lower_case=True,
            tokenize_with_offsets=use_bert,  # TODO(b/181866850): drop this.
            experimental_disable_assert=True,  # TODO(b/175369555): drop this.
            add_mask_token=True,
            use_sp_model=not use_bert))
    vocab_size = len(non_special_tokens) + (5 if use_bert else 7)

    # Create the encoder SavedModel with an .mlm subobject.
    hidden_size = 16
    num_hidden_layers = 2
    bert_config, encoder_config = _get_bert_config_or_encoder_config(
        use_bert_config=use_bert,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        vocab_size=vocab_size)
    _, pretrainer = export_tfhub_lib._create_model(
        bert_config=bert_config, encoder_config=encoder_config, with_mlm=True)
    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
    checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
    model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
    vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(  # Not used below.
        self.get_temp_dir(), use_sp_model=not use_bert)
    encoder_export_path = os.path.join(self.get_temp_dir(), "encoder_export")
    export_tfhub_lib.export_model(
        export_path=encoder_export_path,
        bert_config=bert_config,
        encoder_config=encoder_config,
        model_checkpoint_path=model_checkpoint_path,
        with_mlm=True,
        vocab_file=vocab_file,
        sp_model_file=sp_model_file,
        do_lower_case=True)
    encoder = tf.saved_model.load(encoder_export_path)

    # Get special tokens from the vocab (and vocab size).
    special_tokens_dict = preprocess.tokenize.get_special_tokens_dict()
    self.assertEqual(int(special_tokens_dict["vocab_size"]), vocab_size)
    padding_id = int(special_tokens_dict["padding_id"])
    self.assertEqual(padding_id, 0)
    start_of_sequence_id = int(special_tokens_dict["start_of_sequence_id"])
    self.assertEqual(start_of_sequence_id, 2)
    end_of_segment_id = int(special_tokens_dict["end_of_segment_id"])
    self.assertEqual(end_of_segment_id, 3)
    mask_id = int(special_tokens_dict["mask_id"])
    self.assertEqual(mask_id, 4)

    # A batch of 3 segment pairs.
    raw_segments = [
        tf.constant(["hello", "nice movie", "quick fox"]),
        tf.constant(["world", "great actors", "lazy dog"])
    ]
    batch_size = 3

    # Misc hyperparameters.
    seq_length = 10
    max_selections_per_seq = 2

    # Tokenize inputs.
    tokenized_segments = [preprocess.tokenize(s) for s in raw_segments]
    # Trim inputs to eventually fit seq_length.
    num_special_tokens = len(raw_segments) + 1
    trimmed_segments = text.WaterfallTrimmer(
        seq_length - num_special_tokens).trim(tokenized_segments)
    # Combine input segments into one input sequence.
    input_ids, segment_ids = text.combine_segments(
        trimmed_segments,
        start_of_sequence_id=start_of_sequence_id,
        end_of_segment_id=end_of_segment_id)
    # Apply random masking controlled by policy objects.
    (masked_input_ids, masked_lm_positions,
     masked_ids) = text.mask_language_model(
         input_ids=input_ids,
         item_selector=text.RandomItemSelector(
             max_selections_per_seq,
             selection_rate=0.5,  # Adjusted for the short test examples.
             unselectable_ids=[start_of_sequence_id, end_of_segment_id]),
         mask_values_chooser=text.MaskValuesChooser(
             vocab_size=vocab_size,
             mask_token=mask_id,
             # Always put [MASK] to have a predictable result.
             mask_token_rate=1.0,
             random_token_rate=0.0))
    # Pad to fixed-length Transformer encoder inputs.
    input_word_ids, _ = text.pad_model_inputs(
        masked_input_ids, seq_length, pad_value=padding_id)
    input_type_ids, input_mask = text.pad_model_inputs(
        segment_ids, seq_length, pad_value=0)
    masked_lm_positions, _ = text.pad_model_inputs(
        masked_lm_positions, max_selections_per_seq, pad_value=0)
    masked_lm_positions = tf.cast(masked_lm_positions, tf.int32)
    num_predictions = int(tf.shape(masked_lm_positions)[1])

    # Test transformer inputs.
    self.assertEqual(num_predictions, max_selections_per_seq)
    expected_word_ids = np.array([
        # [CLS] hello [SEP] world [SEP]
        [2, 5, 3, 6, 3, 0, 0, 0, 0, 0],
        # [CLS] nice movie [SEP] great actors [SEP]
        [2, 7, 8, 3, 9, 10, 3, 0, 0, 0],
        # [CLS] quick fox [SEP] lazy dog [SEP]
        [2, 11, 12, 3, 13, 14, 3, 0, 0, 0]
    ])
    for i in range(batch_size):
      for j in range(num_predictions):
        k = int(masked_lm_positions[i, j])
        if k != 0:
          expected_word_ids[i, k] = 4  # [MASK]
    self.assertAllEqual(input_word_ids, expected_word_ids)

    # Call the MLM head of the Transformer encoder.
    mlm_inputs = dict(
        input_word_ids=input_word_ids,
        input_mask=input_mask,
        input_type_ids=input_type_ids,
        masked_lm_positions=masked_lm_positions,
    )
    mlm_outputs = encoder.mlm(mlm_inputs)
    self.assertEqual(mlm_outputs["pooled_output"].shape,
                     (batch_size, hidden_size))
    self.assertEqual(mlm_outputs["sequence_output"].shape,
                     (batch_size, seq_length, hidden_size))
    self.assertEqual(mlm_outputs["mlm_logits"].shape,
                     (batch_size, num_predictions, vocab_size))
    self.assertLen(mlm_outputs["encoder_outputs"], num_hidden_layers)

    # A real trainer would now compute the loss of mlm_logits
    # trying to predict the masked_ids.
    del masked_ids  # Unused.
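    # One way that loss could look, sketched (illustrative only; it would need
    # masked_ids, which this test discards above):
    #
    #   labels, label_weights = text.pad_model_inputs(
    #       masked_ids, max_selections_per_seq, pad_value=0)
    #   per_example_loss = tf_keras.losses.sparse_categorical_crossentropy(
    #       labels, mlm_outputs["mlm_logits"], from_logits=True)
    #   weights = tf.cast(label_weights, tf.float32)
    #   loss = (tf.reduce_sum(per_example_loss * weights) /
    #           tf.reduce_sum(weights))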

  @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
  def test_special_tokens_in_estimator(self, use_sp_model):
    """Tests getting special tokens without an Eager init context."""
    preprocess_export_path = self._do_export(["d", "ef", "abc", "xy"],
                                             do_lower_case=True,
                                             use_sp_model=use_sp_model,
                                             tokenize_with_offsets=False)

    def _get_special_tokens_dict(obj):
      """Returns special tokens of restored tokenizer as Python values."""
      if tf.executing_eagerly():
        special_tokens_numpy = {
            k: v.numpy()
            for k, v in obj.get_special_tokens_dict().items()
        }
      else:
        with tf.Graph().as_default():
          # This code expects `get_special_tokens_dict()` to be a tf.function
          # with no dependencies (bound args) from the context it was loaded
          # in, and boldly assumes that it can just be called in a different
          # context.
          special_tokens_tensors = obj.get_special_tokens_dict()
          with tf.compat.v1.Session() as sess:
            special_tokens_numpy = sess.run(special_tokens_tensors)
      return {
          k: v.item()  # Numpy to Python.
          for k, v in special_tokens_numpy.items()
      }

    def input_fn():
      self.assertFalse(tf.executing_eagerly())
      # Build a preprocessing Model.
      sentences = tf_keras.layers.Input(shape=[], dtype=tf.string)
      preprocess = tf.saved_model.load(preprocess_export_path)
      tokenize = hub.KerasLayer(preprocess.tokenize)
      special_tokens_dict = _get_special_tokens_dict(tokenize.resolved_object)
      for k, v in special_tokens_dict.items():
        self.assertIsInstance(v, int, "Unexpected type for {}".format(k))
      tokens = tokenize(sentences)
      packed_inputs = layers.BertPackInputs(
          4, special_tokens_dict=special_tokens_dict)(
              tokens)
      preprocessing = tf_keras.Model(sentences, packed_inputs)
      # Map the dataset.
      ds = tf.data.Dataset.from_tensors(
          (tf.constant(["abc", "D EF"]), tf.constant([0, 1])))
      ds = ds.map(lambda features, labels: (preprocessing(features), labels))
      return ds

    def model_fn(features, labels, mode):
      del labels  # Unused.
      return tf_estimator.EstimatorSpec(
          mode=mode, predictions=features["input_word_ids"])

    estimator = tf_estimator.Estimator(model_fn=model_fn)
    outputs = list(estimator.predict(input_fn))
    self.assertAllEqual(outputs, np.array([[2, 6, 3, 0], [2, 4, 5, 3]]))

  # TODO(b/175369555): Remove that code and its test.
  @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
  def test_check_no_assert(self, use_sp_model):
    """Tests the self-check during export without assertions."""
    preprocess_export_path = self._do_export(["d", "ef", "abc", "xy"],
                                             do_lower_case=True,
                                             use_sp_model=use_sp_model,
                                             tokenize_with_offsets=False,
                                             experimental_disable_assert=False)
    with self.assertRaisesRegex(AssertionError,
                                r"failed to suppress \d+ Assert ops"):
      export_tfhub_lib._check_no_assert(preprocess_export_path)

def _result_shapes_in_tf_function(fn, *args, **kwargs):
  """Returns shapes (as lists) observed on the result of `fn`.

  Args:
    fn: A callable.
    *args: TensorSpecs for Tensor-valued arguments and actual values for
      Python-valued arguments to fn.
    **kwargs: Same for keyword arguments.

  Returns:
    The nest of partial tensor shapes (as lists) that is statically known
    inside tf.function(fn)(*args, **kwargs) for the nest of its results.
  """
  # Use a captured mutable container for a side output from the wrapper.
  uninitialized = "uninitialized!"
  result_shapes_container = [uninitialized]
  assert result_shapes_container[0] is uninitialized

  @tf.function
  def shape_reporting_wrapper(*args, **kwargs):
    result = fn(*args, **kwargs)
    result_shapes_container[0] = tf.nest.map_structure(
        lambda x: x.shape.as_list(), result)
    return result

  shape_reporting_wrapper.get_concrete_function(*args, **kwargs)
  assert result_shapes_container[0] is not uninitialized
  return result_shapes_container[0]
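
# Example (illustrative only): for a function that preserves its input shape,
#
#   _result_shapes_in_tf_function(
#       tf.strings.length, tf.TensorSpec([None], tf.string))
#
# would return [None], the static shape known during tracing.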


if __name__ == "__main__":
  tf.test.main()