Pradeep Kumar committed on
Commit f891af0 · verified · 1 Parent(s): 69d32b7

Delete export_tfhub_lib_test.py

Files changed (1)
  1. export_tfhub_lib_test.py +0 -1080
export_tfhub_lib_test.py DELETED
@@ -1,1080 +0,0 @@
# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests export_tfhub_lib."""

import os
import tempfile

from absl.testing import parameterized
import numpy as np
import tensorflow as tf, tf_keras
from tensorflow import estimator as tf_estimator
import tensorflow_hub as hub
import tensorflow_text as text

from sentencepiece import SentencePieceTrainer
from official.legacy.bert import configs
from official.modeling import tf_utils
from official.nlp.configs import encoders
from official.nlp.modeling import layers
from official.nlp.modeling import models
from official.nlp.tools import export_tfhub_lib

def _get_bert_config_or_encoder_config(use_bert_config,
                                       hidden_size,
                                       num_hidden_layers,
                                       encoder_type="albert",
                                       vocab_size=100):
  """Generates config args for export_tfhub_lib._create_model().

  Args:
    use_bert_config: bool. If True, returns legacy BertConfig.
    hidden_size: int.
    num_hidden_layers: int.
    encoder_type: str. Can be ['albert', 'bert', 'bert_v2']. If use_bert_config
      == True, then encoder_type is not used.
    vocab_size: int.

  Returns:
    bert_config, encoder_config. Exactly one of them is not None. If
    `use_bert_config` == True, the first config is valid. Otherwise
    `bert_config` == None.
  """
  if use_bert_config:
    bert_config = configs.BertConfig(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        intermediate_size=32,
        max_position_embeddings=128,
        num_attention_heads=2,
        num_hidden_layers=num_hidden_layers)
    encoder_config = None
  else:
    bert_config = None
    if encoder_type == "albert":
      encoder_config = encoders.EncoderConfig(
          type="albert",
          albert=encoders.AlbertEncoderConfig(
              vocab_size=vocab_size,
              embedding_width=16,
              hidden_size=hidden_size,
              intermediate_size=32,
              max_position_embeddings=128,
              num_attention_heads=2,
              num_layers=num_hidden_layers,
              dropout_rate=0.1))
    else:
      # encoder_type can be 'bert' or 'bert_v2'.
      model_config = encoders.BertEncoderConfig(
          vocab_size=vocab_size,
          embedding_size=16,
          hidden_size=hidden_size,
          intermediate_size=32,
          max_position_embeddings=128,
          num_attention_heads=2,
          num_layers=num_hidden_layers,
          dropout_rate=0.1)
      kwargs = {"type": encoder_type, encoder_type: model_config}
      encoder_config = encoders.EncoderConfig(**kwargs)

  return bert_config, encoder_config

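# For orientation, a usage sketch of the helper above (illustrative values,
# not part of the original test suite):
#
#   # Legacy BERT path: only bert_config is set.
#   bert_config, _ = _get_bert_config_or_encoder_config(
#       True, hidden_size=16, num_hidden_layers=1)
#   # EncoderConfig path: only encoder_config is set.
#   _, encoder_config = _get_bert_config_or_encoder_config(
#       False, hidden_size=16, num_hidden_layers=1, encoder_type="bert_v2")
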
def _get_vocab_or_sp_model_dummy(temp_dir, use_sp_model):
  """Returns tokenizer asset args for export_tfhub_lib.export_model()."""
  dummy_file = os.path.join(temp_dir, "dummy_file.txt")
  with tf.io.gfile.GFile(dummy_file, "w") as f:
    f.write("dummy content")
  if use_sp_model:
    vocab_file, sp_model_file = None, dummy_file
  else:
    vocab_file, sp_model_file = dummy_file, None
  return vocab_file, sp_model_file


def _read_asset(asset: tf.saved_model.Asset):
  return tf.io.gfile.GFile(asset.asset_path.numpy()).read()


def _find_lambda_layers(layer):
  """Returns list of all Lambda layers in a Keras model."""
  if isinstance(layer, tf_keras.layers.Lambda):
    return [layer]
  elif hasattr(layer, "layers"):  # It's nested, like a Model.
    result = []
    for l in layer.layers:
      result += _find_lambda_layers(l)
    return result
  else:
    return []

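# A minimal sketch of what _find_lambda_layers detects (hypothetical model,
# not used by the tests below):
#
#   model = tf_keras.Sequential([tf_keras.layers.Lambda(lambda x: x * 2)])
#   assert len(_find_lambda_layers(model)) == 1  # Would flag the export.
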
class ExportModelTest(tf.test.TestCase, parameterized.TestCase):
  """Tests exporting a Transformer Encoder model as a SavedModel.

  This covers export from an Encoder checkpoint to a SavedModel without
  the .mlm subobject. This is no longer preferred, but still useful
  for models like Electra that are trained without the MLM task.

  The export code is generic. This test focuses on two main cases
  (the most important ones in practice when this was written in 2020):
  - BERT built from a legacy BertConfig, for use with BertTokenizer.
  - ALBERT built from an EncoderConfig (as a representative of all other
    choices beyond BERT), for use with SentencepieceTokenizer (the one
    alternative to BertTokenizer).
  """

  @parameterized.named_parameters(
      ("Bert_Legacy", True, None), ("Albert", False, "albert"),
      ("BertEncoder", False, "bert"), ("BertEncoderV2", False, "bert_v2"))
  def test_export_model(self, use_bert, encoder_type):
    # Create the encoder and export it.
    hidden_size = 16
    num_hidden_layers = 1
    bert_config, encoder_config = _get_bert_config_or_encoder_config(
        use_bert,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        encoder_type=encoder_type)
    bert_model, encoder = export_tfhub_lib._create_model(
        bert_config=bert_config, encoder_config=encoder_config, with_mlm=False)
    self.assertEmpty(
        _find_lambda_layers(bert_model),
        "Lambda layers are non-portable since they serialize Python bytecode.")
    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
    checkpoint = tf.train.Checkpoint(encoder=encoder)
    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
    model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)

    vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
        self.get_temp_dir(), use_sp_model=not use_bert)
    export_path = os.path.join(self.get_temp_dir(), "hub")
    export_tfhub_lib.export_model(
        export_path=export_path,
        bert_config=bert_config,
        encoder_config=encoder_config,
        model_checkpoint_path=model_checkpoint_path,
        with_mlm=False,
        vocab_file=vocab_file,
        sp_model_file=sp_model_file,
        do_lower_case=True)

    # Restore the exported model.
    hub_layer = hub.KerasLayer(export_path, trainable=True)

    # Check legacy tokenization data.
    if use_bert:
      self.assertTrue(hub_layer.resolved_object.do_lower_case.numpy())
      self.assertEqual("dummy content",
                       _read_asset(hub_layer.resolved_object.vocab_file))
      self.assertFalse(hasattr(hub_layer.resolved_object, "sp_model_file"))
    else:
      self.assertFalse(hasattr(hub_layer.resolved_object, "do_lower_case"))
      self.assertFalse(hasattr(hub_layer.resolved_object, "vocab_file"))
      self.assertEqual("dummy content",
                       _read_asset(hub_layer.resolved_object.sp_model_file))

    # Check restored weights.
    self.assertEqual(
        len(bert_model.trainable_weights), len(hub_layer.trainable_weights))
    for source_weight, hub_weight in zip(bert_model.trainable_weights,
                                         hub_layer.trainable_weights):
      self.assertAllClose(source_weight.numpy(), hub_weight.numpy())

    # Check computation.
    seq_length = 10
    dummy_ids = np.zeros((2, seq_length), dtype=np.int32)
    input_dict = dict(
        input_word_ids=dummy_ids,
        input_mask=dummy_ids,
        input_type_ids=dummy_ids)
    hub_output = hub_layer(input_dict)
    source_output = bert_model(input_dict)
    encoder_output = encoder(input_dict)
    self.assertEqual(hub_output["pooled_output"].shape, (2, hidden_size))
    self.assertEqual(hub_output["sequence_output"].shape,
                     (2, seq_length, hidden_size))
    self.assertLen(hub_output["encoder_outputs"], num_hidden_layers)

    for key in ("pooled_output", "sequence_output", "encoder_outputs"):
      self.assertAllClose(source_output[key], hub_output[key])
      self.assertAllClose(source_output[key], encoder_output[key])

    # The "default" output of BERT as a text representation is pooled_output.
    self.assertAllClose(hub_output["pooled_output"], hub_output["default"])

    # Test that training=True makes a difference (activates dropout).
    def _dropout_mean_stddev(training, num_runs=20):
      input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
      input_dict = dict(
          input_word_ids=input_ids,
          input_mask=np.ones_like(input_ids),
          input_type_ids=np.zeros_like(input_ids))
      outputs = np.concatenate([
          hub_layer(input_dict, training=training)["pooled_output"]
          for _ in range(num_runs)
      ])
      return np.mean(np.std(outputs, axis=0))

    self.assertLess(_dropout_mean_stddev(training=False), 1e-6)
    self.assertGreater(_dropout_mean_stddev(training=True), 1e-3)

    # Test propagation of seq_length in shape inference.
    input_word_ids = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_mask = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_type_ids = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_dict = dict(
        input_word_ids=input_word_ids,
        input_mask=input_mask,
        input_type_ids=input_type_ids)
    output_dict = hub_layer(input_dict)
    pooled_output = output_dict["pooled_output"]
    sequence_output = output_dict["sequence_output"]
    encoder_outputs = output_dict["encoder_outputs"]

    self.assertEqual(pooled_output.shape.as_list(), [None, hidden_size])
    self.assertEqual(sequence_output.shape.as_list(),
                     [None, seq_length, hidden_size])
    self.assertLen(encoder_outputs, num_hidden_layers)

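# For context, a typical consumer of such an export builds a classifier on
# pooled_output (a sketch under assumed input shapes; not part of this test):
#
#   inputs = dict(
#       input_word_ids=tf_keras.layers.Input(shape=(128,), dtype=tf.int32),
#       input_mask=tf_keras.layers.Input(shape=(128,), dtype=tf.int32),
#       input_type_ids=tf_keras.layers.Input(shape=(128,), dtype=tf.int32))
#   outputs = hub.KerasLayer(export_path, trainable=True)(inputs)
#   logits = tf_keras.layers.Dense(2)(outputs["pooled_output"])
#   model = tf_keras.Model(inputs, logits)
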
class ExportModelWithMLMTest(tf.test.TestCase, parameterized.TestCase):
  """Tests exporting a Transformer Encoder model as a SavedModel.

  This covers export from a Pretrainer checkpoint to a SavedModel including
  the .mlm subobject, which is the preferred way since 2020.

  The export code is generic. This test focuses on two main cases
  (the most important ones in practice when this was written in 2020):
  - BERT built from a legacy BertConfig, for use with BertTokenizer.
  - ALBERT built from an EncoderConfig (as a representative of all other
    choices beyond BERT), for use with SentencepieceTokenizer (the one
    alternative to BertTokenizer).
  """

  def test_copy_pooler_dense_to_encoder(self):
    encoder_config = encoders.EncoderConfig(
        type="bert",
        bert=encoders.BertEncoderConfig(
            hidden_size=24, intermediate_size=48, num_layers=2))
    cls_heads = [
        layers.ClassificationHead(
            inner_dim=24, num_classes=2, name="next_sentence")
    ]
    encoder = encoders.build_encoder(encoder_config)
    pretrainer = models.BertPretrainerV2(
        encoder_network=encoder,
        classification_heads=cls_heads,
        mlm_activation=tf_utils.get_activation(
            encoder_config.get().hidden_activation))
    # Makes sure the pretrainer variables are created.
    _ = pretrainer(pretrainer.inputs)
    checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))

    vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
        self.get_temp_dir(), use_sp_model=True)
    export_path = os.path.join(self.get_temp_dir(), "hub")
    export_tfhub_lib.export_model(
        export_path=export_path,
        encoder_config=encoder_config,
        model_checkpoint_path=tf.train.latest_checkpoint(model_checkpoint_dir),
        with_mlm=True,
        copy_pooler_dense_to_encoder=True,
        vocab_file=vocab_file,
        sp_model_file=sp_model_file,
        do_lower_case=True)
    # Restores a hub KerasLayer.
    hub_layer = hub.KerasLayer(export_path, trainable=True)
    dummy_ids = np.zeros((2, 10), dtype=np.int32)
    input_dict = dict(
        input_word_ids=dummy_ids,
        input_mask=dummy_ids,
        input_type_ids=dummy_ids)
    hub_pooled_output = hub_layer(input_dict)["pooled_output"]
    encoder_outputs = encoder(input_dict)
    # Verify that hub_layer's pooled_output is the same as the output of next
    # sentence prediction's dense layer.
    pretrained_pooled_output = cls_heads[0].dense(
        (encoder_outputs["sequence_output"][:, 0, :]))
    self.assertAllClose(hub_pooled_output, pretrained_pooled_output)
    # But the pooled_output between encoder and hub_layer are not the same.
    encoder_pooled_output = encoder_outputs["pooled_output"]
    self.assertNotAllClose(hub_pooled_output, encoder_pooled_output)

  @parameterized.named_parameters(
      ("Bert", True),
      ("Albert", False),
  )
  def test_export_model_with_mlm(self, use_bert):
    # Create the encoder and export it.
    hidden_size = 16
    num_hidden_layers = 2
    bert_config, encoder_config = _get_bert_config_or_encoder_config(
        use_bert, hidden_size, num_hidden_layers)
    bert_model, pretrainer = export_tfhub_lib._create_model(
        bert_config=bert_config, encoder_config=encoder_config, with_mlm=True)
    self.assertEmpty(
        _find_lambda_layers(bert_model),
        "Lambda layers are non-portable since they serialize Python bytecode.")
    bert_model_with_mlm = bert_model.mlm
    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")

    checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)

    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
    model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)

    vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
        self.get_temp_dir(), use_sp_model=not use_bert)
    export_path = os.path.join(self.get_temp_dir(), "hub")
    export_tfhub_lib.export_model(
        export_path=export_path,
        bert_config=bert_config,
        encoder_config=encoder_config,
        model_checkpoint_path=model_checkpoint_path,
        with_mlm=True,
        vocab_file=vocab_file,
        sp_model_file=sp_model_file,
        do_lower_case=True)

    # Restore the exported model.
    hub_layer = hub.KerasLayer(export_path, trainable=True)

    # Check legacy tokenization data.
    if use_bert:
      self.assertTrue(hub_layer.resolved_object.do_lower_case.numpy())
      self.assertEqual("dummy content",
                       _read_asset(hub_layer.resolved_object.vocab_file))
      self.assertFalse(hasattr(hub_layer.resolved_object, "sp_model_file"))
    else:
      self.assertFalse(hasattr(hub_layer.resolved_object, "do_lower_case"))
      self.assertFalse(hasattr(hub_layer.resolved_object, "vocab_file"))
      self.assertEqual("dummy content",
                       _read_asset(hub_layer.resolved_object.sp_model_file))

    # Check restored weights.
    # Note that we set `_auto_track_sub_layers` to False when exporting the
    # SavedModel, so hub_layer has the same number of weights as bert_model;
    # otherwise, hub_layer will have extra weights from its `mlm` subobject.
    self.assertEqual(
        len(bert_model.trainable_weights), len(hub_layer.trainable_weights))
    for source_weight, hub_weight in zip(bert_model.trainable_weights,
                                         hub_layer.trainable_weights):
      self.assertAllClose(source_weight, hub_weight)

    # Check computation.
    seq_length = 10
    dummy_ids = np.zeros((2, seq_length), dtype=np.int32)
    input_dict = dict(
        input_word_ids=dummy_ids,
        input_mask=dummy_ids,
        input_type_ids=dummy_ids)
    hub_outputs_dict = hub_layer(input_dict)
    source_outputs_dict = bert_model(input_dict)
    encoder_outputs_dict = pretrainer.encoder_network(
        [dummy_ids, dummy_ids, dummy_ids])
    self.assertEqual(hub_outputs_dict["pooled_output"].shape, (2, hidden_size))
    self.assertEqual(hub_outputs_dict["sequence_output"].shape,
                     (2, seq_length, hidden_size))
    for output_key in ("pooled_output", "sequence_output", "encoder_outputs"):
      self.assertAllClose(source_outputs_dict[output_key],
                          hub_outputs_dict[output_key])
      self.assertAllClose(source_outputs_dict[output_key],
                          encoder_outputs_dict[output_key])

    # The "default" output of BERT as a text representation is pooled_output.
    self.assertAllClose(hub_outputs_dict["pooled_output"],
                        hub_outputs_dict["default"])

    # Test that training=True makes a difference (activates dropout).
    def _dropout_mean_stddev(training, num_runs=20):
      input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
      input_dict = dict(
          input_word_ids=input_ids,
          input_mask=np.ones_like(input_ids),
          input_type_ids=np.zeros_like(input_ids))
      outputs = np.concatenate([
          hub_layer(input_dict, training=training)["pooled_output"]
          for _ in range(num_runs)
      ])
      return np.mean(np.std(outputs, axis=0))

    self.assertLess(_dropout_mean_stddev(training=False), 1e-6)
    self.assertGreater(_dropout_mean_stddev(training=True), 1e-3)

    # Checks sub-object `mlm`.
    self.assertTrue(hasattr(hub_layer.resolved_object, "mlm"))

    self.assertLen(hub_layer.resolved_object.mlm.trainable_variables,
                   len(bert_model_with_mlm.trainable_weights))
    self.assertLen(hub_layer.resolved_object.mlm.trainable_variables,
                   len(pretrainer.trainable_weights))
    for source_weight, hub_weight, pretrainer_weight in zip(
        bert_model_with_mlm.trainable_weights,
        hub_layer.resolved_object.mlm.trainable_variables,
        pretrainer.trainable_weights):
      self.assertAllClose(source_weight, hub_weight)
      self.assertAllClose(source_weight, pretrainer_weight)

    max_predictions_per_seq = 4
    mlm_positions = np.zeros((2, max_predictions_per_seq), dtype=np.int32)
    input_dict = dict(
        input_word_ids=dummy_ids,
        input_mask=dummy_ids,
        input_type_ids=dummy_ids,
        masked_lm_positions=mlm_positions)
    hub_mlm_outputs_dict = hub_layer.resolved_object.mlm(input_dict)
    source_mlm_outputs_dict = bert_model_with_mlm(input_dict)
    for output_key in ("pooled_output", "sequence_output", "mlm_logits",
                       "encoder_outputs"):
      self.assertAllClose(hub_mlm_outputs_dict[output_key],
                          source_mlm_outputs_dict[output_key])

    pretrainer_mlm_logits_output = pretrainer(input_dict)["mlm_logits"]
    self.assertAllClose(hub_mlm_outputs_dict["mlm_logits"],
                        pretrainer_mlm_logits_output)

    # Test that training=True makes a difference (activates dropout).
    def _dropout_mean_stddev_mlm(training, num_runs=20):
      input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
      mlm_position_ids = np.array([[1, 2, 3, 4]], np.int32)
      input_dict = dict(
          input_word_ids=input_ids,
          input_mask=np.ones_like(input_ids),
          input_type_ids=np.zeros_like(input_ids),
          masked_lm_positions=mlm_position_ids)
      outputs = np.concatenate([
          hub_layer.resolved_object.mlm(input_dict,
                                        training=training)["pooled_output"]
          for _ in range(num_runs)
      ])
      return np.mean(np.std(outputs, axis=0))

    self.assertLess(_dropout_mean_stddev_mlm(training=False), 1e-6)
    self.assertGreater(_dropout_mean_stddev_mlm(training=True), 1e-3)

    # Test propagation of seq_length in shape inference.
    input_word_ids = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_mask = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_type_ids = tf_keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
    input_dict = dict(
        input_word_ids=input_word_ids,
        input_mask=input_mask,
        input_type_ids=input_type_ids)
    hub_outputs_dict = hub_layer(input_dict)
    self.assertEqual(hub_outputs_dict["pooled_output"].shape.as_list(),
                     [None, hidden_size])
    self.assertEqual(hub_outputs_dict["sequence_output"].shape.as_list(),
                     [None, seq_length, hidden_size])

_STRING_NOT_TO_LEAK = "private_path_component_"


class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):

  def _make_vocab_file(self, vocab, filename="vocab.txt", add_mask_token=False):
    """Creates wordpiece vocab file with given words plus special tokens.

    The tokens of the resulting model are, in this order:
        [PAD], [UNK], [CLS], [SEP], [MASK]*, ...vocab...
    *=if requested by args.

    This function also accepts wordpieces that start with the ## continuation
    marker, but avoiding those makes this function interchangeable with
    _make_sp_model_file(), up to the extra dimension returned by BertTokenizer.

    Args:
      vocab: a list of strings with the words or wordpieces to put into the
        model's vocabulary. Do not include special tokens here.
      filename: Optionally, a filename (relative to the temporary directory
        created by this function).
      add_mask_token: an optional bool, whether to include a [MASK] token.

    Returns:
      The absolute filename of the created vocab file.
    """
    full_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"
                 ] + ["[MASK]"] * add_mask_token + vocab
    path = os.path.join(
        tempfile.mkdtemp(
            dir=self.get_temp_dir(),  # New subdir each time.
            prefix=_STRING_NOT_TO_LEAK),
        filename)
    with tf.io.gfile.GFile(path, "w") as f:
      f.write("\n".join(full_vocab + [""]))
    return path

  def _make_sp_model_file(self, vocab, prefix="spm", add_mask_token=False):
    """Creates Sentencepiece word model with given words plus special tokens.

    The tokens of the resulting model are, in this order:
        <pad>, <unk>, [CLS], [SEP], [MASK]*, ...vocab..., <s>, </s>
    *=if requested by args.

    The words in the input vocab are plain text, without the whitespace marker.
    That makes this function interchangeable with _make_vocab_file().

    Args:
      vocab: a list of strings with the words to put into the model's
        vocabulary. Do not include special tokens here.
      prefix: an optional string, to change the filename prefix for the model
        (relative to the temporary directory created by this function).
      add_mask_token: an optional bool, whether to include a [MASK] token.

    Returns:
      The absolute filename of the created Sentencepiece model file.
    """
    model_prefix = os.path.join(
        tempfile.mkdtemp(dir=self.get_temp_dir()),  # New subdir each time.
        prefix)
    input_file = model_prefix + "_train_input.txt"
    # Create input text for training the sp model from the tokens provided.
    # Repeat tokens, the earlier the more, because they are sorted by frequency.
    input_text = []
    for i, token in enumerate(vocab):
      input_text.append(" ".join([token] * (len(vocab) - i)))
    with tf.io.gfile.GFile(input_file, "w") as f:
      f.write("\n".join(input_text + [""]))
    control_symbols = "[CLS],[SEP]"
    full_vocab_size = len(vocab) + 6  # <pad>, <unk>, [CLS], [SEP], <s>, </s>.
    if add_mask_token:
      control_symbols += ",[MASK]"
      full_vocab_size += 1
    flags = dict(
        model_prefix=model_prefix,
        model_type="word",
        input=input_file,
        pad_id=0,
        unk_id=1,
        control_symbols=control_symbols,
        vocab_size=full_vocab_size,
        bos_id=full_vocab_size - 2,
        eos_id=full_vocab_size - 1)
    SentencePieceTrainer.Train(" ".join(
        ["--{}={}".format(k, v) for k, v in flags.items()]))
    return model_prefix + ".model"

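  # For reference, the model file created above can be read back directly with
  # TF.text (a sketch, assuming `sp_model_path` is the returned filename):
  #
  #   tokenizer = text.SentencepieceTokenizer(
  #       model=tf.io.gfile.GFile(sp_model_path, "rb").read())
  #   token_ids = tokenizer.tokenize(["hello world"])
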
  def _do_export(self,
                 vocab,
                 do_lower_case,
                 default_seq_length=128,
                 tokenize_with_offsets=True,
                 use_sp_model=False,
                 experimental_disable_assert=False,
                 add_mask_token=False):
    """Runs SavedModel export and returns the export_path."""
    export_path = tempfile.mkdtemp(dir=self.get_temp_dir())
    vocab_file = sp_model_file = None
    if use_sp_model:
      sp_model_file = self._make_sp_model_file(
          vocab, add_mask_token=add_mask_token)
    else:
      vocab_file = self._make_vocab_file(vocab, add_mask_token=add_mask_token)
    export_tfhub_lib.export_preprocessing(
        export_path,
        vocab_file=vocab_file,
        sp_model_file=sp_model_file,
        do_lower_case=do_lower_case,
        tokenize_with_offsets=tokenize_with_offsets,
        default_seq_length=default_seq_length,
        experimental_disable_assert=experimental_disable_assert)
    # Invalidate the original filename to verify loading from the SavedModel.
    tf.io.gfile.remove(sp_model_file or vocab_file)
    return export_path

  def test_no_leaks(self):
    """Tests not leaking the path to the original vocab file."""
    path = self._do_export(["d", "ef", "abc", "xy"],
                           do_lower_case=True,
                           use_sp_model=False)
    with tf.io.gfile.GFile(os.path.join(path, "saved_model.pb"), "rb") as f:
      self.assertFalse(  # pylint: disable=g-generic-assert
          _STRING_NOT_TO_LEAK.encode("ascii") in f.read())

  @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
  def test_exported_callables(self, use_sp_model):
    preprocess = tf.saved_model.load(
        self._do_export(
            ["d", "ef", "abc", "xy"],
            do_lower_case=True,
            # TODO(b/181866850): drop this.
            tokenize_with_offsets=not use_sp_model,
            # TODO(b/175369555): drop this.
            experimental_disable_assert=True,
            use_sp_model=use_sp_model))

    def fold_dim(rt):
      """Removes the word/subword distinction of BertTokenizer."""
      return rt if use_sp_model else rt.merge_dims(1, 2)

    # .tokenize()
    inputs = tf.constant(["abc d ef", "ABC D EF d"])
    token_ids = preprocess.tokenize(inputs)
    self.assertAllEqual(
        fold_dim(token_ids), tf.ragged.constant([[6, 4, 5], [6, 4, 5, 4]]))

    special_tokens_dict = {
        k: v.numpy().item()  # Expecting eager Tensor, converting to Python.
        for k, v in preprocess.tokenize.get_special_tokens_dict().items()
    }
    self.assertDictEqual(
        special_tokens_dict,
        dict(
            padding_id=0,
            start_of_sequence_id=2,
            end_of_segment_id=3,
            vocab_size=4 + 6 if use_sp_model else 4 + 4))

    # .tokenize_with_offsets()
    if use_sp_model:
      # TODO(b/181866850): Enable tokenize_with_offsets when it works and test.
      self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))
    else:
      token_ids, start_offsets, limit_offsets = (
          preprocess.tokenize_with_offsets(inputs))
      self.assertAllEqual(
          fold_dim(token_ids), tf.ragged.constant([[6, 4, 5], [6, 4, 5, 4]]))
      self.assertAllEqual(
          fold_dim(start_offsets),
          tf.ragged.constant([[0, 4, 6], [0, 4, 6, 9]]))
      self.assertAllEqual(
          fold_dim(limit_offsets),
          tf.ragged.constant([[3, 5, 8], [3, 5, 8, 10]]))
      self.assertIs(preprocess.tokenize.get_special_tokens_dict,
                    preprocess.tokenize_with_offsets.get_special_tokens_dict)

    # Root callable.
    bert_inputs = preprocess(inputs)
    self.assertAllEqual(bert_inputs["input_word_ids"].shape.as_list(), [2, 128])
    self.assertAllEqual(
        bert_inputs["input_word_ids"][:, :10],
        tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
                     [2, 6, 4, 5, 4, 3, 0, 0, 0, 0]]))
    self.assertAllEqual(bert_inputs["input_mask"].shape.as_list(), [2, 128])
    self.assertAllEqual(
        bert_inputs["input_mask"][:, :10],
        tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
                     [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]))
    self.assertAllEqual(bert_inputs["input_type_ids"].shape.as_list(), [2, 128])
    self.assertAllEqual(
        bert_inputs["input_type_ids"][:, :10],
        tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

    # .bert_pack_inputs()
    inputs_2 = tf.constant(["d xy", "xy abc"])
    token_ids_2 = preprocess.tokenize(inputs_2)
    bert_inputs = preprocess.bert_pack_inputs([token_ids, token_ids_2],
                                              seq_length=256)
    self.assertAllEqual(bert_inputs["input_word_ids"].shape.as_list(), [2, 256])
    self.assertAllEqual(
        bert_inputs["input_word_ids"][:, :10],
        tf.constant([[2, 6, 4, 5, 3, 4, 7, 3, 0, 0],
                     [2, 6, 4, 5, 4, 3, 7, 6, 3, 0]]))
    self.assertAllEqual(bert_inputs["input_mask"].shape.as_list(), [2, 256])
    self.assertAllEqual(
        bert_inputs["input_mask"][:, :10],
        tf.constant([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
                     [1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]))
    self.assertAllEqual(bert_inputs["input_type_ids"].shape.as_list(), [2, 256])
    self.assertAllEqual(
        bert_inputs["input_type_ids"][:, :10],
        tf.constant([[0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
                     [0, 0, 0, 0, 0, 0, 1, 1, 1, 0]]))

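  # In deployed pipelines, the preprocessing export tested here is typically
  # chained with an encoder export (a sketch with assumed paths, mirroring the
  # TF Hub BERT usage pattern; not executed by this test):
  #
  #   sentences = tf_keras.layers.Input(shape=(), dtype=tf.string)
  #   encoder_inputs = hub.KerasLayer(preprocess_path)(sentences)
  #   outputs = hub.KerasLayer(encoder_path, trainable=True)(encoder_inputs)
  #   embedding = outputs["pooled_output"]
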
  # For BertTokenizer only: repeat relevant parts for do_lower_case=False,
  # default_seq_length=10, experimental_disable_assert=False,
  # tokenize_with_offsets=False, and without folding the word/subword dimension.
  def test_cased_length10(self):
    preprocess = tf.saved_model.load(
        self._do_export(["d", "##ef", "abc", "ABC"],
                        do_lower_case=False,
                        default_seq_length=10,
                        tokenize_with_offsets=False,
                        use_sp_model=False,
                        experimental_disable_assert=False))
    inputs = tf.constant(["abc def", "ABC DEF"])
    token_ids = preprocess.tokenize(inputs)
    self.assertAllEqual(token_ids,
                        tf.ragged.constant([[[6], [4, 5]], [[7], [1]]]))

    self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))

    bert_inputs = preprocess(inputs)
    self.assertAllEqual(
        bert_inputs["input_word_ids"],
        tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
                     [2, 7, 1, 3, 0, 0, 0, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_mask"],
        tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
                     [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_type_ids"],
        tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

    inputs_2 = tf.constant(["d ABC", "ABC abc"])
    token_ids_2 = preprocess.tokenize(inputs_2)
    bert_inputs = preprocess.bert_pack_inputs([token_ids, token_ids_2])
    # Test default seq_length=10.
    self.assertAllEqual(
        bert_inputs["input_word_ids"],
        tf.constant([[2, 6, 4, 5, 3, 4, 7, 3, 0, 0],
                     [2, 7, 1, 3, 7, 6, 3, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_mask"],
        tf.constant([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
                     [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_type_ids"],
        tf.constant([[0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
                     [0, 0, 0, 0, 1, 1, 1, 0, 0, 0]]))

  # XLA requires fixed shapes for tensors found in graph mode.
  # Statically known shapes in Python are a particularly firm way to
  # guarantee that, and they are generally more convenient to work with.
  # We test that the exported SavedModel plays well with TF's shape
  # inference when applied to fully or partially known input shapes.
  @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
  def test_shapes(self, use_sp_model):
    preprocess = tf.saved_model.load(
        self._do_export(
            ["abc", "def"],
            do_lower_case=True,
            # TODO(b/181866850): drop this.
            tokenize_with_offsets=not use_sp_model,
            # TODO(b/175369555): drop this.
            experimental_disable_assert=True,
            use_sp_model=use_sp_model))

    def expected_bert_input_shapes(batch_size, seq_length):
      return dict(
          input_word_ids=[batch_size, seq_length],
          input_mask=[batch_size, seq_length],
          input_type_ids=[batch_size, seq_length])

    for batch_size in [7, None]:
      if use_sp_model:
        token_out_shape = [batch_size, None]  # No word/subword distinction.
      else:
        token_out_shape = [batch_size, None, None]
      self.assertEqual(
          _result_shapes_in_tf_function(preprocess.tokenize,
                                        tf.TensorSpec([batch_size], tf.string)),
          token_out_shape, "with batch_size=%s" % batch_size)
      # TODO(b/181866850): Enable tokenize_with_offsets when it works and test.
      if use_sp_model:
        self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))
      else:
        self.assertEqual(
            _result_shapes_in_tf_function(
                preprocess.tokenize_with_offsets,
                tf.TensorSpec([batch_size], tf.string)), [token_out_shape] * 3,
            "with batch_size=%s" % batch_size)
      self.assertEqual(
          _result_shapes_in_tf_function(
              preprocess.bert_pack_inputs,
              [tf.RaggedTensorSpec([batch_size, None, None], tf.int32)] * 2,
              seq_length=256), expected_bert_input_shapes(batch_size, 256),
          "with batch_size=%s" % batch_size)
      self.assertEqual(
          _result_shapes_in_tf_function(preprocess,
                                        tf.TensorSpec([batch_size], tf.string)),
          expected_bert_input_shapes(batch_size, 128),
          "with batch_size=%s" % batch_size)

  @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
  def test_reexport(self, use_sp_model):
    """Test that preprocess keeps working after another save/load cycle."""
    path1 = self._do_export(
        ["d", "ef", "abc", "xy"],
        do_lower_case=True,
        default_seq_length=10,
        tokenize_with_offsets=False,
        experimental_disable_assert=True,  # TODO(b/175369555): drop this.
        use_sp_model=use_sp_model)
    path2 = path1.rstrip("/") + ".2"
    model1 = tf.saved_model.load(path1)
    tf.saved_model.save(model1, path2)
    # Delete the first SavedModel to test that the second one loads by itself.
    # https://github.com/tensorflow/tensorflow/issues/46456 reports such a
    # failure case for BertTokenizer.
    tf.io.gfile.rmtree(path1)
    model2 = tf.saved_model.load(path2)

    inputs = tf.constant(["abc d ef", "ABC D EF d"])
    bert_inputs = model2(inputs)
    self.assertAllEqual(
        bert_inputs["input_word_ids"],
        tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
                     [2, 6, 4, 5, 4, 3, 0, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_mask"],
        tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
                     [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]))
    self.assertAllEqual(
        bert_inputs["input_type_ids"],
        tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

  @parameterized.named_parameters(("Bert", True), ("Albert", False))
  def test_preprocessing_for_mlm(self, use_bert):
    """Combines both SavedModel types and TF.text helpers for MLM."""
    # Create the preprocessing SavedModel with a [MASK] token.
    non_special_tokens = [
        "hello", "world", "nice", "movie", "great", "actors", "quick", "fox",
        "lazy", "dog"
    ]

    preprocess = tf.saved_model.load(
        self._do_export(
            non_special_tokens,
            do_lower_case=True,
            tokenize_with_offsets=use_bert,  # TODO(b/181866850): drop this.
            experimental_disable_assert=True,  # TODO(b/175369555): drop this.
            add_mask_token=True,
            use_sp_model=not use_bert))
    vocab_size = len(non_special_tokens) + (5 if use_bert else 7)

    # Create the encoder SavedModel with an .mlm subobject.
    hidden_size = 16
    num_hidden_layers = 2
    bert_config, encoder_config = _get_bert_config_or_encoder_config(
        use_bert_config=use_bert,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        vocab_size=vocab_size)
    _, pretrainer = export_tfhub_lib._create_model(
        bert_config=bert_config, encoder_config=encoder_config, with_mlm=True)
    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
    checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
    model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
    vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(  # Not used below.
        self.get_temp_dir(), use_sp_model=not use_bert)
    encoder_export_path = os.path.join(self.get_temp_dir(), "encoder_export")
    export_tfhub_lib.export_model(
        export_path=encoder_export_path,
        bert_config=bert_config,
        encoder_config=encoder_config,
        model_checkpoint_path=model_checkpoint_path,
        with_mlm=True,
        vocab_file=vocab_file,
        sp_model_file=sp_model_file,
        do_lower_case=True)
    encoder = tf.saved_model.load(encoder_export_path)

    # Get special tokens from the vocab (and vocab size).
    special_tokens_dict = preprocess.tokenize.get_special_tokens_dict()
    self.assertEqual(int(special_tokens_dict["vocab_size"]), vocab_size)
    padding_id = int(special_tokens_dict["padding_id"])
    self.assertEqual(padding_id, 0)
    start_of_sequence_id = int(special_tokens_dict["start_of_sequence_id"])
    self.assertEqual(start_of_sequence_id, 2)
    end_of_segment_id = int(special_tokens_dict["end_of_segment_id"])
    self.assertEqual(end_of_segment_id, 3)
    mask_id = int(special_tokens_dict["mask_id"])
    self.assertEqual(mask_id, 4)

    # A batch of 3 segment pairs.
    raw_segments = [
        tf.constant(["hello", "nice movie", "quick fox"]),
        tf.constant(["world", "great actors", "lazy dog"])
    ]
    batch_size = 3

    # Misc hyperparameters.
    seq_length = 10
    max_selections_per_seq = 2

    # Tokenize inputs.
    tokenized_segments = [preprocess.tokenize(s) for s in raw_segments]
    # Trim inputs to eventually fit seq_length.
    num_special_tokens = len(raw_segments) + 1
    trimmed_segments = text.WaterfallTrimmer(
        seq_length - num_special_tokens).trim(tokenized_segments)
    # Combine input segments into one input sequence.
    input_ids, segment_ids = text.combine_segments(
        trimmed_segments,
        start_of_sequence_id=start_of_sequence_id,
        end_of_segment_id=end_of_segment_id)
    # Apply random masking controlled by policy objects.
    (masked_input_ids, masked_lm_positions,
     masked_ids) = text.mask_language_model(
         input_ids=input_ids,
         item_selector=text.RandomItemSelector(
             max_selections_per_seq,
             selection_rate=0.5,  # Adjusted for the short test examples.
             unselectable_ids=[start_of_sequence_id, end_of_segment_id]),
         mask_values_chooser=text.MaskValuesChooser(
             vocab_size=vocab_size,
             mask_token=mask_id,
             # Always put [MASK] to have a predictable result.
             mask_token_rate=1.0,
             random_token_rate=0.0))
    # Pad to fixed-length Transformer encoder inputs.
    input_word_ids, _ = text.pad_model_inputs(
        masked_input_ids, seq_length, pad_value=padding_id)
    input_type_ids, input_mask = text.pad_model_inputs(
        segment_ids, seq_length, pad_value=0)
    masked_lm_positions, _ = text.pad_model_inputs(
        masked_lm_positions, max_selections_per_seq, pad_value=0)
    masked_lm_positions = tf.cast(masked_lm_positions, tf.int32)
    num_predictions = int(tf.shape(masked_lm_positions)[1])

    # Test transformer inputs.
    self.assertEqual(num_predictions, max_selections_per_seq)
    expected_word_ids = np.array([
        # [CLS] hello [SEP] world [SEP]
        [2, 5, 3, 6, 3, 0, 0, 0, 0, 0],
        # [CLS] nice movie [SEP] great actors [SEP]
        [2, 7, 8, 3, 9, 10, 3, 0, 0, 0],
        # [CLS] quick fox [SEP] lazy dog [SEP]
        [2, 11, 12, 3, 13, 14, 3, 0, 0, 0]
    ])
    for i in range(batch_size):
      for j in range(num_predictions):
        k = int(masked_lm_positions[i, j])
        if k != 0:
          expected_word_ids[i, k] = 4  # [MASK]
    self.assertAllEqual(input_word_ids, expected_word_ids)

    # Call the MLM head of the Transformer encoder.
    mlm_inputs = dict(
        input_word_ids=input_word_ids,
        input_mask=input_mask,
        input_type_ids=input_type_ids,
        masked_lm_positions=masked_lm_positions,
    )
    mlm_outputs = encoder.mlm(mlm_inputs)
    self.assertEqual(mlm_outputs["pooled_output"].shape,
                     (batch_size, hidden_size))
    self.assertEqual(mlm_outputs["sequence_output"].shape,
                     (batch_size, seq_length, hidden_size))
    self.assertEqual(mlm_outputs["mlm_logits"].shape,
                     (batch_size, num_predictions, vocab_size))
    self.assertLen(mlm_outputs["encoder_outputs"], num_hidden_layers)

    # A real trainer would now compute the loss of mlm_logits
    # trying to predict the masked_ids.
    del masked_ids  # Unused.
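    # One way that loss could look, sketched (illustrative only; it would need
    # masked_ids, which this test discards above):
    #
    #   labels, label_weights = text.pad_model_inputs(
    #       masked_ids, max_selections_per_seq, pad_value=0)
    #   per_example_loss = tf_keras.losses.sparse_categorical_crossentropy(
    #       labels, mlm_outputs["mlm_logits"], from_logits=True)
    #   weights = tf.cast(label_weights, tf.float32)
    #   loss = (tf.reduce_sum(per_example_loss * weights) /
    #           tf.reduce_sum(weights))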

  @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
  def test_special_tokens_in_estimator(self, use_sp_model):
    """Tests getting special tokens without an Eager init context."""
    preprocess_export_path = self._do_export(["d", "ef", "abc", "xy"],
                                             do_lower_case=True,
                                             use_sp_model=use_sp_model,
                                             tokenize_with_offsets=False)

    def _get_special_tokens_dict(obj):
      """Returns special tokens of restored tokenizer as Python values."""
      if tf.executing_eagerly():
        special_tokens_numpy = {
            k: v.numpy()
            for k, v in obj.get_special_tokens_dict().items()
        }
      else:
        with tf.Graph().as_default():
          # This code expects `get_special_tokens_dict()` to be a tf.function
          # with no dependencies (bound args) from the context it was loaded
          # in, and boldly assumes that it can just be called in a different
          # context.
          special_tokens_tensors = obj.get_special_tokens_dict()
          with tf.compat.v1.Session() as sess:
            special_tokens_numpy = sess.run(special_tokens_tensors)
      return {
          k: v.item()  # Numpy to Python.
          for k, v in special_tokens_numpy.items()
      }

    def input_fn():
      self.assertFalse(tf.executing_eagerly())
      # Build a preprocessing Model.
      sentences = tf_keras.layers.Input(shape=[], dtype=tf.string)
      preprocess = tf.saved_model.load(preprocess_export_path)
      tokenize = hub.KerasLayer(preprocess.tokenize)
      special_tokens_dict = _get_special_tokens_dict(tokenize.resolved_object)
      for k, v in special_tokens_dict.items():
        self.assertIsInstance(v, int, "Unexpected type for {}".format(k))
      tokens = tokenize(sentences)
      packed_inputs = layers.BertPackInputs(
          4, special_tokens_dict=special_tokens_dict)(
              tokens)
      preprocessing = tf_keras.Model(sentences, packed_inputs)
      # Map the dataset.
      ds = tf.data.Dataset.from_tensors(
          (tf.constant(["abc", "D EF"]), tf.constant([0, 1])))
      ds = ds.map(lambda features, labels: (preprocessing(features), labels))
      return ds

    def model_fn(features, labels, mode):
      del labels  # Unused.
      return tf_estimator.EstimatorSpec(
          mode=mode, predictions=features["input_word_ids"])

    estimator = tf_estimator.Estimator(model_fn=model_fn)
    outputs = list(estimator.predict(input_fn))
    self.assertAllEqual(outputs, np.array([[2, 6, 3, 0], [2, 4, 5, 3]]))

  # TODO(b/175369555): Remove that code and its test.
  @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
  def test_check_no_assert(self, use_sp_model):
    """Tests the self-check during export without assertions."""
    preprocess_export_path = self._do_export(["d", "ef", "abc", "xy"],
                                             do_lower_case=True,
                                             use_sp_model=use_sp_model,
                                             tokenize_with_offsets=False,
                                             experimental_disable_assert=False)
    with self.assertRaisesRegex(AssertionError,
                                r"failed to suppress \d+ Assert ops"):
      export_tfhub_lib._check_no_assert(preprocess_export_path)

def _result_shapes_in_tf_function(fn, *args, **kwargs):
  """Returns shapes (as lists) observed on the result of `fn`.

  Args:
    fn: A callable.
    *args: TensorSpecs for Tensor-valued arguments and actual values for
      Python-valued arguments to fn.
    **kwargs: Same for keyword arguments.

  Returns:
    The nest of partial tensor shapes (as lists) that is statically known
    inside tf.function(fn)(*args, **kwargs) for the nest of its results.
  """
  # Use a captured mutable container for a side output from the wrapper.
  uninitialized = "uninitialized!"
  result_shapes_container = [uninitialized]
  assert result_shapes_container[0] is uninitialized

  @tf.function
  def shape_reporting_wrapper(*args, **kwargs):
    result = fn(*args, **kwargs)
    result_shapes_container[0] = tf.nest.map_structure(
        lambda x: x.shape.as_list(), result)
    return result

  shape_reporting_wrapper.get_concrete_function(*args, **kwargs)
  assert result_shapes_container[0] is not uninitialized
  return result_shapes_container[0]
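
# Example (illustrative only): for a function that preserves its input shape,
#
#   _result_shapes_in_tf_function(
#       tf.strings.length, tf.TensorSpec([None], tf.string))
#
# would return [None], the static shape known during tracing.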


if __name__ == "__main__":
  tf.test.main()