# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for nlp.serving.serving_modules."""

import os

from absl.testing import parameterized
import tensorflow as tf, tf_keras
from sentencepiece import SentencePieceTrainer

from official.core import export_base
from official.nlp.configs import bert
from official.nlp.configs import encoders
from official.nlp.serving import serving_modules
from official.nlp.tasks import masked_lm
from official.nlp.tasks import question_answering
from official.nlp.tasks import sentence_prediction
from official.nlp.tasks import tagging
from official.nlp.tasks import translation
def _create_fake_serialized_examples(features_dict):
  """Serializes a fixed batch of 10 identical fake tf.train.Examples.

  Args:
    features_dict: Maps feature name to either a `bytes` value (stored as a
      bytes_list) or an iterable of ints (stored as an int64_list).

  Returns:
    A string tf.Tensor of shape (10,) holding serialized Example protos.
  """

  def _to_feature(value):
    # A bytes value becomes a single-element bytes_list; everything else is
    # treated as an iterable of ints.
    if isinstance(value, bytes):
      return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(value)))

  serialized = []
  for _ in range(10):
    feature_map = {
        name: _to_feature(value) for name, value in features_dict.items()
    }
    example = tf.train.Example(features=tf.train.Features(feature=feature_map))
    serialized.append(example.SerializeToString())
  return tf.constant(serialized)
def _create_fake_vocab_file(vocab_file_path):
  """Writes a minimal BERT-style vocab file (one token per line)."""
  specials = ["[UNK]", "[CLS]", "[SEP]", "[MASK]", "hello", "world"]
  # [PAD] at id 0, 99 unused slots, then the special tokens and two words.
  tokens = ["[PAD]"] + ["[unused%d]" % i for i in range(1, 100)] + specials
  with tf.io.gfile.GFile(vocab_file_path, "w") as outfile:
    outfile.write("\n".join(tokens))
def _train_sentencepiece(input_path, vocab_size, model_path, eos_id=1):
  """Trains a small BPE sentencepiece model from a text file.

  Args:
    input_path: Path to the plain-text training corpus.
    vocab_size: Target vocabulary size.
    model_path: Model prefix; the trainer writes `<model_path>.model`.
    eos_id: Id assigned to the EOS piece.
  """
  # bos is disabled (-1); pad/eos/unk ids are pinned so the tests are stable.
  flags = [
      f"--input={input_path}",
      f"--vocab_size={vocab_size}",
      "--character_coverage=0.995",
      f"--model_prefix={model_path}",
      "--model_type=bpe",
      "--bos_id=-1",
      "--pad_id=0",
      f"--eos_id={eos_id}",
      "--unk_id=2",
  ]
  SentencePieceTrainer.Train(" ".join(flags))
def _generate_line_file(filepath, lines):
  """Writes each element of `lines` to `filepath`, one per line."""
  with tf.io.gfile.GFile(filepath, "w") as f:
    f.write("".join(f"{line}\n" for line in lines))
def _make_sentencepeice(output_dir):
  """Trains a tiny sentencepiece model from canned text.

  NOTE: the misspelling in the function name is kept for existing callers.

  Args:
    output_dir: Directory in which the corpus and model files are written.

  Returns:
    Path to the trained `.model` file.
  """
  source_lines = ["abc ede fg", "bbcd ef a g", "de f a a g"]
  target_lines = ["dd cc a ef g", "bcd ef a g", "gef cd ba"]
  corpus_path = os.path.join(output_dir, "inputs.txt")
  _generate_line_file(corpus_path, source_lines + target_lines)
  model_prefix = os.path.join(output_dir, "sp")
  _train_sentencepiece(corpus_path, 11, model_prefix)
  return f"{model_prefix}.model"
class ServingModulesTest(tf.test.TestCase, parameterized.TestCase):
  """Smoke tests for the NLP serving export modules.

  Each test builds a small task model, wraps it in the corresponding
  `serving_modules` export module, and exercises the exported inference
  signatures with dummy batches of size 10.

  Fix: every test method takes extra parameters but the
  `@parameterized.parameters` decorators were missing, so the standard
  unittest runner would call them with only `self` and each test would
  fail with a TypeError before running. The decorators are restored here.
  """

  # `use_v2_feature_names` toggles between the v2 ("input_word_ids"/
  # "input_type_ids") and legacy ("input_ids"/"segment_ids") tf.Example
  # feature-name schemes.
  @parameterized.parameters(True, False)
  def test_sentence_prediction(self, use_v2_feature_names):
    if use_v2_feature_names:
      input_word_ids_field = "input_word_ids"
      input_type_ids_field = "input_type_ids"
    else:
      # Legacy (v1) feature names.
      input_word_ids_field = "input_ids"
      input_type_ids_field = "segment_ids"
    config = sentence_prediction.SentencePredictionConfig(
        model=sentence_prediction.ModelConfig(
            encoder=encoders.EncoderConfig(
                bert=encoders.BertEncoderConfig(vocab_size=30522,
                                                num_layers=1)),
            num_classes=2))
    task = sentence_prediction.SentencePredictionTask(config)
    model = task.build_model()
    # inputs_only=True: the serving signature takes just the word ids.
    params = serving_modules.SentencePrediction.Params(
        inputs_only=True,
        parse_sequence_length=10,
        use_v2_feature_names=use_v2_feature_names)
    export_module = serving_modules.SentencePrediction(
        params=params, model=model)
    functions = export_module.get_inference_signatures({
        "serve": "serving_default",
        "serve_examples": "serving_examples"
    })
    self.assertSameElements(functions.keys(),
                            ["serving_default", "serving_examples"])
    dummy_ids = tf.ones((10, 10), dtype=tf.int32)
    outputs = functions["serving_default"](dummy_ids)
    self.assertEqual(outputs["outputs"].shape, (10, 2))
    # inputs_only=False: the signature takes ids, mask, and type ids.
    params = serving_modules.SentencePrediction.Params(
        inputs_only=False,
        parse_sequence_length=10,
        use_v2_feature_names=use_v2_feature_names)
    export_module = serving_modules.SentencePrediction(
        params=params, model=model)
    functions = export_module.get_inference_signatures({
        "serve": "serving_default",
        "serve_examples": "serving_examples"
    })
    outputs = functions["serving_default"](
        input_word_ids=dummy_ids,
        input_mask=dummy_ids,
        input_type_ids=dummy_ids)
    self.assertEqual(outputs["outputs"].shape, (10, 2))
    dummy_ids = tf.ones((10,), dtype=tf.int32)
    examples = _create_fake_serialized_examples({
        input_word_ids_field: dummy_ids,
        "input_mask": dummy_ids,
        input_type_ids_field: dummy_ids
    })
    outputs = functions["serving_examples"](examples)
    self.assertEqual(outputs["outputs"].shape, (10, 2))
    # Unknown signature keys must be rejected.
    with self.assertRaises(ValueError):
      _ = export_module.get_inference_signatures({"foo": None})

  @parameterized.parameters(True, False)
  def test_sentence_prediction_text(self, inputs_only):
    """Exercises the text-example signature that tokenizes raw strings."""
    vocab_file_path = os.path.join(self.get_temp_dir(), "vocab.txt")
    _create_fake_vocab_file(vocab_file_path)
    config = sentence_prediction.SentencePredictionConfig(
        model=sentence_prediction.ModelConfig(
            encoder=encoders.EncoderConfig(
                bert=encoders.BertEncoderConfig(vocab_size=30522,
                                                num_layers=1)),
            num_classes=2))
    task = sentence_prediction.SentencePredictionTask(config)
    model = task.build_model()
    params = serving_modules.SentencePrediction.Params(
        inputs_only=inputs_only,
        parse_sequence_length=10,
        text_fields=["foo", "bar"],
        vocab_file=vocab_file_path)
    export_module = serving_modules.SentencePrediction(
        params=params, model=model)
    examples = _create_fake_serialized_examples({
        "foo": b"hello world",
        "bar": b"hello world"
    })
    functions = export_module.get_inference_signatures({
        "serve_text_examples": "serving_default",
    })
    outputs = functions["serving_default"](examples)
    self.assertEqual(outputs["outputs"].shape, (10, 2))

  @parameterized.parameters(True, False)
  def test_masked_lm(self, use_v2_feature_names):
    if use_v2_feature_names:
      input_word_ids_field = "input_word_ids"
      input_type_ids_field = "input_type_ids"
    else:
      input_word_ids_field = "input_ids"
      input_type_ids_field = "segment_ids"
    config = masked_lm.MaskedLMConfig(
        model=bert.PretrainerConfig(
            encoder=encoders.EncoderConfig(
                bert=encoders.BertEncoderConfig(vocab_size=30522,
                                                num_layers=1)),
            cls_heads=[
                bert.ClsHeadConfig(
                    inner_dim=10, num_classes=2, name="next_sentence")
            ]))
    task = masked_lm.MaskedLMTask(config)
    model = task.build_model()
    params = serving_modules.MaskedLM.Params(
        parse_sequence_length=10,
        max_predictions_per_seq=5,
        use_v2_feature_names=use_v2_feature_names)
    export_module = serving_modules.MaskedLM(params=params, model=model)
    functions = export_module.get_inference_signatures({
        "serve": "serving_default",
        "serve_examples": "serving_examples"
    })
    self.assertSameElements(functions.keys(),
                            ["serving_default", "serving_examples"])
    dummy_ids = tf.ones((10, 10), dtype=tf.int32)
    dummy_pos = tf.ones((10, 5), dtype=tf.int32)
    outputs = functions["serving_default"](
        input_word_ids=dummy_ids,
        input_mask=dummy_ids,
        input_type_ids=dummy_ids,
        masked_lm_positions=dummy_pos)
    # The next_sentence classification head has 2 classes.
    self.assertEqual(outputs["classification"].shape, (10, 2))
    dummy_ids = tf.ones((10,), dtype=tf.int32)
    dummy_pos = tf.ones((5,), dtype=tf.int32)
    examples = _create_fake_serialized_examples({
        input_word_ids_field: dummy_ids,
        "input_mask": dummy_ids,
        input_type_ids_field: dummy_ids,
        "masked_lm_positions": dummy_pos
    })
    outputs = functions["serving_examples"](examples)
    self.assertEqual(outputs["classification"].shape, (10, 2))

  @parameterized.parameters(True, False)
  def test_question_answering(self, use_v2_feature_names):
    if use_v2_feature_names:
      input_word_ids_field = "input_word_ids"
      input_type_ids_field = "input_type_ids"
    else:
      input_word_ids_field = "input_ids"
      input_type_ids_field = "segment_ids"
    config = question_answering.QuestionAnsweringConfig(
        model=question_answering.ModelConfig(
            encoder=encoders.EncoderConfig(
                bert=encoders.BertEncoderConfig(vocab_size=30522,
                                                num_layers=1))),
        validation_data=None)
    task = question_answering.QuestionAnsweringTask(config)
    model = task.build_model()
    params = serving_modules.QuestionAnswering.Params(
        parse_sequence_length=10, use_v2_feature_names=use_v2_feature_names)
    export_module = serving_modules.QuestionAnswering(
        params=params, model=model)
    functions = export_module.get_inference_signatures({
        "serve": "serving_default",
        "serve_examples": "serving_examples"
    })
    self.assertSameElements(functions.keys(),
                            ["serving_default", "serving_examples"])
    dummy_ids = tf.ones((10, 10), dtype=tf.int32)
    outputs = functions["serving_default"](
        input_word_ids=dummy_ids,
        input_mask=dummy_ids,
        input_type_ids=dummy_ids)
    # Span logits are per-token: (batch, seq_len).
    self.assertEqual(outputs["start_logits"].shape, (10, 10))
    self.assertEqual(outputs["end_logits"].shape, (10, 10))
    dummy_ids = tf.ones((10,), dtype=tf.int32)
    examples = _create_fake_serialized_examples({
        input_word_ids_field: dummy_ids,
        "input_mask": dummy_ids,
        input_type_ids_field: dummy_ids
    })
    outputs = functions["serving_examples"](examples)
    self.assertEqual(outputs["start_logits"].shape, (10, 10))
    self.assertEqual(outputs["end_logits"].shape, (10, 10))

  # NOTE(review): full cross-product of the two flags; confirm against the
  # upstream parameterization if a smaller set was intended.
  @parameterized.parameters((True, True), (True, False), (False, True),
                            (False, False))
  def test_tagging(self, use_v2_feature_names, output_encoder_outputs):
    if use_v2_feature_names:
      input_word_ids_field = "input_word_ids"
      input_type_ids_field = "input_type_ids"
    else:
      input_word_ids_field = "input_ids"
      input_type_ids_field = "segment_ids"
    hidden_size = 768
    num_classes = 3
    config = tagging.TaggingConfig(
        model=tagging.ModelConfig(
            encoder=encoders.EncoderConfig(
                bert=encoders.BertEncoderConfig(
                    hidden_size=hidden_size, num_layers=1))),
        class_names=["class_0", "class_1", "class_2"])
    task = tagging.TaggingTask(config)
    model = task.build_model()
    params = serving_modules.Tagging.Params(
        parse_sequence_length=10,
        use_v2_feature_names=use_v2_feature_names,
        output_encoder_outputs=output_encoder_outputs)
    export_module = serving_modules.Tagging(params=params, model=model)
    functions = export_module.get_inference_signatures({
        "serve": "serving_default",
        "serve_examples": "serving_examples"
    })
    dummy_ids = tf.ones((10, 10), dtype=tf.int32)
    outputs = functions["serving_default"](
        input_word_ids=dummy_ids,
        input_mask=dummy_ids,
        input_type_ids=dummy_ids)
    self.assertEqual(outputs["logits"].shape, (10, 10, num_classes))
    if output_encoder_outputs:
      self.assertEqual(outputs["encoder_outputs"].shape, (10, 10, hidden_size))
    dummy_ids = tf.ones((10,), dtype=tf.int32)
    examples = _create_fake_serialized_examples({
        input_word_ids_field: dummy_ids,
        "input_mask": dummy_ids,
        input_type_ids_field: dummy_ids
    })
    outputs = functions["serving_examples"](examples)
    self.assertEqual(outputs["logits"].shape, (10, 10, num_classes))
    if output_encoder_outputs:
      self.assertEqual(outputs["encoder_outputs"].shape, (10, 10, hidden_size))
    with self.assertRaises(ValueError):
      _ = export_module.get_inference_signatures({"foo": None})

  # NOTE(review): padded decode presumably needs a fixed batch size while
  # dynamic decode accepts None — confirm these pairs against the serving
  # module's Params contract.
  @parameterized.parameters((True, 4), (False, None))
  def test_translation(self, padded_decode, batch_size):
    sp_path = _make_sentencepeice(self.get_temp_dir())
    encdecoder = translation.EncDecoder(
        num_attention_heads=4, intermediate_size=256)
    config = translation.TranslationConfig(
        model=translation.ModelConfig(
            encoder=encdecoder,
            decoder=encdecoder,
            embedding_width=256,
            padded_decode=padded_decode,
            decode_max_length=100),
        sentencepiece_model_path=sp_path,
    )
    task = translation.TranslationTask(config)
    model = task.build_model()
    params = serving_modules.Translation.Params(
        sentencepiece_model_path=sp_path, batch_size=batch_size)
    export_module = serving_modules.Translation(params=params, model=model)
    functions = export_module.get_inference_signatures({
        "serve_text": "serving_default"
    })
    outputs = functions["serving_default"](tf.constant(["abcd", "ef gh"]))
    self.assertEqual(outputs.shape, (2,))
    self.assertEqual(outputs.dtype, tf.string)
    # Round-trip through a real SavedModel export to make sure the exported
    # signature loads and runs.
    tmp_dir = self.get_temp_dir()
    tmp_dir = os.path.join(tmp_dir, "padded_decode", str(padded_decode))
    export_base_dir = os.path.join(tmp_dir, "export")
    ckpt_dir = os.path.join(tmp_dir, "ckpt")
    ckpt_path = tf.train.Checkpoint(model=model).save(ckpt_dir)
    export_dir = export_base.export(export_module,
                                    {"serve_text": "serving_default"},
                                    export_base_dir, ckpt_path)
    loaded = tf.saved_model.load(export_dir)
    infer = loaded.signatures["serving_default"]
    out = infer(text=tf.constant(["abcd", "ef gh"]))
    self.assertLen(out["output_0"], 2)
if __name__ == "__main__":
  # tf.test.main() discovers and runs the parameterized test cases above.
  tf.test.main()