storage / ASR /transformer-deploy /tests /test_triton.py

Upload folder using huggingface_hub

e0c2d04 verified about 1 year ago

12.8 kB

	# Copyright 2022, Lefebvre Dalloz Services
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	import inspect
	import tempfile
	from pathlib import Path
	from typing import Iterator

	import pytest
	from transformers import AutoConfig, AutoTokenizer, PretrainedConfig, PreTrainedTokenizer

	from transformer_deploy.t5_utils import t5_model
	from transformer_deploy.triton.configuration import EngineType
	from transformer_deploy.triton.configuration_decoder import ConfigurationDec
	from transformer_deploy.triton.configuration_encoder import ConfigurationEnc
	from transformer_deploy.triton.configuration_question_answering import ConfigurationQuestionAnswering
	from transformer_deploy.triton.configuration_t5 import ConfigurationT5Decoder, ConfigurationT5Encoder
	from transformer_deploy.triton.configuration_token_classifier import ConfigurationTokenClassifier
	from transformer_deploy.utils import generative_model, python_tokenizer, question_answering, token_classifier


	@pytest.fixture
	def working_directory() -> Iterator[tempfile.TemporaryDirectory]:
	    """Provide a fresh temporary directory and remove it once the test is done.

	    The ``TemporaryDirectory`` object itself is yielded (not just its path)
	    because the configuration fixtures read its ``.name`` attribute.

	    Yields:
	        the live ``tempfile.TemporaryDirectory`` handle.
	    """
	    tmp_dir = tempfile.TemporaryDirectory()
	    try:
	        yield tmp_dir
	    finally:
	        # Explicit cleanup instead of relying on garbage collection of the handle.
	        tmp_dir.cleanup()


	@pytest.fixture
	def conf_encoder(working_directory: tempfile.TemporaryDirectory):
	    """Return a ``ConfigurationEnc`` named "test" that writes into the temporary directory.

	    The engine type is not a constructor argument here; it is assigned on the
	    instance afterwards and set to ONNX.
	    """
	    settings = {
	        "model_name_base": "test",
	        "dim_output": [-1, 2],
	        "nb_instance": 1,
	        "tensor_input_names": ["input_ids", "attention_mask"],
	        "working_directory": working_directory.name,
	        "device": "cuda",
	    }
	    encoder_conf = ConfigurationEnc(**settings)
	    encoder_conf.engine_type = EngineType.ONNX  # normally supplied at a later stage
	    return encoder_conf


	@pytest.fixture
	def conf_decoder(working_directory: tempfile.TemporaryDirectory):
	    """Return a ``ConfigurationDec`` named "test" that writes into the temporary directory.

	    Uses the same parameters as the encoder fixture; the ONNX engine type is
	    assigned after construction.
	    """
	    input_names = ["input_ids", "attention_mask"]
	    output_dims = [-1, 2]
	    decoder_conf = ConfigurationDec(
	        working_directory=working_directory.name,
	        model_name_base="test",
	        tensor_input_names=input_names,
	        dim_output=output_dims,
	        nb_instance=1,
	        device="cuda",
	    )
	    decoder_conf.engine_type = EngineType.ONNX  # normally supplied at a later stage
	    return decoder_conf


	@pytest.fixture
	def conf_token_classifier(working_directory: tempfile.TemporaryDirectory):
	    """Return a ``ConfigurationTokenClassifier`` named "test" with the ONNX engine selected."""
	    settings = dict(
	        model_name_base="test",
	        dim_output=[-1, 2],
	        nb_instance=1,
	        tensor_input_names=["input_ids", "attention_mask"],
	        working_directory=working_directory.name,
	        device="cuda",
	    )
	    classifier_conf = ConfigurationTokenClassifier(**settings)
	    # The engine type is assigned on the instance rather than passed to the constructor.
	    classifier_conf.engine_type = EngineType.ONNX
	    return classifier_conf


	@pytest.fixture
	def conf_question_answering(working_directory: tempfile.TemporaryDirectory):
	    """Return a ``ConfigurationQuestionAnswering`` named "test" with the ONNX engine selected."""
	    target_dir = working_directory.name
	    qa_conf = ConfigurationQuestionAnswering(
	        device="cuda",
	        working_directory=target_dir,
	        tensor_input_names=["input_ids", "attention_mask"],
	        nb_instance=1,
	        dim_output=[-1, 2],
	        model_name_base="test",
	    )
	    # The engine type is assigned on the instance rather than passed to the constructor.
	    qa_conf.engine_type = EngineType.ONNX
	    return qa_conf


	@pytest.fixture
	def conf_encoder_t5(working_directory: tempfile.TemporaryDirectory):
	    """Return a ``ConfigurationT5Encoder`` named "test" that writes into the temporary directory.

	    The ONNX engine type is assigned after construction.
	    """
	    settings = {
	        "model_name_base": "test",
	        "dim_output": [-1, 2],
	        "nb_instance": 1,
	        "tensor_input_names": ["input_ids", "attention_mask"],
	        "working_directory": working_directory.name,
	        "device": "cuda",
	    }
	    t5_encoder_conf = ConfigurationT5Encoder(**settings)
	    t5_encoder_conf.engine_type = EngineType.ONNX  # normally supplied at a later stage
	    return t5_encoder_conf


	@pytest.fixture
	def conf_decoder_t5(working_directory: tempfile.TemporaryDirectory):
	    """Return a ``ConfigurationT5Decoder`` named "test" that writes into the temporary directory.

	    The ONNX engine type is assigned after construction.
	    """
	    input_names = ["input_ids", "attention_mask"]
	    t5_decoder_conf = ConfigurationT5Decoder(
	        working_directory=working_directory.name,
	        device="cuda",
	        model_name_base="test",
	        nb_instance=1,
	        dim_output=[-1, 2],
	        tensor_input_names=input_names,
	    )
	    t5_decoder_conf.engine_type = EngineType.ONNX  # normally supplied at a later stage
	    return t5_decoder_conf


	def test_model_conf(conf_encoder, conf_decoder, conf_token_classifier):
	# Check that the encoder, decoder and token-classifier configurations all
	# render the same model config text: ONNX Runtime platform, two INT32
	# inputs (input_ids, attention_mask), one FP32 output and one GPU instance.
	# The comparison is an exact string match against the stripped literal, so
	# the whitespace inside the literal is significant.
	expected = """
	name: "test_onnx_model"
	max_batch_size: 0
	platform: "onnxruntime_onnx"
	default_model_filename: "model.bin"

	input [
	{
	name: "input_ids"
	data_type: TYPE_INT32
	dims: [-1, -1]
	},
	{
	name: "attention_mask"
	data_type: TYPE_INT32
	dims: [-1, -1]
	}
	]

	output {
	name: "output"
	data_type: TYPE_FP32
	dims: [-1, 2]
	}

	instance_group [
	{
	count: 1
	kind: KIND_GPU
	}
	]
	""" # noqa: W293
	assert expected.strip() == conf_encoder.get_model_conf()
	assert expected.strip() == conf_decoder.get_model_conf()
	assert expected.strip() == conf_token_classifier.get_model_conf()


	def test_tokenizer_conf(conf_encoder):
	# Check the tokenizer config rendered by the encoder configuration: a
	# python-backend model taking a TEXT string input and producing the
	# input_ids / attention_mask INT32 tensors.
	# The comparison is an exact string match against the stripped literal.
	expected = """
	name: "test_onnx_tokenize"
	max_batch_size: 0
	backend: "python"

	input [
	{
	name: "TEXT"
	data_type: TYPE_STRING
	dims: [ -1 ]
	}
	]

	output [
	{
	name: "input_ids"
	data_type: TYPE_INT32
	dims: [-1, -1]
	},
	{
	name: "attention_mask"
	data_type: TYPE_INT32
	dims: [-1, -1]
	}
	]

	instance_group [
	{
	count: 1
	kind: KIND_GPU
	}
	]
	""" # noqa: W293
	assert expected.strip() == conf_encoder.get_tokenize_conf()


	def test_inference_conf(conf_encoder):
	# Check the inference config rendered by the encoder configuration: an
	# ensemble whose first step is "test_onnx_tokenize" (TEXT -> input_ids,
	# attention_mask) and whose second step is "test_onnx_model"
	# (input_ids, attention_mask -> output).
	# The comparison is an exact string match against the stripped literal.
	expected = """
	name: "test_onnx_inference"
	max_batch_size: 0
	platform: "ensemble"

	input [
	{
	name: "TEXT"
	data_type: TYPE_STRING
	dims: [ -1 ]
	}
	]

	output {
	name: "output"
	data_type: TYPE_FP32
	dims: [-1, 2]
	}

	ensemble_scheduling {
	step [
	{
	model_name: "test_onnx_tokenize"
	model_version: -1
	input_map {
	key: "TEXT"
	value: "TEXT"
	}
	output_map [
	{
	key: "input_ids"
	value: "input_ids"
	},
	{
	key: "attention_mask"
	value: "attention_mask"
	}
	]
	},
	{
	model_name: "test_onnx_model"
	model_version: -1
	input_map [
	{
	key: "input_ids"
	value: "input_ids"
	},
	{
	key: "attention_mask"
	value: "attention_mask"
	}
	]
	output_map {
	key: "output"
	value: "output"
	}
	}
	]
	}
	""" # noqa: W293
	assert expected.strip() == conf_encoder.get_inference_conf()


	def test_generate_conf(conf_decoder):
	# Check the generation config rendered by the decoder configuration: a
	# python-backend model with a TEXT string input, a string "output", one
	# GPU instance and the FORCE_CPU_ONLY_INPUT_TENSORS parameter set to "no".
	# The comparison is an exact string match against the stripped literal.
	expected = """
	name: "test_onnx_generate"
	max_batch_size: 0
	backend: "python"

	input [
	{
	name: "TEXT"
	data_type: TYPE_STRING
	dims: [ -1 ]
	}
	]

	output [
	{
	name: "output"
	data_type: TYPE_STRING
	dims: [ -1 ]
	}
	]

	instance_group [
	{
	count: 1
	kind: KIND_GPU
	}
	]

	parameters: {
	key: "FORCE_CPU_ONLY_INPUT_TENSORS"
	value: {
	string_value:"no"
	}
	}
	""" # noqa: W293
	# NOTE(review): this print looks like a leftover debugging aid; it renders
	# the config a second time before the assertion does — consider removing.
	print(conf_decoder.get_generation_conf())
	assert expected.strip() == conf_decoder.get_generation_conf()


	def test_token_classifier_inference_conf(conf_token_classifier):
	# Check the inference config rendered by the token-classifier
	# configuration: a python-backend model (not an ensemble) with a TEXT
	# string input, a string "output", one GPU instance and the
	# FORCE_CPU_ONLY_INPUT_TENSORS parameter set to "no".
	# The comparison is an exact string match against the stripped literal,
	# including the two blank lines before the parameters section.
	expected = """
	name: "test_onnx_inference"
	max_batch_size: 0
	backend: "python"

	input [
	{
	name: "TEXT"
	data_type: TYPE_STRING
	dims: [ -1 ]
	}
	]

	output [
	{
	name: "output"
	data_type: TYPE_STRING
	dims: [ -1 ]
	}
	]

	instance_group [
	{
	count: 1
	kind: KIND_GPU
	}
	]


	parameters: {
	key: "FORCE_CPU_ONLY_INPUT_TENSORS"
	value: {
	string_value:"no"
	}
	}
	"""
	assert expected.strip() == conf_token_classifier.get_inference_conf()


	def test_question_answering_inference_conf(conf_question_answering):
	# Check the inference config rendered by the question-answering
	# configuration: a python-backend model with two string inputs (QUESTION
	# and CONTEXT), a string "output", one GPU instance and the
	# FORCE_CPU_ONLY_INPUT_TENSORS parameter set to "no".
	# The comparison is an exact string match against the stripped literal,
	# including the two blank lines before the parameters section.
	expected = """
	name: "test_onnx_inference"
	max_batch_size: 0
	backend: "python"

	input [
	{
	name: "QUESTION"
	data_type: TYPE_STRING
	dims: [ -1 ]
	},
	{
	name: "CONTEXT"
	data_type: TYPE_STRING
	dims: [ -1 ]
	}
	]

	output [
	{
	name: "output"
	data_type: TYPE_STRING
	dims: [ -1 ]
	}
	]

	instance_group [
	{
	count: 1
	kind: KIND_GPU
	}
	]


	parameters: {
	key: "FORCE_CPU_ONLY_INPUT_TENSORS"
	value: {
	string_value:"no"
	}
	}
	"""
	assert expected.strip() == conf_question_answering.get_inference_conf()


	def test_t5_encoder_inference_conf(conf_encoder_t5):
	# Check the inference config rendered by the T5 encoder configuration: an
	# ensemble chaining "test_onnx_tokenize" (TEXT -> input_ids,
	# attention_mask) into "test_onnx_model" (input_ids, attention_mask ->
	# output), with a single FP32 output.
	# The comparison is an exact string match against the stripped literal.
	expected = """
	name: "test_onnx_inference"
	max_batch_size: 0
	platform: "ensemble"

	input [
	{
	name: "TEXT"
	data_type: TYPE_STRING
	dims: [ -1 ]
	}
	]

	output {
	name: "output"
	data_type: TYPE_FP32
	dims: [-1, 2]
	}

	ensemble_scheduling {
	step [
	{
	model_name: "test_onnx_tokenize"
	model_version: -1
	input_map {
	key: "TEXT"
	value: "TEXT"
	}
	output_map [
	{
	key: "input_ids"
	value: "input_ids"
	},
	{
	key: "attention_mask"
	value: "attention_mask"
	}
	]
	},
	{
	model_name: "test_onnx_model"
	model_version: -1
	input_map [
	{
	key: "input_ids"
	value: "input_ids"
	},
	{
	key: "attention_mask"
	value: "attention_mask"
	}
	]
	output_map {
	key: "output"
	value: "output"
	}
	}
	]
	}
	"""
	assert expected.strip() == conf_encoder_t5.get_inference_conf()


	def test_t5_decoder_generate_conf(conf_decoder_t5):
	# Check the generation config rendered by the T5 decoder configuration: a
	# python-backend model with a TEXT string input, an OUTPUT_TEXT string
	# output, one GPU instance and FORCE_CPU_ONLY_INPUT_TENSORS set to "no".
	# The expected model name is "t5_model_generate", not one derived from the
	# "test" base name — presumably the T5 decoder uses a fixed name; confirm
	# against ConfigurationT5Decoder.
	# The comparison is an exact string match against the stripped literal.
	expected = """
	name: "t5_model_generate"
	max_batch_size: 0
	backend: "python"

	input [
	{
	name: "TEXT"
	data_type: TYPE_STRING
	dims: [ -1 ]
	}
	]

	output {
	name: "OUTPUT_TEXT"
	data_type: TYPE_STRING
	dims: [ -1 ]
	}
	instance_group [
	{
	count: 1
	kind: KIND_GPU
	}
	]

	parameters: {
	key: "FORCE_CPU_ONLY_INPUT_TENSORS"
	value: {
	string_value:"no"
	}
	}
	"""
	assert expected.strip() == conf_decoder_t5.get_generation_conf()


	def test_create_folders(
	    conf_encoder,
	    conf_decoder,
	    conf_token_classifier,
	    conf_question_answering,
	    conf_encoder_t5,
	    conf_decoder_t5,
	    working_directory: tempfile.TemporaryDirectory,
	):
	    """Check that ``create_configs`` writes the expected folder layout for every configuration.

	    For each configuration, every listed folder must hold a non-empty
	    ``config.pbtxt`` and a ``1`` entry, and the python folder must contain
	    ``1/model.py``. For non-T5 configurations the content of ``model.py`` must
	    equal the source of the associated Python module.
	    """
	    fake_model_path = Path(working_directory.name) / "fake_model.bin"
	    fake_model_path.write_bytes(b"abc")

	    t5_types = [ConfigurationT5Decoder, ConfigurationT5Encoder]

	    def _model_python_inference(cfg):
	        # Folder names checked for configurations that also expose an inference folder.
	        return [cfg.model_folder_name, cfg.python_folder_name, cfg.inference_folder_name]

	    cases = [
	        (conf_encoder, _model_python_inference(conf_encoder), python_tokenizer),
	        (conf_decoder, _model_python_inference(conf_decoder), generative_model),
	        (conf_token_classifier, _model_python_inference(conf_token_classifier), token_classifier),
	        (conf_question_answering, _model_python_inference(conf_question_answering), question_answering),
	        (conf_encoder_t5, _model_python_inference(conf_encoder_t5), t5_model),
	        # Only the model and python folders are checked for the T5 decoder.
	        (conf_decoder_t5, [conf_decoder_t5.model_folder_name, conf_decoder_t5.python_folder_name], t5_model),
	    ]

	    for conf, paths, python_code in cases:
	        is_t5 = type(conf) in t5_types
	        if is_t5:
	            model_name = "t5-small"
	        else:
	            model_name = "philschmid/MiniLM-L6-H384-uncased-sst2"
	        tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(model_name)
	        config: PretrainedConfig = AutoConfig.from_pretrained(model_name)
	        conf.create_configs(
	            tokenizer=tokenizer,
	            config=config,
	            model_path=fake_model_path,
	            engine_type=EngineType.ONNX,
	        )

	        root = Path(conf.working_dir)
	        for folder_name in paths:
	            folder = root / folder_name
	            pbtxt = folder / "config.pbtxt"
	            assert pbtxt.exists()
	            assert pbtxt.read_text() != ""
	            assert (folder / "1").exists()

	        model_path = root / conf.python_folder_name / "1" / "model.py"
	        assert model_path.exists()
	        if not is_t5:
	            assert model_path.read_text() == inspect.getsource(python_code)