from typing import List, Tuple, Union

import numpy as np
import torch
import transformers.models.t5.modeling_t5
import transformers.models.t5.tokenization_t5

from transformer_deploy.backends.pytorch_utils import get_model_size
from transformer_deploy.benchmarks.utils import compare_outputs, generate_input, generate_multiple_inputs, to_numpy
from transformer_deploy.utils.accuracy import check_accuracy
from transformer_deploy.utils.fastseq import code_patcher


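# Helper: builds `nb` fake output tensors of the given shape, filled with
# arange(0, rows * cols) scaled by `factor`, as NumPy or Torch tensors
# depending on `tensor_type`.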
def generate_fake_outputs(
    shape: Tuple[int, int], nb: int, factor: float, tensor_type: str
) -> List[Union[np.ndarray, torch.Tensor]]:
    results: List[Union[np.ndarray, torch.Tensor]] = []
    for _ in range(nb):
        if tensor_type == "np":
            tensor = np.arange(start=0, stop=shape[0] * shape[1]).reshape(shape) * factor
        elif tensor_type == "torch":
            tensor = torch.arange(start=0, end=shape[0] * shape[1], device="cpu").reshape(shape) * factor
        else:
            raise ValueError(f"unknown tensor_type: {tensor_type}")
        results.append(tensor)
    return results


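# With factors 0.1 and 0.2 applied to the same arange tensor, the two fake outputs
# differ by 0.15 on average, so compare_outputs should report ~0.15 and check_accuracy
# should pass with a 0.16 tolerance for every NumPy/Torch combination.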
def test_gap():
    shape = (1, 4)
    pairs = [("np", "np"), ("np", "torch"), ("torch", "np"), ("torch", "torch")]
    for t1_type, t2_type in pairs:
        t1 = generate_fake_outputs(shape=shape, nb=1, factor=0.1, tensor_type=t1_type)
        t2 = generate_fake_outputs(shape=shape, nb=1, factor=0.2, tensor_type=t2_type)
        gap = compare_outputs(pytorch_output=to_numpy(t1), engine_output=to_numpy(t2))
        assert np.isclose(a=gap, b=0.15, atol=1e-3)
        check_accuracy(engine_name=f"test [{t1_type}/{t2_type}]", pytorch_output=t1, engine_output=t2, tolerance=0.16)


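# generate_input is expected to return one tensor per requested input name,
# shaped (batch_size, seq_len).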
def test_generate_input():
    inputs_pytorch = generate_input(seq_len=16, batch_size=4, input_names=["input_ids", "attention_mask"], device="cpu")
    assert set(inputs_pytorch.keys()) == {"input_ids", "attention_mask"}
    assert inputs_pytorch["input_ids"].shape == torch.Size([4, 16])
    inputs_pytorch = generate_input(
        seq_len=1, batch_size=1, input_names=["input_ids", "attention_mask", "token_type_ids"], device="cpu"
    )
    assert set(inputs_pytorch.keys()) == {"input_ids", "attention_mask", "token_type_ids"}


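# generate_multiple_inputs should return nb_inputs_to_gen input dicts, each
# carrying the requested input names.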
def test_multiple_generate_input():
    multiple_inputs_pytorch = generate_multiple_inputs(
        seq_len=16, batch_size=4, input_names=["input_ids", "attention_mask"], nb_inputs_to_gen=4, device="cpu"
    )
    assert len(multiple_inputs_pytorch) == 4
    assert set(multiple_inputs_pytorch[0].keys()) == {"input_ids", "attention_mask"}


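# get_model_size should return two strictly positive values (att, hidden_size)
# for each Hugging Face model identifier below.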
def test_extract_model_info():
    models = [
        "philschmid/MiniLM-L6-H384-uncased-sst2",
        "camembert-base",
        "sentence-transformers/msmarco-distilbert-cos-v5",
    ]
    for m in models:
        att, hidden_size = get_model_size(path=m)
        assert att > 0 and hidden_size > 0


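# code_patcher is expected to build a patched copy of T5Tokenizer.get_vocab (exposed in the
# tokenization module under new_function_name), replacing the original vocab construction
# with a constant dict; the test then monkey-patches get_vocab and checks the result.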
def test_update_module():
    src_code = (
        "vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}\n"
        " vocab.update(self.added_tokens_encoder)\n"
    )
    code_patcher(
        module_name="transformers.models.t5.tokenization_t5",
        function=transformers.models.t5.tokenization_t5.T5Tokenizer.get_vocab,
        new_function_name="new_vocab",
        modifications={src_code: 'vocab = {"1": "success"}\n'},
    )
    transformers.models.t5.tokenization_t5.T5Tokenizer.get_vocab = transformers.models.t5.tokenization_t5.new_vocab
    assert transformers.models.t5.tokenization_t5.T5Tokenizer.get_vocab(1) == {"1": "success"}