import contextlib
import gc
import time
from dataclasses import dataclass
from typing import Callable, ContextManager, Dict, List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import torchdynamo
from matplotlib.axes import Axes
from matplotlib.figure import Figure
from onnxruntime import GraphOptimizationLevel
from tensorrt.tensorrt import Runtime
from torch.profiler import ProfilerActivity
from torchdynamo.eval_frame import OptimizeContext
from transformers import PreTrainedModel

from transformer_deploy.backends.ort_utils import create_model_for_provider, inference_onnx_binding
from transformer_deploy.backends.trt_utils import load_engine
|
|
# Grid of (batch size, sequence length) shapes benchmarked by default.
seq_lengths = [16, 64, 128, 256, 384, 512]
batch_sizes = [1, 8, 16, 32, 64, 128, 256]
shapes_to_test: Dict[int, List[int]] = {b_s: seq_lengths for b_s in batch_sizes}
|
|
@dataclass
class BenchmarkOutput:
    """Latency (in seconds) and raw model output of a single inference call."""

    latency: float
    output: Dict[str, torch.Tensor]
|
|
def get_pytorch_input(size: Tuple[int, int]) -> Dict[str, torch.Tensor]:
    """Generate random CUDA inputs (attention mask all ones, i.e. no padding) for a (batch, seq len) shape."""
    return {
        "input_ids": torch.randint(2, 1000, size=size, dtype=torch.int32, device="cuda"),
        "attention_mask": torch.ones(size=size, dtype=torch.int32, device="cuda"),
    }
|
|
def benchmark(
    fn: Callable[[Dict[str, torch.Tensor]], BenchmarkOutput], shapes: Dict[int, List[int]] = shapes_to_test
) -> np.ndarray:
    """Measure median latency of `fn` over the shape grid.

    Returns an array of shape (nb batch sizes, nb sequence lengths), in seconds.
    """
    gc.collect()
    torch.cuda.empty_cache()
    timings: List[List[float]] = list()
    # Two full warmup passes (CUDA kernels, caches, possible recompilations), then one measured pass.
    for is_warmup in [True, True, False]:
        for batch_size, seq_lens in shapes.items():
            batch_timings: List[float] = list()
            for seq_len in seq_lens:
                inputs = get_pytorch_input(size=(batch_size, seq_len))
                latencies = list()
                nb_retry = 5
                for _ in range(nb_retry):
                    results: BenchmarkOutput = fn(inputs)
                    latencies.append(results.latency)
                batch_timings.append(float(np.median(latencies)))
            if not is_warmup:
                timings.append(batch_timings)
    return np.array(timings)
|
|
def get_pytorch_inference(
    model: PreTrainedModel, context_managers: List[ContextManager]
) -> Callable[[Dict[str, torch.Tensor]], BenchmarkOutput]:
    """Wrap a PyTorch model in a timed inference function, run inside the provided context managers."""

    def fn(inputs: Dict[str, torch.Tensor]) -> BenchmarkOutput:
        context_managers_stack = contextlib.ExitStack()
        for cm in context_managers:
            context_managers_stack.enter_context(cm)
        with context_managers_stack:
            torch.cuda.synchronize()
            start = time.perf_counter()
            output = model(**inputs)
            torch.cuda.synchronize()  # wait for async CUDA kernels before stopping the clock
            timing = time.perf_counter() - start
        return BenchmarkOutput(latency=timing, output=output)

    return fn
|
|
def get_onnx_inference(
    onnx_path: str, optimization_level: GraphOptimizationLevel
) -> Callable[[Dict[str, torch.Tensor]], BenchmarkOutput]:
    """Build a timed inference function on top of an ONNX Runtime CUDA session with IO binding."""
    onnx_session = create_model_for_provider(
        onnx_path, provider_to_use="CUDAExecutionProvider", optimization_level=optimization_level
    )
    onnx_binding = onnx_session.io_binding()

    def fn(inputs: Dict[str, torch.Tensor]) -> BenchmarkOutput:
        start = time.perf_counter()
        output = inference_onnx_binding(
            model_onnx=onnx_session, inputs=inputs, device="cuda", binding=onnx_binding, clone_tensor=False
        )
        timing = time.perf_counter() - start
        return BenchmarkOutput(latency=timing, output=output)

    return fn
|
|
def get_tensorrt_inference(runtime: Runtime, plan_path: str) -> Callable[[Dict[str, torch.Tensor]], BenchmarkOutput]:
    """Build a timed inference function on top of a serialized TensorRT engine (.plan file)."""
    trt_inference = load_engine(runtime=runtime, engine_file_path=plan_path)

    def fn(inputs: Dict[str, torch.Tensor]) -> BenchmarkOutput:
        start = time.perf_counter()
        output = trt_inference(inputs)
        timing = time.perf_counter() - start
        return BenchmarkOutput(latency=timing, output=output)

    return fn
|
|
def check_output(
    fn: Callable[[Dict[str, torch.Tensor]], BenchmarkOutput],
    inputs: Dict[str, torch.Tensor],
    expected_outputs: Dict[str, torch.Tensor],
    atol: float = 1e-1,
) -> None:
    """Check that `fn` reproduces the expected output tensors: same shapes, values within `atol`."""
    model_output: BenchmarkOutput = fn(inputs)
    for tensor_name in expected_outputs.keys():
        assert model_output.output[tensor_name].shape == expected_outputs[tensor_name].shape
        reference = expected_outputs[tensor_name]
        to_check = model_output.output[tensor_name].type_as(reference)
        assert torch.allclose(to_check, reference, atol=atol), f"{tensor_name} diff > {atol}"
|
|
def get_dynamo_optimizer(
    name: str, dynamic_shape: bool = True, dynamo_cache_size: int = 64, reset_cache: bool = True
) -> OptimizeContext:
    """Configure TorchDynamo (dynamic shapes, cache size) and return an optimization context for backend `name`."""
    torchdynamo.config.dynamic_shapes = dynamic_shape
    torchdynamo.config.cache_size_limit = dynamo_cache_size
    if reset_cache:
        torchdynamo.reset()
    return torchdynamo.optimize(name)
|
|
def print_pytorch_profile(
    fn: Callable[[Dict[str, torch.Tensor]], BenchmarkOutput], inputs: Dict[str, torch.Tensor], row_limit: int = 20
) -> None:
    """Run `fn` once under the PyTorch profiler and print the top CUDA operators (time, memory, FLOPs)."""
    with torch.profiler.profile(activities=[ProfilerActivity.CUDA], profile_memory=True, with_flops=True) as prof:
        fn(inputs)
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=row_limit))
|
|
def plot_benchmarks(title: str, latencies: np.ndarray, baseline: np.ndarray, batches: List[int] = batch_sizes) -> None:
    """Plot speedup over the baseline (top) and throughput in sequences/second (bottom), one curve per batch size."""
    sns.set_style("whitegrid")
    plt.rc("axes", titlesize=15)
    plt.rc("axes", labelsize=14)
    plt.rc("xtick", labelsize=13)
    plt.rc("ytick", labelsize=13)
    plt.rc("legend", fontsize=15)
    plt.rc("font", size=13)

    colors = sns.color_palette("deep")
    speedups = baseline / latencies  # element-wise speedup per (batch size, sequence length)
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 10), sharex=True, sharey=False)

    plt.suptitle(f"{title}")
    plt.xticks(seq_lengths)
    ax1.set_title("effect of arithmetic intensity on speedup")
    ax2.set_title("throughput")
    ax1.set_ylabel("speedup over baseline")
    ax2.set_ylabel("# processed sequences per second")
    plt.xlabel("sequence length")

    for i in range(speedups.shape[0]):
        batch = batches[i]
        ax1.plot(seq_lengths, speedups[i], label=batch, color=colors[i])
        ax2.plot(seq_lengths, batch / latencies[i], label=batch, color=colors[i])

    ax1.legend(title="batch size")
    ax2.legend(title="batch size")

    plt.show()
|
|
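# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the benchmark helpers above).
# Assumptions: a CUDA device is available, `transformers` can load the
# placeholder "bert-base-uncased" checkpoint, and the TorchDynamo "eager"
# backend works in your install; swap in any other checkpoint/backend as needed.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from transformers import AutoModel

    model = AutoModel.from_pretrained("bert-base-uncased")  # placeholder checkpoint
    model = model.eval().cuda()

    # Reduced shape grid so the demo stays cheap; the module default is `shapes_to_test`.
    demo_batches = [1, 8]
    demo_shapes: Dict[int, List[int]] = {b: seq_lengths for b in demo_batches}

    # Baseline: plain PyTorch inference wrapped in inference_mode.
    baseline_fn = get_pytorch_inference(model=model, context_managers=[torch.inference_mode()])
    baseline_timings = benchmark(fn=baseline_fn, shapes=demo_shapes)

    # Candidate: the same model run inside a TorchDynamo optimization context.
    dynamo_fn = get_pytorch_inference(
        model=model,
        context_managers=[torch.inference_mode(), get_dynamo_optimizer(name="eager")],
    )
    # Sanity check on a single shared input before timing anything.
    reference_inputs = get_pytorch_input(size=(1, 128))
    check_output(
        fn=dynamo_fn,
        inputs=reference_inputs,
        expected_outputs=baseline_fn(reference_inputs).output,
    )
    dynamo_timings = benchmark(fn=dynamo_fn, shapes=demo_shapes)

    print_pytorch_profile(fn=dynamo_fn, inputs=get_pytorch_input(size=(8, 128)))
    plot_benchmarks(
        title="TorchDynamo (eager backend) vs. PyTorch baseline",
        latencies=dynamo_timings,
        baseline=baseline_timings,
        batches=demo_batches,
    )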