# Copyright 2022, Lefebvre Dalloz Services
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import gc
import time
from dataclasses import dataclass
from typing import Callable, Dict, List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import torchdynamo
from matplotlib.axes import Axes
from matplotlib.figure import Figure
from onnxruntime import GraphOptimizationLevel
from tensorrt.tensorrt import Runtime
from torch._C._autograd import ProfilerActivity
from torchdynamo.eval_frame import OptimizeContext
from transformers import PreTrainedModel

from transformer_deploy.backends.ort_utils import create_model_for_provider, inference_onnx_binding
from transformer_deploy.backends.trt_utils import load_engine

# all (batch size, sequence length) combinations to benchmark
seq_lengths = [16, 64, 128, 256, 384, 512]
batch_sizes = [1, 8, 16, 32, 64, 128, 256]
shapes_to_test: Dict[int, List[int]] = {b_s: seq_lengths for b_s in batch_sizes}


@dataclass
class BenchmarkOutput:
    # one inference call: wall clock latency in seconds plus the named output tensors
    latency: float
    output: Dict[str, torch.Tensor]


def get_pytorch_input(size: Tuple[int, int]) -> Dict[str, torch.Tensor]:
    # random token ids and a full attention mask, both int32 and already on the GPU
    return {
        "input_ids": torch.randint(2, 1000, size=size, dtype=torch.int32, device="cuda"),
        "attention_mask": torch.ones(size=size, dtype=torch.int32, device="cuda"),
    }


def benchmark(
    fn: Callable[[Dict[str, torch.Tensor]], BenchmarkOutput], shapes: Dict[int, List[int]] = shapes_to_test
) -> np.ndarray:
    gc.collect()  # delete all unreachable objects so their CUDA memory can be freed by empty_cache
    torch.cuda.empty_cache()
    timings: List[List[float]] = list()
    # two full warmup passes over every shape, then a single measured pass
    for is_warmup in [True, True, False]:
        for batch_size, seq_lens in shapes.items():
            batch_timings: List[float] = list()
            for seq_len in seq_lens:
                inputs = get_pytorch_input(size=(batch_size, seq_len))
                latencies = list()
                nb_retry = 5
                for _ in range(nb_retry):
                    results: BenchmarkOutput = fn(inputs)
                    latencies.append(results.latency)
                batch_timings.append(float(np.median(latencies)))
            if not is_warmup:
                timings.append(batch_timings)
    return np.array(timings)
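
# Usage sketch (illustrative, not part of the original script): the returned array has one row per
# batch size and one column per sequence length, so two runs compare element-wise, e.g.
#   pytorch_timings = benchmark(get_pytorch_inference(model, context_managers=[torch.inference_mode()]))
#   onnx_timings = benchmark(get_onnx_inference("model.onnx", GraphOptimizationLevel.ORT_ENABLE_ALL))
#   speedup = pytorch_timings / onnx_timings  # > 1 where ONNX Runtime is faster
# "model" and "model.onnx" are placeholders for your own model and exported file.
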
def get_pytorch_inference(
    model: PreTrainedModel, context_managers: List[contextlib.AbstractContextManager]
) -> Callable[[Dict[str, torch.Tensor]], BenchmarkOutput]:
    def fn(inputs: Dict[str, torch.Tensor]) -> BenchmarkOutput:
        # enter every provided context manager (inference mode, mixed precision, dynamo, ...) for this call
        context_managers_stack = contextlib.ExitStack()
        for cm in context_managers:
            context_managers_stack.enter_context(cm)
        with context_managers_stack:
            torch.cuda.synchronize()
            start = time.perf_counter()
            output = model(**inputs)
            torch.cuda.synchronize()  # CUDA kernels are async, wait for them before stopping the clock
            timing = time.perf_counter() - start
        return BenchmarkOutput(latency=timing, output=output)

    return fn
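
# Typical context managers to pass in (an assumed usage, not prescribed by this file):
# torch.inference_mode() drops autograd bookkeeping and torch.cuda.amp.autocast() enables mixed
# precision kernels; both compose through the ExitStack above, e.g.
#   fp16_fn = get_pytorch_inference(model, context_managers=[torch.inference_mode(), torch.cuda.amp.autocast()])
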
def get_onnx_inference(
    onnx_path: str, optimization_level: GraphOptimizationLevel
) -> Callable[[Dict[str, torch.Tensor]], BenchmarkOutput]:
    onnx_session = create_model_for_provider(
        onnx_path, provider_to_use="CUDAExecutionProvider", optimization_level=optimization_level
    )
    onnx_binding = onnx_session.io_binding()  # a single IO binding object is reused across calls

    def fn(inputs: Dict[str, torch.Tensor]) -> BenchmarkOutput:
        start = time.perf_counter()
        output = inference_onnx_binding(
            model_onnx=onnx_session, inputs=inputs, device="cuda", binding=onnx_binding, clone_tensor=False
        )
        timing = time.perf_counter() - start
        return BenchmarkOutput(latency=timing, output=output)

    return fn
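
# Example call (the path is a placeholder), e.g.
#   onnx_fn = get_onnx_inference("model.onnx", optimization_level=GraphOptimizationLevel.ORT_ENABLE_BASIC)
# Note that the measured latency includes the host-side binding work, not only GPU kernel time.
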
def get_tensorrt_inference(runtime: Runtime, plan_path: str) -> Callable[[Dict[str, torch.Tensor]], BenchmarkOutput]:
    trt_inference = load_engine(runtime=runtime, engine_file_path=plan_path)

    def fn(inputs: Dict[str, torch.Tensor]) -> BenchmarkOutput:
        start = time.perf_counter()
        output = trt_inference(inputs)
        timing = time.perf_counter() - start
        return BenchmarkOutput(latency=timing, output=output)

    return fn
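
# Building the TensorRT Runtime is left to the caller; a plausible setup (engine path is a placeholder):
#   import tensorrt as trt
#   runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
#   trt_fn = get_tensorrt_inference(runtime=runtime, plan_path="model.plan")
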
def check_output(
    fn: Callable[[Dict[str, torch.Tensor]], BenchmarkOutput],
    inputs: Dict[str, torch.Tensor],
    expected_outputs: Dict[str, torch.Tensor],
    atol: float = 1e-1,
) -> None:
    model_output: BenchmarkOutput = fn(inputs)
    for tensor_name in expected_outputs.keys():
        assert model_output.output[tensor_name].shape == expected_outputs[tensor_name].shape
        reference = expected_outputs[tensor_name]
        to_check = model_output.output[tensor_name].type_as(reference)  # cast so float16 outputs compare cleanly
        assert torch.allclose(to_check, reference, atol=atol), f"{tensor_name} diff > {atol}"
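
# Typical check (sketch): compare an optimized backend against the PyTorch output on one shape, e.g.
#   inputs = get_pytorch_input(size=(1, 384))
#   expected = get_pytorch_inference(model, context_managers=[torch.inference_mode()])(inputs).output
#   check_output(fn=onnx_fn, inputs=inputs, expected_outputs=expected, atol=1e-1)
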
# for the TensorRT fuser there is no way to provide min_shape/opt_shape/max_shape, so a dynamic shape
# triggers a recompilation: better to compile with the real input shapes
def get_dynamo_optimizer(
    name: str, dynamic_shape: bool = True, dynamo_cache_size: int = 64, reset_cache: bool = True
) -> OptimizeContext:
    # dynamic shapes break the graph into small parts: better performance, and no recompilation when sizes change!
    torchdynamo.config.dynamic_shapes = dynamic_shape
    torchdynamo.config.cache_size_limit = dynamo_cache_size
    if reset_cache:
        torchdynamo.reset()
    # to configure nvfuser: https://github.com/pytorch/pytorch/blob/release/1.12/torch/csrc/jit/codegen/cuda/README.md
    return torchdynamo.optimize(name)  # interesting fusers: nvfuser_ofi, nnc_ofi, fx2trt
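
# Assumed usage (torchdynamo API of the PyTorch 1.12 era): the returned context wraps the forward pass, e.g.
#   with get_dynamo_optimizer("nvfuser_ofi", dynamic_shape=True):
#       output = model(**inputs)
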
def print_pytorch_profile(
    fn: Callable[[Dict[str, torch.Tensor]], BenchmarkOutput], inputs: Dict[str, torch.Tensor], row_limit: int = 20
) -> None:
    with torch.profiler.profile(activities=[ProfilerActivity.CUDA], profile_memory=True, with_flops=True) as prof:
        fn(inputs)
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=row_limit))


def plot_benchmarks(title: str, latencies: np.ndarray, baseline: np.ndarray, batches: List[int] = batch_sizes) -> None:
    sns.set_style("whitegrid")  # other options: darkgrid, dark, white and ticks
    plt.rc("axes", titlesize=15)  # fontsize of the axes title
    plt.rc("axes", labelsize=14)  # fontsize of the x and y labels
    plt.rc("xtick", labelsize=13)  # fontsize of the tick labels
    plt.rc("ytick", labelsize=13)  # fontsize of the tick labels
    plt.rc("legend", fontsize=15)  # legend fontsize
    plt.rc("font", size=13)  # controls default text sizes
    colors = sns.color_palette("deep")
    speedups = baseline / latencies  # element-wise speedup of the measured run over the baseline
    fig, (ax1, ax2) = plt.subplots(
        2, 1, figsize=(10, 10), sharex=True, sharey=False
    )  # type: Tuple[Figure, Tuple[Axes, Axes]]
    plt.suptitle(title)
    plt.xticks(seq_lengths)
    ax1.set_title("effect of arithmetic intensity on speedup")
    ax2.set_title("throughput")
    ax1.set_ylabel("speedup over baseline")
    ax2.set_ylabel("# processed sequences per second")
    plt.xlabel("sequence length")
    for i in range(speedups.shape[0]):
        batch = batches[i]
        ax1.plot(seq_lengths, speedups[i], label=batch, color=colors[i])
        ax2.plot(seq_lengths, batch / latencies[i], label=batch, color=colors[i])
    ax1.legend(title="batch size")
    ax2.legend(title="batch size")
    plt.show()
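

# End-to-end sketch; everything below is illustrative (it assumes a CUDA GPU, and the checkpoint
# name is an arbitrary example, not something the original script specifies).
if __name__ == "__main__":
    from transformers import AutoModel

    model = AutoModel.from_pretrained("bert-base-uncased").eval().cuda()
    eager_fn = get_pytorch_inference(model, context_managers=[torch.inference_mode()])
    eager_timings = benchmark(eager_fn)
    print_pytorch_profile(eager_fn, get_pytorch_input(size=(1, 384)))
    # plotted against itself the speedup curve is flat at 1.0, a quick sanity check of the plumbing
    plot_benchmarks("PyTorch eager baseline", latencies=eager_timings, baseline=eager_timings)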