File size: 7,801 Bytes

e0c2d04

#  Copyright 2022, Lefebvre Dalloz Services
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
import contextlib
import gc
import time
from dataclasses import dataclass
from typing import Callable, Dict, List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import torchdynamo
from matplotlib.axes import Axes
from matplotlib.figure import Figure
from onnxruntime import GraphOptimizationLevel
from tensorrt.tensorrt import Runtime
from torch._C._autograd import ProfilerActivity
from torchdynamo.eval_frame import OptimizeContext
from transformers import PreTrainedModel

from transformer_deploy.backends.ort_utils import create_model_for_provider, inference_onnx_binding
from transformer_deploy.backends.trt_utils import load_engine


seq_lengths = [16, 64, 128, 256, 384, 512]
batch_sizes = [1, 8, 16, 32, 64, 128, 256]
shapes_to_test: Dict[int, List[int]] = {b_s: seq_lengths for b_s in batch_sizes}


@dataclass
class BenchmarkOutput:
    latency: float
    output: Dict[str, torch.Tensor]


def get_pytorch_input(size: Tuple[int, int]) -> Dict[str, torch.Tensor]:
    return {
        "input_ids": torch.randint(2, 1000, size=size, dtype=torch.int32, device="cuda"),
        "attention_mask": torch.ones(size=size, dtype=torch.int32, device="cuda"),
    }


def benchmark(
    fn: Callable[[Dict[str, torch.Tensor]], BenchmarkOutput], shapes: Dict[int, List[int]] = shapes_to_test
) -> np.ndarray:
    gc.collect()  # delete all deletable objects so CUDA memory can be freed by empty_cache
    torch.cuda.empty_cache()
    timings: List[List[float]] = list()
    for is_warmup in [True, True, False]:
        for batch_size, seq_lens in shapes.items():
            batch_timings: List[float] = list()
            for seq_len in seq_lens:
                inputs = get_pytorch_input(size=(batch_size, seq_len))
                latencies = list()
                nb_retry = 5
                for _ in range(nb_retry):
                    results: BenchmarkOutput = fn(inputs)
                    latencies.append(results.latency)
                batch_timings.append(float(np.median(latencies)))
            if not is_warmup:
                timings.append(batch_timings)
    return np.array(timings)


def get_pytorch_inference(
    model: PreTrainedModel, context_managers: List[contextlib.contextmanager]
) -> Callable[[Dict[str, torch.Tensor]], BenchmarkOutput]:
    def fn(inputs: Dict[str, torch.Tensor]) -> BenchmarkOutput:
        context_managers_stack = contextlib.ExitStack()
        for cm in context_managers:
            context_managers_stack.enter_context(cm)
        with context_managers_stack:
            torch.cuda.synchronize()
            start = time.perf_counter()
            output = model(**inputs)
            torch.cuda.synchronize()
            timing = time.perf_counter() - start
        return BenchmarkOutput(latency=timing, output=output)

    return fn


def get_onnx_inference(
    onnx_path: str, optimization_level: GraphOptimizationLevel
) -> Callable[[Dict[str, torch.Tensor]], BenchmarkOutput]:
    onnx_session = create_model_for_provider(
        onnx_path, provider_to_use="CUDAExecutionProvider", optimization_level=optimization_level
    )
    onnx_binding = onnx_session.io_binding()

    def fn(inputs: Dict[str, torch.Tensor]) -> BenchmarkOutput:
        start = time.perf_counter()
        output = inference_onnx_binding(
            model_onnx=onnx_session, inputs=inputs, device="cuda", binding=onnx_binding, clone_tensor=False
        )
        timing = time.perf_counter() - start
        return BenchmarkOutput(latency=timing, output=output)

    return fn


def get_tensorrt_inference(runtime: Runtime, plan_path: str) -> Callable[[Dict[str, torch.Tensor]], BenchmarkOutput]:
    trt_inference = load_engine(runtime=runtime, engine_file_path=plan_path)

    def fn(inputs: Dict[str, torch.Tensor]) -> BenchmarkOutput:
        start = time.perf_counter()
        output = trt_inference(inputs)
        timing = time.perf_counter() - start
        return BenchmarkOutput(latency=timing, output=output)

    return fn


def check_output(
    fn: Callable[[Dict[str, torch.Tensor]], BenchmarkOutput],
    inputs: Dict[str, torch.Tensor],
    expected_outputs: Dict[str, torch.Tensor],
    atol: int = 1e-1,
) -> None:
    model_output: BenchmarkOutput = fn(inputs)
    for tensor_name in expected_outputs.keys():
        assert model_output.output[tensor_name].shape == expected_outputs[tensor_name].shape
        reference = expected_outputs[tensor_name]
        to_check = model_output.output[tensor_name].type_as(reference)  # to manage the case of float16
        assert torch.allclose(to_check, reference, atol=atol), f"{tensor_name} diff > {atol}"


# for tensorrt there is no way to provide min_shape, opt_shape,max_shape -> dynamic shape requires recompilation
# so better to use the true one
def get_dynamo_optimizer(
    name: str, dynamic_shape: bool = True, dynamo_cache_size: int = 64, reset_cache: bool = True
) -> OptimizeContext:
    # breaks in the graph in small parts, better performance, but no recompilation when size change!
    torchdynamo.config.dynamic_shapes = dynamic_shape
    torchdynamo.config.cache_size_limit = dynamo_cache_size
    if reset_cache:
        torchdynamo.reset()
    # to parameter nvfuser https://github.com/pytorch/pytorch/blob/release/1.12/torch/csrc/jit/codegen/cuda/README.md
    return torchdynamo.optimize(name)  # interesting fusers: nvfuser_ofi, nnc_ofi, fx2trt


def print_pytorch_profile(
    fn: Callable[[Dict[str, torch.Tensor]], BenchmarkOutput], inputs: Dict[str, torch.Tensor], row_limit: int = 20
) -> None:
    with torch.profiler.profile(activities=[ProfilerActivity.CUDA], profile_memory=True, with_flops=True) as prof:
        fn(inputs)
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=row_limit))


def plot_benchmarks(title: str, latencies: np.ndarray, baseline: np.ndarray, batches: List[int] = batch_sizes) -> None:
    sns.set_style("whitegrid")  # darkgrid, whitegrid, dark, white and ticks
    plt.rc("axes", titlesize=15)  # fontsize of the axes title
    plt.rc("axes", labelsize=14)  # fontsize of the x and y labels
    plt.rc("xtick", labelsize=13)  # fontsize of the tick labels
    plt.rc("ytick", labelsize=13)  # fontsize of the tick labels
    plt.rc("legend", fontsize=15)  # legend fontsize
    plt.rc("font", size=13)  # controls default text sizes

    colors = sns.color_palette("deep")
    batch_latencies = baseline / latencies
    fig, (ax1, ax2) = plt.subplots(
        2, 1, figsize=(10, 10), sharex=True, sharey=False
    )  # type: Tuple[Figure, Tuple[Axes, Axes]]

    plt.suptitle(f"{title}")
    plt.xticks(seq_lengths)
    ax1.set_title("effect of arithmetic intensity on speedup")
    ax2.set_title("throughput")
    ax1.set_ylabel("speedup over baseline")
    ax2.set_ylabel("# processed sequences per second")
    plt.xlabel("sequence length")

    for i in range(batch_latencies.shape[0]):
        batch = batches[i]
        ax1.plot(seq_lengths, batch_latencies[i], label=batch, color=colors[i])
        ax2.plot(seq_lengths, batch / latencies[i], label=batch, color=colors[i])

    ax1.legend(title="batch size")
    ax2.legend(title="batch size")

    plt.show()