# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0
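"""Benchmark generation latency and throughput for an LLM Foundry model.

Builds a Composer model from COMPOSER_MODEL_REGISTRY, optionally wraps it with
DeepSpeed kernel-injection inference, and times ``generate`` over the grid of
batch sizes, input lengths, and output lengths given in a YAML config.

Run as ``python <this script> <config.yaml> [dotlist overrides]``; CLI
overrides are merged into the YAML config via OmegaConf.
"""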

import sys
import time
from contextlib import nullcontext

import torch
from omegaconf import DictConfig
from omegaconf import OmegaConf as om

from llmfoundry import COMPOSER_MODEL_REGISTRY


def get_dtype(dtype: str):
    if dtype == 'fp32':
        return torch.float32
    elif dtype == 'fp16':
        return torch.float16
    elif dtype == 'bf16':
        return torch.bfloat16
    else:
        raise NotImplementedError(
            f'dtype {dtype} is not supported. '
            'We only support fp32, fp16, and bf16 currently.')


def compare_dtype(dtype: torch.dtype, param_dtype: torch.dtype):
    if dtype != param_dtype:
        raise ValueError(
            f'Expected dtype {dtype} but model parameter dtype is {param_dtype}. '
            'The requested dtype and the model dtype must match.')


def main(config: DictConfig):
    if config.device is not None:
        device = config.device
    else:
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    model_dtype = get_dtype(config.model_dtype)
    print(f'Using device={device} and dtype={model_dtype}...')

    if config.autocast_dtype is not None:
        autocast_dtype = get_dtype(config.autocast_dtype)
        autocast_context = torch.autocast(device, autocast_dtype)
        print(f'Using autocast with dtype={autocast_dtype}...')
    else:
        autocast_context = nullcontext()
        print('NOT using autocast...')

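    # Settings for deepspeed.init_inference (used below when use_deepspeed is
    # set): kernel injection swaps supported modules for DeepSpeed's fused
    # inference kernels; CUDA graphs are disabled; tp_size sets the
    # tensor-parallel degree.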
    inference_config = {
        'replace_with_kernel_inject': True,
        'dtype': model_dtype,
        'replace_method': 'auto',
        'enable_cuda_graph': False,
        'tensor_parallel': {
            'tp_size': 0
        },
    }

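    # Build the Composer model named in the config and unwrap the underlying
    # torch model, which is what generate() is called on below.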
    composer_model = COMPOSER_MODEL_REGISTRY[config.model.name](
        config.model, config.tokenizer)
    model = composer_model.model
    model.eval()

    if config.use_deepspeed:
        import deepspeed  # type: ignore
        model = deepspeed.init_inference(model, config=inference_config)

        # Check that DeepSpeed cast the parameters to the requested dtype
        # (inspecting the first parameter is sufficient).
        for _, p in model.named_parameters():
            compare_dtype(model_dtype, p.dtype)
            break
    else:
        model.to(device=device, dtype=model_dtype)

    n_params = sum(p.numel() for p in model.parameters())
    print(f'n_params is: {n_params}')

    print(
        'name, latency (s), throughput (tokens/s), latency_per_sequence_output_token (ms)'
    )
    print('=' * 75)

    for batch_size in config.batch_sizes:
        for input_length in config.input_lengths:
            for output_length in config.output_lengths:
                batch = torch.randint(0,
                                      config.model.vocab_size - 1,
                                      size=(batch_size,
                                            input_length)).to(device)

                # Pass eos_token_id=None and pad_token_id=None below so HF
                # generate ignores EOS/padding and always emits output_length
                # new tokens.
                batch = batch.to(torch.long)
                attention_mask = torch.ones_like(batch)

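                # Run num_warmup_batches untimed iterations, then reset the
                # clock and time the next num_batches iterations.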
                start_time = 0
                for i in range(config.num_batches + config.num_warmup_batches):
                    if i == config.num_warmup_batches:
                        torch.cuda.synchronize()
                        start_time = time.time()
                    with torch.no_grad():
                        with autocast_context:
                            model.generate(batch,
                                           max_new_tokens=output_length,
                                           use_cache=config.use_cache,
                                           attention_mask=attention_mask,
                                           eos_token_id=None,
                                           pad_token_id=None)

                torch.cuda.synchronize()
                mean_time = (time.time() - start_time) / config.num_batches

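                # Throughput counts generated tokens across the whole batch;
                # per-token latency is reported per sequence, in milliseconds.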
                num_output_tokens = output_length * batch_size
                tokens_per_second = num_output_tokens / mean_time
                ms_per_seq_output_token = mean_time * 1000 / output_length

                run_name = f'{config.benchmark_name}_{batch_size}_{input_length}_{output_length}'
                print(
                    f'{run_name}, {mean_time:.3f}, {tokens_per_second:.3f}, {ms_per_seq_output_token:.3f}'
                )


if __name__ == '__main__':
    yaml_path, args_list = sys.argv[1], sys.argv[2:]
    with open(yaml_path) as f:
        yaml_config = om.load(f)
    cli_config = om.from_cli(args_list)
    config = om.merge(yaml_config, cli_config)
    assert isinstance(config, DictConfig)
    main(config)