# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0
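"""Benchmark latency and throughput of HuggingFace `generate` for an LLM Foundry model.

The script builds a model from COMPOSER_MODEL_REGISTRY, optionally wraps it with
DeepSpeed inference, then sweeps over the configured batch sizes, input lengths,
and output lengths, printing latency (s), throughput (tokens/s), and milliseconds
per output token for each combination.
"""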

import sys
import time
from contextlib import nullcontext

import torch
from omegaconf import DictConfig
from omegaconf import OmegaConf as om

from llmfoundry import COMPOSER_MODEL_REGISTRY


def get_dtype(dtype: str):
    if dtype == 'fp32':
        return torch.float32
    elif dtype == 'fp16':
        return torch.float16
    elif dtype == 'bf16':
        return torch.bfloat16
    else:
        raise NotImplementedError(
            f'dtype {dtype} is not supported. ' +
            f'We only support fp32, fp16, and bf16 currently')


def compare_dtype(dtype: torch.dtype, param_dtype: torch.dtype):
    if dtype != param_dtype:
        raise ValueError(
            f'Expected dtype is: {dtype} but model dtype is: {param_dtype}. ' +
            f"The expected dtype and model dtype don't match.")


def main(config: DictConfig):
    if config.device is not None:
        device = config.device
    else:
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    model_dtype = get_dtype(config.model_dtype)
    print(f'Using device={device} and dtype={model_dtype}...')

    if config.autocast_dtype is not None:
        autocast_dtype = get_dtype(config.autocast_dtype)
        autocast_context = torch.autocast(device, autocast_dtype)
        print(f'Using autocast with dtype={autocast_dtype}...')
    else:
        autocast_context = nullcontext()
        print('NOT using autocast...')

    inference_config = {
        'replace_with_kernel_inject': True,
        'dtype': model_dtype,
        'replace_method': 'auto',
        'enable_cuda_graph': False,
        'tensor_parallel': {
            'tp_size': 0
        },
    }

    composer_model = COMPOSER_MODEL_REGISTRY[config.model.name](
        config.model, config.tokenizer)
    model = composer_model.model
    model.eval()

    if config.use_deepspeed:
        import deepspeed  # type: ignore
        model = deepspeed.init_inference(model, config=inference_config)

        # Check that DeepSpeed cast the parameters to the requested dtype
        for _, p in model.named_parameters():
            compare_dtype(model_dtype, p.dtype)
            break
    else:
        model.to(device=device, dtype=model_dtype)

    n_params = sum(p.numel() for p in model.parameters())
    print('n_params is: ', n_params)

    print(
        'name, latency (s), throughput (tokens/s), latency_per_sequence_output_token (ms)'
    )
    print('=' * 75)

    for batch_size in config.batch_sizes:
        for input_length in config.input_lengths:
            for output_length in config.output_lengths:
                # Random token ids stand in for real prompts
                batch = torch.randint(0,
                                      config.model.vocab_size - 1,
                                      size=(batch_size,
                                            input_length)).to(device)

                # eos_token_id and pad_token_id are passed as None below so
                # HF generate never stops early at an EOS token and always
                # produces exactly `output_length` new tokens
                batch = batch.to(torch.long)
                attention_mask = torch.ones_like(batch)

                start_time = 0
                for i in range(config.num_batches + config.num_warmup_batches):
                    # Start timing only once the warmup batches have run
                    if i == config.num_warmup_batches:
                        torch.cuda.synchronize()
                        start_time = time.time()
                    with torch.no_grad():
                        with autocast_context:
                            model.generate(batch,
                                           max_new_tokens=output_length,
                                           use_cache=config.use_cache,
                                           attention_mask=attention_mask,
                                           eos_token_id=None,
                                           pad_token_id=None)
                torch.cuda.synchronize()

                mean_time = (time.time() - start_time) / config.num_batches
                num_output_tokens = output_length * batch_size
                tokens_per_second = num_output_tokens / mean_time
                ms_per_seq_output_token = mean_time * 1000 / output_length

                run_name = f'{config.benchmark_name}_{batch_size}_{input_length}_{output_length}'
                print(
                    f'{run_name}, {mean_time:.3f}, {tokens_per_second:.3f}, {ms_per_seq_output_token:.3f}'
                )
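
# The sketch below is an illustrative example of the YAML this script expects.
# The keys mirror the config fields read in main() above, but the values (and
# the file name benchmark.yaml) are only assumptions for illustration, not a
# config shipped with the repo.
#
#   benchmark_name: hf_generate
#   device: null            # falls back to cuda:0 if available, else cpu
#   model_dtype: bf16       # one of fp32, fp16, bf16
#   autocast_dtype: null    # set to fp16/bf16 to wrap generate in torch.autocast
#   use_deepspeed: false
#   use_cache: true
#   num_warmup_batches: 3
#   num_batches: 5
#   batch_sizes: [1, 8]
#   input_lengths: [128]
#   output_lengths: [8, 256]
#   model:
#     name: mpt_causal_lm   # must be a key in COMPOSER_MODEL_REGISTRY
#     vocab_size: 50368     # used to sample random input token ids
#     ...
#   tokenizer:
#     ...
#
# Example invocation (hypothetical paths; CLI args override the YAML):
#   python benchmark.py benchmark.yaml batch_sizes='[1]'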


if __name__ == '__main__':
    yaml_path, args_list = sys.argv[1], sys.argv[2:]
    with open(yaml_path) as f:
        yaml_config = om.load(f)
    cli_config = om.from_cli(args_list)
    config = om.merge(yaml_config, cli_config)
    assert isinstance(config, DictConfig)
    main(config)