# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0
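"""Benchmark generation latency and throughput for an LLM Foundry model.

Builds a Composer model from COMPOSER_MODEL_REGISTRY, optionally wraps it with
DeepSpeed kernel-injection inference, and times ``generate`` over the grid of
batch sizes, input lengths, and output lengths given in a YAML config.

Run as ``python <this script> <config.yaml> [dotlist overrides]``; CLI
overrides are merged into the YAML config via OmegaConf.
"""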

import sys
import time
from contextlib import nullcontext

import torch
from omegaconf import DictConfig
from omegaconf import OmegaConf as om

from llmfoundry import COMPOSER_MODEL_REGISTRY


def get_dtype(dtype: str):
    if dtype == 'fp32':
        return torch.float32
    elif dtype == 'fp16':
        return torch.float16
    elif dtype == 'bf16':
        return torch.bfloat16
    else:
        raise NotImplementedError(
            f'dtype {dtype} is not supported. '
            'We only support fp32, fp16, and bf16 currently.')


def compare_dtype(dtype: torch.dtype, param_dtype: torch.dtype):
    if dtype != param_dtype:
        raise ValueError(
            f'Expected dtype {dtype} but model parameter dtype is {param_dtype}. '
            'The requested dtype and the model dtype must match.')


def main(config: DictConfig):
    if config.device is not None:
        device = config.device
    else:
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    model_dtype = get_dtype(config.model_dtype)
    print(f'Using device={device} and dtype={model_dtype}...')

    if config.autocast_dtype is not None:
        autocast_dtype = get_dtype(config.autocast_dtype)
        autocast_context = torch.autocast(device, autocast_dtype)
        print(f'Using autocast with dtype={autocast_dtype}...')
    else:
        autocast_context = nullcontext()
        print('NOT using autocast...')

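    # Settings for deepspeed.init_inference (used below when use_deepspeed is
    # set): kernel injection swaps supported modules for DeepSpeed's fused
    # inference kernels; CUDA graphs are disabled; tp_size sets the
    # tensor-parallel degree.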
    inference_config = {
        'replace_with_kernel_inject': True,
        'dtype': model_dtype,
        'replace_method': 'auto',
        'enable_cuda_graph': False,
        'tensor_parallel': {
            'tp_size': 0
        },
    }

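    # Build the Composer model named in the config and unwrap the underlying
    # torch model, which is what generate() is called on below.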
    composer_model = COMPOSER_MODEL_REGISTRY[config.model.name](
        config.model, config.tokenizer)
    model = composer_model.model
    model.eval()

    if config.use_deepspeed:
        import deepspeed  # type: ignore
        model = deepspeed.init_inference(model, config=inference_config)

        # Check that DeepSpeed cast the parameters to the requested dtype
        # (inspecting the first parameter is sufficient).
        for _, p in model.named_parameters():
            compare_dtype(model_dtype, p.dtype)
            break
    else:
        model.to(device=device, dtype=model_dtype)

    n_params = sum(p.numel() for p in model.parameters())
    print(f'n_params is: {n_params}')

    print(
        'name, latency (s), throughput (tokens/s), latency_per_sequence_output_token (ms)'
    )
    print('=' * 75)

    for batch_size in config.batch_sizes:
        for input_length in config.input_lengths:
            for output_length in config.output_lengths:
                batch = torch.randint(0,
                                      config.model.vocab_size - 1,
                                      size=(batch_size,
                                            input_length)).to(device)

                # Pass eos_token_id=None and pad_token_id=None below so HF
                # generate ignores EOS/padding and always emits output_length
                # new tokens.
                batch = batch.to(torch.long)
                attention_mask = torch.ones_like(batch)

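                # Run num_warmup_batches untimed iterations, then reset the
                # clock and time the next num_batches iterations.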
                start_time = 0
                for i in range(config.num_batches + config.num_warmup_batches):
                    if i == config.num_warmup_batches:
                        torch.cuda.synchronize()
                        start_time = time.time()
                    with torch.no_grad():
                        with autocast_context:
                            model.generate(batch,
                                           max_new_tokens=output_length,
                                           use_cache=config.use_cache,
                                           attention_mask=attention_mask,
                                           eos_token_id=None,
                                           pad_token_id=None)

                torch.cuda.synchronize()
                mean_time = (time.time() - start_time) / config.num_batches

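                # Throughput counts generated tokens across the whole batch;
                # per-token latency is reported per sequence, in milliseconds.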
                num_output_tokens = output_length * batch_size
                tokens_per_second = num_output_tokens / mean_time
                ms_per_seq_output_token = mean_time * 1000 / output_length

                run_name = f'{config.benchmark_name}_{batch_size}_{input_length}_{output_length}'
                print(
                    f'{run_name}, {mean_time:.3f}, {tokens_per_second:.3f}, {ms_per_seq_output_token:.3f}'
                )


if __name__ == '__main__':
    yaml_path, args_list = sys.argv[1], sys.argv[2:]
    with open(yaml_path) as f:
        yaml_config = om.load(f)
    cli_config = om.from_cli(args_list)
    config = om.merge(yaml_config, cli_config)
    assert isinstance(config, DictConfig)
    main(config)