File size: 5,582 Bytes
3943768
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123


# %%
import json

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

# %%
# Load the raw benchmark records.
# Input: llm_gpu_benchmarks.json, opened relative to the current working directory.
# The file is generated using the command
# curl  -sSL https://raw.githubusercontent.com/h2oai/h2ogpt/main/benchmarks/perf.json | jq -s '.' > llm_gpu_benchmarks.json
# The encoding is passed explicitly so the read does not depend on the platform's locale default.
with open('llm_gpu_benchmarks.json', encoding='utf-8') as f:
    data = json.load(f)
del f  # keep the module namespace clean once the file is closed

# %%
# Flatten the parsed JSON records into a dataframe
df = pd.json_normalize(data)
del data  # the parsed records are no longer needed once the dataframe exists

# %%
# Process the dataframe
# Drop columns that are not needed
df.drop(columns=['task', 'ngpus', 'reps', 'date', 'git_sha', 'transformers', 'bitsandbytes', 'cuda', 'hostname',
                 'summarize_input_len_bytes'], inplace=True)
# Rename columns
df.rename(columns={'n_gpus': 'gpu_count'}, inplace=True)

# Split the free-text gpus column into gpu_name and gpu_memory_gb.
# Rows whose gpus value does not match the patterns get a missing value in the corresponding column.
df["gpu_name"] = df["gpus"].str.extract(r'[1-9] x ([\w\- ]+) .+', expand=False)
# The number in parentheses is divided by 1024 and rounded to a whole number
# (presumably MiB -> GiB; confirm against the input data). Int64 keeps missing values as <NA>.
df["gpu_memory_gb"] = (
    pd.to_numeric(df["gpus"].str.extract(r'[\w ]+ \(([\d]+) .+', expand=False), errors='coerce') / 1024
).round().astype('Int64')
df.drop(columns=['gpus'], inplace=True)

# Manage gpu_names
# regex=False makes these plain substring replacements regardless of the pandas version's default for `regex`
df["gpu_name"] = df["gpu_name"].str.replace('NVIDIA ', '', regex=False)
df["gpu_name"] = df["gpu_name"].str.replace('GeForce ', '', regex=False)
df["gpu_name"] = df["gpu_name"].str.replace('A100-SXM4-80GB', 'A100 SXM4', regex=False)
# Prefix the name with the memory size; a missing gpu_name stays missing after the concatenation
df["gpu_name"] = df["gpu_memory_gb"].astype(str) + "-" + df["gpu_name"]

# Remove CPUs (rows for which no GPU name could be extracted)
df.drop(index=df.index[df["gpu_name"].isnull()], inplace=True)

# %%
# Remove duplicate rows: keep the first row for each backend / model / bits / GPU count / GPU name combination
df.drop_duplicates(subset=['backend', 'base_model', 'bits', 'gpu_count', 'gpu_name'], inplace=True)

# %% Add baseline comparison columns
# CPU reference throughput. The CPU results for 4, 8, and 16 bit quantization are simplified to a single
# value per task.
cpu_summary_out_throughput = 1353 / 1216  # bytes/second  (calculated from summarize_output_len_bytes / summarize_time)
cpu_generate_out_throughput = 849 / 180  # bytes/second   (calculated from generate_output_len_bytes / generate_time)

# GPU throughput: output bytes divided by the time taken for the task
df["summary_out_throughput"] = df["summarize_output_len_bytes"].div(df["summarize_time"])
df["generate_out_throughput"] = df["generate_output_len_bytes"].div(df["generate_time"])
# GPU throughput expressed as a multiple of the CPU reference value
df["summary_out_throughput_normalize"] = df["summary_out_throughput"].div(cpu_summary_out_throughput)
df["generate_out_throughput_normalize"] = df["generate_out_throughput"].div(cpu_generate_out_throughput)

# %%
# Optional spreadsheet export of the processed dataframe (disabled)
# df.to_excel('tmp/scratchpad/output/llm_gpu_benchmarks.xlsx', index=False)

# %%
# Use the browser renderer as the default for displaying figures
pio.renderers.default = "browser"

# %%
# One bar colour per quantization bit width, keyed by the bit width as a string
bits_bar_colors = {str(bit_width): color
                   for bit_width, color in zip((4, 8, 16), px.colors.qualitative.D3)}

# Distinct values, in order of first appearance, that drive the figure layout
backends = list(df["backend"].unique())
base_models = list(df["base_model"].unique())
n_gpus = list(df["gpu_count"].unique())

# %%
# Build one HTML report per backend: a grid of horizontal grouped bar charts with one row per GPU count and
# two columns per base model (summarization first, then generation). Bar lengths are the throughput values
# normalized by the CPU reference, and bars are coloured by quantization bit width.
def _short_model_name(model_id, prefix='h2ogpt-4096-'):
    """Return a compact display name, e.g. 'h2oai/h2ogpt-4096-llama2-7b-chat' -> 'llama2-7b-chat'."""
    name = str(model_id).rsplit('/', 1)[-1]
    return name[len(prefix):] if name.startswith(prefix) else name


# Column titles are derived from base_models so they always match the number and order of the columns
column_titles = [f'{_short_model_name(model_id)} {task}'
                 for model_id in base_models
                 for task in ('Summarization', 'Generation')]
# The set of bit widths is the same for every subplot, so compute it once
bit_widths = sorted(df.bits.unique())

for backend in backends:
    # for backend in ['transformers']:
    backend_df = df[df.backend == backend]
    fig_bar = make_subplots(rows=len(n_gpus),
                            cols=len(base_models) * 2,
                            shared_xaxes='all',
                            shared_yaxes='columns',
                            start_cell="top-left",
                            vertical_spacing=0.1,
                            print_grid=False,
                            row_titles=[f'{gpu_count} GPUs' for gpu_count in n_gpus],
                            column_titles=column_titles)

    # Legend groups that already have a visible legend entry in this figure; later traces of the same
    # group stay toggleable through the group but do not add a duplicate legend row
    legend_groups_shown = set()

    # for base_model in ['h2oai/h2ogpt-4096-llama2-7b-chat']:
    for model_pos, base_model in enumerate(base_models):
        for row_pos, gpu_count in enumerate(n_gpus):
            for bits in bit_widths:
                sub_df = backend_df[(backend_df.base_model == base_model) &
                                    (backend_df.gpu_count == gpu_count) &
                                    (backend_df.bits == bits)].sort_values(by='gpu_name')
                if sub_df.empty:
                    # Nothing measured for this combination: no bars to draw
                    continue
                # Summarization goes in the model's first column, generation in its second
                for col_offset, (prefix, x_values) in enumerate(
                        (('sum', sub_df.summary_out_throughput_normalize),
                         ('gen', sub_df.generate_out_throughput_normalize)), start=1):
                    legend_group = f'{prefix}-{bits} bits'
                    fig_bar.add_trace(go.Bar(x=x_values,
                                             y=sub_df.gpu_name,
                                             name=legend_group,
                                             legendgroup=legend_group,
                                             showlegend=legend_group not in legend_groups_shown,
                                             marker=dict(color=bits_bar_colors[f'{bits}']),
                                             orientation='h'),
                                      row=row_pos + 1,
                                      col=model_pos * 2 + col_offset)
                    legend_groups_shown.add(legend_group)

    fig_bar.update_layout(plot_bgcolor='rgb(250,250,250)',
                          showlegend=True,
                          barmode="group")
    # fig_bar.show()
    # plotly.js is referenced from the CDN rather than embedded in each HTML file
    fig_bar.write_html(f'llm_gpu_benchmark_{backend}.html', include_plotlyjs='cdn')