|
import gradio as gr |
|
import pandas as pd |
|
import plotly.express as px |
|
|
|
|
|
# Columns attached to every point of the custom-kernel box plots as plotly
# ``custom_data``; the hover templates list them in exactly this order, so the
# position of each entry matters.
CUSTOM_KERNELS_DATA = [
    # model identity
    "Model π€",
    "Arch ποΈ",
    "DType π₯",
    "Backend π",
    "Params (B)",
    "Open LLM Score (%)",
    # run configuration ("DType" and "Backend" are listed a second time here)
    "DType π₯",
    "Backend π",
    "Optimization π οΈ",
    "Quantization ποΈ",
    "Optimization π οΈ Custom Kernel",
    "Quantization ποΈ Custom Kernel",
    # raw measurements: baseline value, then its " Custom Kernel" counterpart
    "Prefill Latency (s)",
    "Prefill Latency (s) Custom Kernel",
    "Decode Throughput (tokens/s)",
    "Decode Throughput (tokens/s) Custom Kernel",
    # derived speedups
    "Prefill Latency Speedup (%)",
    "Decode Throughput Speedup (%)",
]
|
|
|
|
|
def get_custom_kernels_df(llm_perf_df):
    """Pair every custom-kernel run with its vanilla baseline and compute speedups.

    The baseline rows are those with backend ``"pytorch"``, dtype
    ``"float16"`` and both quantization and optimization equal to the string
    ``"None"``. Each baseline row is joined, on the model column, with the
    rows of the same model whose quantization is one of the four
    custom-kernel schemes. Columns coming from the custom-kernel row that
    clash with a baseline column get the suffix ``" Custom Kernel"``.

    Args:
        llm_perf_df: DataFrame holding at least the model, backend, dtype,
            optimization, quantization, ``"Prefill Latency (s)"`` and
            ``"Decode Throughput (tokens/s)"`` columns. It is not modified.

    Returns:
        A new DataFrame with one row per (baseline, custom-kernel) pair plus
        ``"Prefill Latency Speedup (%)"`` and
        ``"Decode Throughput Speedup (%)"``, both rounded to two decimals.
        Pairs whose speedup is not below 1000 % (including NaN/inf results)
        are dropped.
    """
    # Mask selection and ``pd.merge`` both return new frames, so the caller's
    # frame is never written to and no up-front copy is needed.
    vanilla_df = llm_perf_df[
        (llm_perf_df["Backend π"] == "pytorch")
        & (llm_perf_df["Quantization ποΈ"] == "None")
        & (llm_perf_df["Optimization π οΈ"] == "None")
        & (llm_perf_df["DType π₯"] == "float16")
    ]

    # The tuple order is the concatenation order of the result, which in turn
    # drives the order in which schemes first appear downstream.
    kernel_schemes = (
        "GPTQ.4bit+ExllamaV1",
        "GPTQ.4bit+ExllamaV2",
        "AWQ.4bit+GEMM",
        "AWQ.4bit+GEMV",
    )
    paired_frames = [
        pd.merge(
            vanilla_df,
            llm_perf_df[llm_perf_df["Quantization ποΈ"] == scheme],
            on=["Model π€"],
            suffixes=("", " Custom Kernel"),
        )
        for scheme in kernel_schemes
    ]
    custom_kernels_df = pd.concat(paired_frames)

    # Latency: lower is better, so the baseline goes in the numerator.
    prefill_ratio = (
        custom_kernels_df["Prefill Latency (s)"]
        / custom_kernels_df["Prefill Latency (s) Custom Kernel"]
    )
    # Throughput: higher is better, so the custom kernel goes in the numerator.
    decode_ratio = (
        custom_kernels_df["Decode Throughput (tokens/s) Custom Kernel"]
        / custom_kernels_df["Decode Throughput (tokens/s)"]
    )

    # Round AFTER subtracting 100: rounding first and subtracting afterwards
    # re-introduces float noise (e.g. 123.46 - 100 -> 23.459999999999994).
    custom_kernels_df["Prefill Latency Speedup (%)"] = (prefill_ratio * 100 - 100).round(2)
    custom_kernels_df["Decode Throughput Speedup (%)"] = (decode_ratio * 100 - 100).round(2)

    # Discard implausible outliers; NaN and inf compare False and go too.
    plausible = (custom_kernels_df["Prefill Latency Speedup (%)"] < 1000) & (
        custom_kernels_df["Decode Throughput Speedup (%)"] < 1000
    )
    return custom_kernels_df[plausible]
|
|
|
|
|
def get_custom_kernels_decode_fig(llm_perf_df):
    """Build the box plot of decode-throughput speedup per architecture.

    The data comes from ``get_custom_kernels_df(llm_perf_df)``. Boxes are
    placed along the architecture column, colored by the custom-kernel
    quantization column, and every individual point is drawn. All columns in
    ``CUSTOM_KERNELS_DATA`` are attached as custom data and listed in the
    hover tooltip.

    Returns:
        The configured plotly figure.
    """
    speedup_df = get_custom_kernels_df(llm_perf_df)

    # One "<b>column:</b> value" entry per custom-data column, in list order.
    hover_entries = []
    for position, column in enumerate(CUSTOM_KERNELS_DATA):
        hover_entries.append("<b>{}:</b> %{{customdata[{}]}}".format(column, position))
    hover_template = "<br>".join(hover_entries)

    fig = px.box(
        speedup_df,
        x="Arch ποΈ",
        y="Decode Throughput Speedup (%)",
        color="Quantization ποΈ Custom Kernel",
        points="all",
        custom_data=CUSTOM_KERNELS_DATA,
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    fig.update_traces(hovertemplate=hover_template)

    # Title centered horizontally and anchored near the top of the figure.
    centered_title = dict(
        text="Decode Throughput Speedup per Architecture",
        y=0.95,
        x=0.5,
        xanchor="center",
        yanchor="top",
    )
    fig.update_layout(
        title=centered_title,
        xaxis_title="LLM Architecture",
        yaxis_title="Decode Speedup (%)",
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )

    return fig
|
|
|
|
|
def get_custom_kernels_prefill_fig(llm_perf_df):
    """Build the box plot of prefill-latency speedup per architecture.

    The data comes from ``get_custom_kernels_df(llm_perf_df)``. Boxes are
    placed along the architecture column, colored by the custom-kernel
    quantization column, and every individual point is drawn. All columns in
    ``CUSTOM_KERNELS_DATA`` are attached as custom data and listed in the
    hover tooltip.

    Returns:
        The configured plotly figure.
    """
    speedup_df = get_custom_kernels_df(llm_perf_df)

    box_options = {
        "x": "Arch ποΈ",
        "y": "Prefill Latency Speedup (%)",
        "color_discrete_sequence": px.colors.qualitative.Light24,
        "custom_data": CUSTOM_KERNELS_DATA,
        "color": "Quantization ποΈ Custom Kernel",
        "points": "all",
    }
    fig = px.box(speedup_df, **box_options)

    # Tooltip: one "<b>column:</b> value" line per custom-data column.
    tooltip = "<br>".join(
        "<b>" + column + ":</b> %{customdata[" + str(index) + "]}"
        for index, column in enumerate(CUSTOM_KERNELS_DATA)
    )
    fig.update_traces(hovertemplate=tooltip)

    layout_options = {
        # Title centered horizontally and anchored near the top of the figure.
        "title": {
            "text": "Prefill Latency Speedup per Architecture",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        "xaxis_title": "LLM Architecture",
        "yaxis_title": "Prefill Speedup (%)",
        "legend_title": "Quantization Scheme",
        "width": 1200,
        "height": 600,
    }
    fig.update_layout(**layout_options)

    return fig
|
|
|
|
|
def create_custom_kernels_plots(llm_perf_df):
    """Create the hover hint and the two custom-kernel speedup plot components.

    A ``gr.HTML`` hint is created first, then both figures are built from
    ``llm_perf_df``, and finally one ``gr.components.Plot`` per figure is
    created (prefill before decode).

    Returns:
        A ``(prefill_plot, decode_plot)`` tuple of gradio plot components.
    """
    gr.HTML("π Hover over the points π for additional information.", elem_id="text")

    # Build both figures before instantiating any plot component.
    figures = (
        get_custom_kernels_prefill_fig(llm_perf_df),
        get_custom_kernels_decode_fig(llm_perf_df),
    )

    # Both plots share the same element id and hide their label.
    shared_options = {"elem_id": "plot", "show_label": False}
    prefill_plot = gr.components.Plot(value=figures[0], **shared_options)
    decode_plot = gr.components.Plot(value=figures[1], **shared_options)

    return prefill_plot, decode_plot
|
|