Spaces:
Running
Running
import gradio as gr | |
import pandas as pd | |
import plotly.express as px | |
# Columns attached to every plotted point as Plotly ``custom_data`` and listed,
# in this order, by the hover templates that enumerate this list. Each label is
# used as a dataframe column name, so it must match the column exactly, and it
# must appear only once (a repeated label is rendered twice in the tooltip).
# NOTE(review): the emoji suffixes in several labels look mis-encoded in this
# copy of the file -- confirm them against the real dataframe column names.
EXLLAMA_DATA = [
    # open llm
    "Model π€",
    "Arch ποΈ",
    "Params (B)",
    "Open LLM Score (%)",
    # deployment settings
    "DType π₯",
    "Backend π",
    "Quantization ποΈ",
    # primary measurements (vanilla GPTQ vs. the " Exllama"-suffixed counterpart)
    "Prefill Latency (s)",
    "Prefill Latency (s) Exllama",
    "Decode Throughput (tokens/s)",
    "Decode Throughput (tokens/s) Exllama",
    "E2E Throughput (tokens/s)",
    "E2E Throughput (tokens/s) Exllama",
    # speedups
    "Prefill Latency Speedup (%)",
    "Decode Throughput Speedup (%)",
]
def get_exllama_df(llm_perf_df: pd.DataFrame) -> pd.DataFrame:
    """Pair vanilla GPTQ runs with their Exllama counterparts and compute speedups.

    Rows whose quantization label is ``"GPTQ.4bit"`` are the baseline. They are
    inner-merged on the model column with the ``"GPTQ.4bit+ExllamaV1"`` rows and,
    separately, with the ``"GPTQ.4bit+ExllamaV2"`` rows; the two merge results
    are then stacked row-wise. Columns present on both sides keep their name
    for the baseline and get an ``" Exllama"`` suffix for the Exllama run.

    Args:
        llm_perf_df: Benchmark results. The code reads the model, quantization,
            ``"Prefill Latency (s)"`` and ``"Decode Throughput (tokens/s)"``
            columns. The frame is not modified.

    Returns:
        A new dataframe with one row per (baseline, Exllama) pair, in which

        * the quantization column holds the Exllama variant's label,
        * ``"Prefill Latency Speedup (%)"`` is
          ``(baseline latency / Exllama latency - 1) * 100``,
        * ``"Decode Throughput Speedup (%)"`` is
          ``(Exllama throughput / baseline throughput - 1) * 100``,

        both rounded to two decimals. Only rows where both speedups are below
        1000 % are kept.
    """
    quantization = llm_perf_df["Quantization ποΈ"]

    # separate the vanilla GPTQ baseline from the Exllama experiments
    gptq_df = llm_perf_df[quantization == "GPTQ.4bit"]

    # merge the baseline with each Exllama variant, then stack the results row-wise
    exllama_df = pd.concat(
        [
            pd.merge(
                gptq_df,
                llm_perf_df[quantization == scheme],
                on=["Model π€"],
                suffixes=["", " Exllama"],
            )
            for scheme in ("GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2")
        ]
    )

    # report the Exllama variant, not the shared baseline, as the row's scheme
    exllama_df["Quantization ποΈ"] = exllama_df["Quantization ποΈ Exllama"]

    # compute speedups; round last so that no later arithmetic reintroduces
    # floating-point noise into the two-decimal values
    exllama_df["Prefill Latency Speedup (%)"] = (
        (exllama_df["Prefill Latency (s)"] / exllama_df["Prefill Latency (s) Exllama"] - 1) * 100
    ).round(2)
    exllama_df["Decode Throughput Speedup (%)"] = (
        (exllama_df["Decode Throughput (tokens/s) Exllama"] / exllama_df["Decode Throughput (tokens/s)"] - 1) * 100
    ).round(2)

    # keep only rows whose speedups are both below 1000 %
    # (rows with a NaN speedup fail the comparison and are dropped as well)
    below_cap = (exllama_df["Prefill Latency Speedup (%)"] < 1000) & (
        exllama_df["Decode Throughput Speedup (%)"] < 1000
    )
    return exllama_df[below_cap]
def get_exllama_decode_fig(llm_perf_df):
    """Return a box plot of the Exllama decode-throughput speedup per architecture.

    The speedup table comes from ``get_exllama_df``; boxes are coloured by the
    Exllama quantization scheme and every individual point is drawn.
    """
    speedup_df = get_exllama_df(llm_perf_df)

    # one tooltip line per EXLLAMA_DATA column, read back from the point's customdata
    hover_lines = []
    for position, label in enumerate(EXLLAMA_DATA):
        hover_lines.append(f"<b>{label}:</b> %{{customdata[{position}]}}")
    hover_template = "<br>".join(hover_lines)

    # centred title near the top of the figure
    title_spec = dict(
        text="Decode Throughput Speedup per Architecture",
        y=0.95,
        x=0.5,
        xanchor="center",
        yanchor="top",
    )

    figure = px.box(
        speedup_df,
        x="Arch ποΈ",
        y="Decode Throughput Speedup (%)",
        color="Quantization ποΈ Exllama",
        color_discrete_sequence=px.colors.qualitative.Light24,
        points="all",
        custom_data=EXLLAMA_DATA,
    )
    figure.update_traces(hovertemplate=hover_template)
    figure.update_layout(
        title=title_spec,
        xaxis_title="LLM Architecture",
        yaxis_title="Decode Speedup (%)",
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )
    return figure
def get_exllama_prefill_fig(llm_perf_df):
    """Return a box plot of the Exllama prefill-latency speedup per architecture.

    The speedup table comes from ``get_exllama_df``; boxes are coloured by the
    Exllama quantization scheme and every individual point is drawn.
    """
    speedup_df = get_exllama_df(llm_perf_df)

    # one tooltip line per EXLLAMA_DATA column, read back from the point's customdata
    hover_lines = []
    for position, label in enumerate(EXLLAMA_DATA):
        hover_lines.append(f"<b>{label}:</b> %{{customdata[{position}]}}")
    hover_template = "<br>".join(hover_lines)

    # centred title near the top of the figure
    title_spec = dict(
        text="Prefill Latency Speedup per Architecture",
        y=0.95,
        x=0.5,
        xanchor="center",
        yanchor="top",
    )

    figure = px.box(
        speedup_df,
        x="Arch ποΈ",
        y="Prefill Latency Speedup (%)",
        color="Quantization ποΈ Exllama",
        color_discrete_sequence=px.colors.qualitative.Light24,
        points="all",
        custom_data=EXLLAMA_DATA,
    )
    figure.update_traces(hovertemplate=hover_template)
    figure.update_layout(
        title=title_spec,
        xaxis_title="LLM Architecture",
        yaxis_title="Prefill Speedup (%)",
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )
    return figure
def create_exllama_plots(llm_perf_df):
    """Create the Exllama section: a hint line followed by the two speedup plots.

    Returns the ``(prefill_plot, decode_plot)`` pair of Gradio plot components.
    """
    # hint shown above the plots
    gr.HTML("π Hover over the points π for additional information.", elem_id="text")

    # build both figures first, then wrap each one in a plot component (prefill, then decode)
    figures = (
        get_exllama_prefill_fig(llm_perf_df),
        get_exllama_decode_fig(llm_perf_df),
    )
    prefill_plot, decode_plot = [
        gr.components.Plot(value=figure, elem_id="plot", show_label=False) for figure in figures
    ]
    return prefill_plot, decode_plot