import gradio as gr
import pandas as pd
import plotly.express as px

# Columns attached to every plotted point as ``customdata`` and listed, in this
# order, in the hover tooltip. "DType 📥" and "Backend 🏭" appear twice (once
# under each heading); the list is kept as-is so the tooltip layout is unchanged.
EXLLAMA_DATA = [
    # open llm
    "Model 🤗",
    "Arch 🏛️",
    "DType 📥",
    "Backend 🏭",
    "Params (B)",
    "Open LLM Score (%)",
    # deployment settings
    "DType 📥",
    "Backend 🏭",
    "Quantization 🗜️",
    # primary measurements
    "Prefill Latency (s)",
    "Prefill Latency (s) Exllama",
    "Decode Throughput (tokens/s)",
    "Decode Throughput (tokens/s) Exllama",
    "E2E Throughput (tokens/s)",
    "E2E Throughput (tokens/s) Exllama",
    # speedups
    "Prefill Latency Speedup (%)",
    "Decode Throughput Speedup (%)",
]

# Per-figure settings consumed by ``_build_speedup_fig``.
_DECODE_FIG_SPEC = {
    "y_column": "Decode Throughput Speedup (%)",
    "title": "Decode Throughput Speedup per Architecture",
    "yaxis_title": "Decode Speedup (%)",
}
_PREFILL_FIG_SPEC = {
    "y_column": "Prefill Latency Speedup (%)",
    "title": "Prefill Latency Speedup per Architecture",
    "yaxis_title": "Prefill Speedup (%)",
}


def get_exllama_df(llm_perf_df: pd.DataFrame) -> pd.DataFrame:
    """Pair every vanilla GPTQ run with its Exllama runs and compute speedups.

    Rows whose "Quantization 🗜️" is "GPTQ.4bit" are joined on "Model 🤗" with
    the rows quantized as "GPTQ.4bit+ExllamaV1" and "GPTQ.4bit+ExllamaV2".
    Overlapping columns coming from the Exllama side get an " Exllama" suffix.

    Args:
        llm_perf_df: Benchmark results. Must contain the "Model 🤗",
            "Quantization 🗜️", "Prefill Latency (s)" and
            "Decode Throughput (tokens/s)" columns. It is not modified.

    Returns:
        A new DataFrame with one row per (GPTQ run, Exllama run) pair, the
        "Quantization 🗜️" column overwritten with the Exllama scheme, and two
        extra columns, "Prefill Latency Speedup (%)" and
        "Decode Throughput Speedup (%)". Pairs whose speedup is not below
        1000% (including NaN speedups) are dropped.
    """
    quantization = llm_perf_df["Quantization 🗜️"]

    # separate vanilla GPTQ experiments from Exllama experiments; boolean
    # selection and merge both return new frames, so the input is never mutated
    gptq_df = llm_perf_df[quantization == "GPTQ.4bit"]

    # merge the baseline with each Exllama variant, then stack the results row-wise
    merged_frames = [
        pd.merge(
            gptq_df,
            llm_perf_df[quantization == scheme],
            on=["Model 🤗"],
            suffixes=("", " Exllama"),
        )
        for scheme in ("GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2")
    ]
    exllama_df = pd.concat(merged_frames)
    exllama_df["Quantization 🗜️"] = exllama_df["Quantization 🗜️ Exllama"]

    # compute speedups: latency is lower-is-better (baseline / exllama),
    # throughput is higher-is-better (exllama / baseline)
    exllama_df["Prefill Latency Speedup (%)"] = (
        (exllama_df["Prefill Latency (s)"] / exllama_df["Prefill Latency (s) Exllama"]) * 100
    ).round(2) - 100
    exllama_df["Decode Throughput Speedup (%)"] = (
        (exllama_df["Decode Throughput (tokens/s) Exllama"] / exllama_df["Decode Throughput (tokens/s)"]) * 100
    ).round(2) - 100

    # keep only speedups below 1000%
    keep = (exllama_df["Prefill Latency Speedup (%)"] < 1000) & (
        exllama_df["Decode Throughput Speedup (%)"] < 1000
    )
    return exllama_df[keep]


def _build_speedup_fig(exllama_df: pd.DataFrame, *, y_column: str, title: str, yaxis_title: str):
    """Build a per-architecture box plot of ``y_column`` from a speedup frame."""
    fig = px.box(
        exllama_df,
        x="Arch 🏛️",
        y=y_column,
        color_discrete_sequence=px.colors.qualitative.Light24,
        custom_data=EXLLAMA_DATA,
        color="Quantization 🗜️ Exllama",
        points="all",
    )
    # add hover data: one "<column>: <value>" entry per EXLLAMA_DATA column.
    # Plotly hover labels use the HTML-like "<br>" tag as the line separator.
    hover_template = "<br>".join(
        f"{column}: %{{customdata[{i}]}}" for i, column in enumerate(EXLLAMA_DATA)
    )
    fig.update_traces(hovertemplate=hover_template)
    # add layout
    fig.update_layout(
        title={
            "text": title,
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="LLM Architecture",
        yaxis_title=yaxis_title,
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )
    return fig


def get_exllama_decode_fig(llm_perf_df: pd.DataFrame):
    """Return the box plot of decode throughput speedup per architecture.

    Args:
        llm_perf_df: Benchmark results, as accepted by ``get_exllama_df``.

    Returns:
        The Plotly figure produced by ``px.box``.
    """
    return _build_speedup_fig(get_exllama_df(llm_perf_df), **_DECODE_FIG_SPEC)


def get_exllama_prefill_fig(llm_perf_df: pd.DataFrame):
    """Return the box plot of prefill latency speedup per architecture.

    Args:
        llm_perf_df: Benchmark results, as accepted by ``get_exllama_df``.

    Returns:
        The Plotly figure produced by ``px.box``.
    """
    return _build_speedup_fig(get_exllama_df(llm_perf_df), **_PREFILL_FIG_SPEC)


def create_exllama_plots(llm_perf_df: pd.DataFrame):
    """Create the descriptive text and the two Exllama speedup plot components.

    The Gradio components are instantiated here, so this is presumably meant
    to be called inside a Gradio layout context (TODO confirm against caller).

    Args:
        llm_perf_df: Benchmark results, as accepted by ``get_exllama_df``.

    Returns:
        A ``(prefill_plot, decode_plot)`` tuple of ``gr.components.Plot``.
    """
    # descriptive text
    gr.HTML("👆 Hover over the points 👆 for additional information.", elem_id="text")
    # get figures; the speedup frame is computed once and shared by both
    exllama_df = get_exllama_df(llm_perf_df)
    prefill_fig = _build_speedup_fig(exllama_df, **_PREFILL_FIG_SPEC)
    decode_fig = _build_speedup_fig(exllama_df, **_DECODE_FIG_SPEC)
    # create plots
    prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
    decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)
    return prefill_plot, decode_plot