|
""" |
|
Core module for data visualization components. |
|
""" |
|
|
|
import streamlit as st |
|
import plotly.express as px |
|
import pandas as pd |
|
from typing import Dict, List, Optional
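
# Minimal usage sketch (an assumption for illustration, not part of this module's
# tested API; "benchmarks.csv" is a hypothetical data source). The renderers below
# expect a DataFrame with the columns referenced throughout this file, plus a
# `filters` dict with the keys consumed by filter_dataframe():
#
#     df = pd.read_csv("benchmarks.csv")
#     filters = {"model": "All", "platform": "All", "device": "All",
#                "flash_attn": "All", "cache_type_k": "All", "cache_type_v": "All",
#                "pp_range": (None, None), "tg_range": (None, None),
#                "n_threads": (None, None), "n_gpu_layers": (None, None)}
#     render_performance_plots(df, filters)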
|
|
|
|
|
def create_performance_plot(
    df: pd.DataFrame, metric: str, title: str, hover_data: Optional[List[str]] = None
):
|
"""Create a performance comparison plot""" |
|
if df.empty: |
|
return None |
|
|
|
if hover_data is None: |
|
hover_data = [ |
|
"CPU Cores", |
|
"Peak Memory (GB)", |
|
"performance_score", |
|
"quant_factor", |
|
] |
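
    # Grouped bar chart: one bar per device, grouped and colored by platform.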
|
|
|
fig = px.bar( |
|
df, |
|
x="Device", |
|
y=metric, |
|
color="Platform", |
|
title=title, |
|
template="plotly_white", |
|
barmode="group", |
|
hover_data=hover_data, |
|
) |
|
fig.update_layout( |
|
xaxis_title="Device", |
|
        yaxis_title="Tokens/sec" if ("Token" in metric or "t/s" in metric) else metric,
|
legend_title="Platform", |
|
plot_bgcolor="white", |
|
height=400, |
|
) |
|
return fig |
|
|
|
|
|
def filter_dataframe(df: pd.DataFrame, filters: Dict) -> pd.DataFrame: |
|
"""Apply all filters to the dataframe""" |
|
if df.empty: |
|
return df |
|
|
|
filtered_df = df.copy() |
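
    # Categorical filters: the sentinel "All" leaves a dimension unfiltered.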
|
|
|
|
|
if filters["model"] != "All": |
|
filtered_df = filtered_df[filtered_df["Model ID"] == filters["model"]] |
|
if filters["platform"] != "All": |
|
filtered_df = filtered_df[filtered_df["Platform"] == filters["platform"]] |
|
if filters["device"] != "All": |
|
filtered_df = filtered_df[filtered_df["Device"] == filters["device"]] |
|
|
|
|
|
if filters["flash_attn"] != "All": |
|
filtered_df = filtered_df[filtered_df["flash_attn"] == filters["flash_attn"]] |
|
|
|
|
|
if filters["cache_type_k"] != "All": |
|
filtered_df = filtered_df[ |
|
filtered_df["cache_type_k"] == filters["cache_type_k"] |
|
] |
|
|
|
if filters["cache_type_v"] != "All": |
|
filtered_df = filtered_df[ |
|
filtered_df["cache_type_v"] == filters["cache_type_v"] |
|
] |
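
    # Numeric range filters: keep rows whose value falls within [min, max].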
|
|
|
|
|
pp_min, pp_max = filters["pp_range"] |
|
if pp_min is not None and pp_max is not None: |
|
pp_values = filtered_df["PP Config"] |
|
filtered_df = filtered_df[(pp_values >= pp_min) & (pp_values <= pp_max)] |
|
|
|
tg_min, tg_max = filters["tg_range"] |
|
if tg_min is not None and tg_max is not None: |
|
tg_values = filtered_df["TG Config"] |
|
filtered_df = filtered_df[(tg_values >= tg_min) & (tg_values <= tg_max)] |
|
|
|
n_threads_min, n_threads_max = filters["n_threads"] |
|
if n_threads_min is not None and n_threads_max is not None: |
|
n_threads = filtered_df["n_threads"] |
|
filtered_df = filtered_df[ |
|
(n_threads >= n_threads_min) & (n_threads <= n_threads_max) |
|
] |
|
|
|
n_gpu_layers_min, n_gpu_layers_max = filters["n_gpu_layers"] |
|
if n_gpu_layers_min is not None and n_gpu_layers_max is not None: |
|
n_gpu_layers = filtered_df["n_gpu_layers"] |
|
filtered_df = filtered_df[ |
|
(n_gpu_layers >= n_gpu_layers_min) & (n_gpu_layers <= n_gpu_layers_max) |
|
] |
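
    # The version filter is optional in the filters dict, hence the .get() below.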
|
|
|
|
|
if filters.get("Version") != "All" and filters.get("Version"): |
|
filtered_df = filtered_df[filtered_df["Version"] == filters["Version"]] |
|
|
|
return filtered_df |
|
|
|
|
|
def render_performance_plots(df: pd.DataFrame, filters: Dict): |
|
"""Render performance comparison plots""" |
|
if df.empty: |
|
st.warning("No data available for plotting.") |
|
return |
|
|
|
|
|
filtered_df = filter_dataframe(df, filters) |
|
if filtered_df.empty: |
|
st.warning("No data matches the selected filters for plotting.") |
|
return |
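
    # Collapse to one row per (Device, Platform) so each device renders as a
    # single grouped bar.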
|
|
|
|
|
agg_dict = { |
|
"Prompt Processing": "mean", |
|
"Token Generation": "mean", |
|
"performance_score": "mean", |
|
"quant_factor": "first", |
|
} |
|
|
|
|
|
if "Memory Usage (%)" in filtered_df.columns: |
|
agg_dict["Memory Usage (%)"] = "mean" |
|
if "Peak Memory (GB)" in filtered_df.columns: |
|
agg_dict["Peak Memory (GB)"] = "mean" |
|
|
|
|
|
if "CPU Cores" in filtered_df.columns: |
|
agg_dict["CPU Cores"] = "first" |
|
|
|
|
|
agg_dict.update( |
|
{ |
|
"PP Config": "first", |
|
"TG Config": "first", |
|
} |
|
) |
|
|
|
|
|
plot_group = filtered_df.groupby(["Device", "Platform"]).agg(agg_dict).reset_index() |
|
|
|
|
|
    # All aggregations above use a single function per column, so the grouped
    # frame keeps flat column names; only the throughput columns need renaming.
    column_mapping = {
        "Prompt Processing": "PP Avg (t/s)",
        "Token Generation": "TG Avg (t/s)",
    }
    plot_group = plot_group.rename(columns=column_mapping)
|
|
|
|
|
hover_data = [ |
|
"CPU Cores", |
|
"Peak Memory (GB)", |
|
"performance_score", |
|
"quant_factor", |
|
] |
|
|
|
|
|
tab1, tab2, tab3 = st.tabs( |
|
["Token Generation", "Prompt Processing", "Overall Score"] |
|
) |
|
|
|
with tab1: |
|
fig1 = create_performance_plot( |
|
plot_group, |
|
"TG Avg (t/s)", |
|
f"Token Generation (TG: {plot_group['TG Config'].iloc[0]})", |
|
hover_data=hover_data, |
|
) |
|
if fig1: |
|
st.plotly_chart(fig1, use_container_width=True) |
|
|
|
with tab2: |
|
fig2 = create_performance_plot( |
|
plot_group, |
|
"PP Avg (t/s)", |
|
f"Prompt Processing (PP: {plot_group['PP Config'].iloc[0]})", |
|
hover_data=hover_data, |
|
) |
|
if fig2: |
|
st.plotly_chart(fig2, use_container_width=True) |
|
|
|
with tab3: |
|
fig3 = create_performance_plot( |
|
plot_group, |
|
"performance_score", |
|
"Overall Performance Score (Normalized)", |
|
hover_data=hover_data, |
|
) |
|
if fig3: |
|
st.plotly_chart(fig3, use_container_width=True) |
|
|
|
|
|
def render_leaderboard_table(df: pd.DataFrame, filters: Dict): |
|
"""Render the leaderboard table with grouped and formatted data""" |
|
if df.empty: |
|
st.warning("No data available for the selected filters.") |
|
return |
|
|
|
|
|
filtered_df = filter_dataframe(df, filters) |
|
if filtered_df.empty: |
|
st.warning("No data matches the selected filters.") |
|
return |
|
|
|
|
|
    column_order = [
        # Scores
        "performance_score",
        "quant_factor",
        # Device / memory
        "Device",
        "Platform",
        "CPU Cores",
        "Total Memory (GB)",
        "Peak Memory (GB)",
        "Memory Usage (%)",
        # Throughput
        "PP Config",
        "PP Avg (t/s)",
        "PP Std (t/s)",
        "TG Config",
        "TG Avg (t/s)",
        "TG Std (t/s)",
        # Model / run configuration
        "Model ID",
        "Model Size",
        "n_threads",
        "flash_attn",
        "cache_type_k",
        "cache_type_v",
        "n_context",
        "n_batch",
        "n_ubatch",
        "Version",
    ]
|
|
|
|
|
grouping_cols = filters["grouping"] |
|
if not grouping_cols: |
|
grouping_cols = ["Model ID", "Device", "Platform"] |
|
|
|
|
|
    agg_dict = {
        col: agg
        for col, agg in {
            "Prompt Processing": ["mean", "std"],
            "Token Generation": ["mean", "std"],
            "Peak Memory (GB)": "mean",
            "Total Memory (GB)": "first",
            "CPU Cores": "first",
            "Model Size": "first",
            "Version": lambda x: ", ".join(sorted(set(x))),
            # str() each value, not the whole Series, before de-duplicating.
            "n_gpu_layers": lambda x: ", ".join(sorted({str(v) for v in x})),
            "performance_score": "mean",
            "quant_factor": "first",
        }.items()
        if col not in grouping_cols
    }
|
|
|
|
|
grouped_df = filtered_df.groupby(grouping_cols).agg(agg_dict).reset_index() |
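
    # Flatten the MultiIndex created by the list-valued aggregations,
    # e.g. ("Prompt Processing", "mean") -> "Prompt Processing (mean)".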
|
|
|
|
|
grouped_df.columns = [ |
|
col[0] if col[1] == "" else f"{col[0]} ({col[1]})" for col in grouped_df.columns |
|
] |
|
|
|
|
|
column_mapping = { |
|
"Prompt Processing (mean)": "PP Avg (t/s)", |
|
"Prompt Processing (std)": "PP Std (t/s)", |
|
"Token Generation (mean)": "TG Avg (t/s)", |
|
"Token Generation (std)": "TG Std (t/s)", |
|
"Memory Usage (%) (mean)": "Memory Usage (%)", |
|
"Peak Memory (GB) (mean)": "Peak Memory (GB)", |
|
"PP Config (first)": "PP Config", |
|
"TG Config (first)": "TG Config", |
|
"Model Size (first)": "Model Size", |
|
"CPU Cores (first)": "CPU Cores", |
|
"Total Memory (GB) (first)": "Total Memory (GB)", |
|
"n_threads (first)": "n_threads", |
|
"flash_attn (first)": "flash_attn", |
|
"cache_type_k (first)": "cache_type_k", |
|
"cache_type_v (first)": "cache_type_v", |
|
"n_context (first)": "n_context", |
|
"n_batch (first)": "n_batch", |
|
"n_ubatch (first)": "n_ubatch", |
|
"Version (<lambda>)": "Version", |
|
"performance_score (mean)": "Performance Score", |
|
"quant_factor (first)": "Quant Factor", |
|
} |
|
grouped_df = grouped_df.rename(columns=column_mapping) |
|
|
|
|
|
grouped_df = grouped_df.sort_values("Performance Score", ascending=False) |
|
|
|
|
|
visible_cols = filters["visible_columns"] |
|
if visible_cols: |
|
|
|
        # Map UI labels to post-aggregation column names; anything not listed
        # falls through unchanged via .get(col, col) below.
        column_name_mapping = {
            "Prompt Processing (mean)": "PP Avg (t/s)",
            "Token Generation (mean)": "TG Avg (t/s)",
            "Prompt Processing (std)": "PP Std (t/s)",
            "Token Generation (std)": "TG Std (t/s)",
            "Model": "Model ID",
        }
|
|
|
|
|
mapped_visible = {column_name_mapping.get(col, col) for col in visible_cols} |
|
mapped_grouping = { |
|
column_name_mapping.get(col, col) for col in filters["grouping"] |
|
} |
|
|
|
|
|
mapped_visible.add("Performance Score") |
|
mapped_visible.add("Quant Factor") |
|
|
|
|
|
all_cols = mapped_visible | mapped_grouping |
|
|
|
|
|
        # Order columns by the canonical column_order, then append any remaining
        # selected columns alphabetically.
        display_cols = [col for col in column_order if col in all_cols]
        display_cols.extend(sorted(all_cols - set(display_cols)))
|
    else:
        # Default view: the score columns plus the device block. Skip the raw
        # score names at the head of column_order; they were renamed above.
        display_cols = ["Performance Score", "Quant Factor"] + column_order[2:8]
|
|
|
|
|
    st.markdown("#### 📊 Benchmark Results")
    # Guard against columns that are absent after grouping: column_order lists
    # more columns than the aggregation actually produces.
    display_cols = [col for col in display_cols if col in grouped_df.columns]
    st.dataframe(
        grouped_df[display_cols],
        use_container_width=True,
        height=400,
    )
|
|
|
|
|
def render_device_rankings(df: pd.DataFrame): |
|
"""Render device rankings with detailed performance metrics.""" |
|
if df.empty: |
|
st.warning("No data available for device rankings.") |
|
return |
|
|
|
    def clean_device_id(device_id: str) -> str:
        """Strip the platform prefix (currently only "iOS/") from a normalized ID."""
        if device_id.startswith("iOS/"):
            return device_id[4:]
        return device_id
|
|
|
|
|
device_summary = ( |
|
df.groupby(["Normalized Device ID", "Platform"]) |
|
.agg( |
|
{ |
|
"performance_score": "max", |
|
"Model Size": ["min", "max"], |
|
"Token Generation": "max", |
|
"Prompt Processing": "max", |
|
"Model ID": lambda x: ", ".join(sorted(set(x))), |
|
"quant_factor": lambda x: sorted(set(x)), |
|
} |
|
) |
|
.reset_index() |
|
) |
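
    # The mixed scalar/list aggregation produces a MultiIndex; assign flat names
    # in the declared aggregation order.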
|
|
|
|
|
device_summary.columns = [ |
|
"Device ID", |
|
"Platform", |
|
"Best Score", |
|
"Min Model Size", |
|
"Max Model Size", |
|
"Best TG Speed", |
|
"Best PP Speed", |
|
"Tested Models", |
|
"Tested Quantizations", |
|
] |
|
|
|
|
|
device_summary["Device"] = device_summary["Device ID"].apply(clean_device_id) |
|
|
|
|
|
rank_tab1, rank_tab2, rank_tab3 = st.tabs( |
|
["Overall Rankings", "Rankings by Model Size", "Rankings by Quantization"] |
|
) |
|
|
|
with rank_tab1: |
|
st.subheader("📱 Overall Device Rankings") |
|
|
|
|
|
overall_rankings = device_summary.sort_values("Best Score", ascending=False) |
|
|
|
|
|
display_df = overall_rankings.copy() |
|
display_df["Best Score"] = display_df["Best Score"].round(2) |
|
display_df["Best TG Speed"] = display_df["Best TG Speed"].round(2) |
|
display_df["Best PP Speed"] = display_df["Best PP Speed"].round(2) |
|
display_df["Model Size Range"] = display_df.apply( |
|
lambda x: f"{x['Min Model Size']:.1f}B - {x['Max Model Size']:.1f}B", axis=1 |
|
) |
|
|
|
|
|
display_cols = [ |
|
"Device", |
|
"Platform", |
|
"Best Score", |
|
"Best TG Speed", |
|
"Best PP Speed", |
|
"Model Size Range", |
|
] |
|
|
|
st.dataframe( |
|
display_df[display_cols], |
|
use_container_width=True, |
|
height=400, |
|
column_config={ |
|
"Device": st.column_config.TextColumn( |
|
"Device", |
|
help="Device brand and model", |
|
), |
|
"Best Score": st.column_config.NumberColumn( |
|
"Score", help="Overall performance score (0-100)", format="%.2f" |
|
), |
|
"Best TG Speed": st.column_config.NumberColumn( |
|
"Best TG Speed (t/s)", |
|
help="Best token generation speed", |
|
format="%.2f", |
|
), |
|
"Best PP Speed": st.column_config.NumberColumn( |
|
"Best PP Speed (t/s)", |
|
help="Best prompt processing speed", |
|
format="%.2f", |
|
), |
|
}, |
|
) |
|
|
|
with rank_tab2: |
|
st.subheader("📊 Rankings by Model Size") |
|
|
|
|
|
        def get_size_category(size):
            """Bucket a model size (in billions of parameters) for ranking."""
            if size < 1:
                return "Tiny (<1B)"
            elif size < 3:
                return "Small (1-3B)"
            elif size < 7:
                return "Medium (3-7B)"
            elif size < 13:
                return "Large (7-13B)"
            else:
                return "Extra Large (>13B)"
|
|
|
|
|
size_rankings = df.copy() |
|
size_rankings["Size Category"] = size_rankings["Model Size"].apply( |
|
get_size_category |
|
) |
|
|
|
size_summary = ( |
|
size_rankings.groupby(["Normalized Device ID", "Platform", "Size Category"]) |
|
.agg( |
|
{ |
|
"performance_score": ["max", "mean"], |
|
"Token Generation": "max", |
|
"Prompt Processing": "max", |
|
"Model ID": lambda x: ", ".join(sorted(set(x))), |
|
} |
|
) |
|
.reset_index() |
|
) |
|
|
|
|
|
size_summary.columns = [ |
|
"Device ID", |
|
"Platform", |
|
"Size Category", |
|
"Best Score", |
|
"Avg Score", |
|
"Best TG Speed", |
|
"Best PP Speed", |
|
"Models", |
|
] |
|
|
|
|
|
size_summary["Device"] = size_summary["Device ID"].apply(clean_device_id) |
|
|
|
|
|
for size_cat in sorted(size_summary["Size Category"].unique()): |
|
st.markdown(f"##### {size_cat}") |
|
cat_data = size_summary[size_summary["Size Category"] == size_cat].copy() |
|
cat_data = cat_data.sort_values("Best Score", ascending=False) |
|
|
|
|
|
cat_data["Best Score"] = cat_data["Best Score"].round(2) |
|
cat_data["Avg Score"] = cat_data["Avg Score"].round(2) |
|
cat_data["Best TG Speed"] = cat_data["Best TG Speed"].round(2) |
|
cat_data["Best PP Speed"] = cat_data["Best PP Speed"].round(2) |
|
|
|
display_cols = [ |
|
"Device", |
|
"Platform", |
|
"Best Score", |
|
"Avg Score", |
|
"Best TG Speed", |
|
"Best PP Speed", |
|
] |
|
|
|
st.dataframe( |
|
cat_data[display_cols], |
|
use_container_width=True, |
|
column_config={ |
|
"Device": st.column_config.TextColumn( |
|
"Device", |
|
help="Device brand and model", |
|
), |
|
"Best Score": st.column_config.NumberColumn( |
|
"Best Score", |
|
help="Best performance score achieved", |
|
format="%.2f", |
|
), |
|
"Avg Score": st.column_config.NumberColumn( |
|
"Avg Score", help="Average performance score", format="%.2f" |
|
), |
|
"Best TG Speed": st.column_config.NumberColumn( |
|
"Best TG (t/s)", |
|
help="Best token generation speed", |
|
format="%.2f", |
|
), |
|
"Best PP Speed": st.column_config.NumberColumn( |
|
"Best PP (t/s)", |
|
help="Best prompt processing speed", |
|
format="%.2f", |
|
), |
|
}, |
|
) |
|
|
|
with rank_tab3: |
|
st.subheader("🔍 Rankings by Quantization") |
|
|
|
|
|
quant_rankings = df.copy() |
|
quant_summary = ( |
|
quant_rankings.groupby(["Normalized Device ID", "Platform", "quant_factor"]) |
|
.agg( |
|
{ |
|
"performance_score": ["max", "mean"], |
|
"Token Generation": "max", |
|
"Prompt Processing": "max", |
|
"Model ID": lambda x: ", ".join(sorted(set(x))), |
|
} |
|
) |
|
.reset_index() |
|
) |
|
|
|
|
|
quant_summary.columns = [ |
|
"Device ID", |
|
"Platform", |
|
"Quant Factor", |
|
"Best Score", |
|
"Avg Score", |
|
"Best TG Speed", |
|
"Best PP Speed", |
|
"Models", |
|
] |
|
|
|
|
|
quant_summary["Device"] = quant_summary["Device ID"].apply(clean_device_id) |
|
|
|
|
|
for quant_level in sorted(quant_summary["Quant Factor"].unique(), reverse=True): |
|
st.markdown(f"##### Quantization Level: {quant_level:.2f}") |
|
quant_data = quant_summary[ |
|
quant_summary["Quant Factor"] == quant_level |
|
].copy() |
|
quant_data = quant_data.sort_values("Best Score", ascending=False) |
|
|
|
|
|
quant_data["Best Score"] = quant_data["Best Score"].round(2) |
|
quant_data["Avg Score"] = quant_data["Avg Score"].round(2) |
|
quant_data["Best TG Speed"] = quant_data["Best TG Speed"].round(2) |
|
quant_data["Best PP Speed"] = quant_data["Best PP Speed"].round(2) |
|
|
|
display_cols = [ |
|
"Device", |
|
"Platform", |
|
"Best Score", |
|
"Avg Score", |
|
"Best TG Speed", |
|
"Best PP Speed", |
|
] |
|
|
|
st.dataframe( |
|
quant_data[display_cols], |
|
use_container_width=True, |
|
column_config={ |
|
"Device": st.column_config.TextColumn( |
|
"Device", |
|
help="Device brand and model", |
|
), |
|
"Best Score": st.column_config.NumberColumn( |
|
"Best Score", |
|
help="Best performance score achieved", |
|
format="%.2f", |
|
), |
|
"Avg Score": st.column_config.NumberColumn( |
|
"Avg Score", help="Average performance score", format="%.2f" |
|
), |
|
"Best TG Speed": st.column_config.NumberColumn( |
|
"Best TG (t/s)", |
|
help="Best token generation speed", |
|
format="%.2f", |
|
), |
|
"Best PP Speed": st.column_config.NumberColumn( |
|
"Best PP (t/s)", |
|
help="Best prompt processing speed", |
|
format="%.2f", |
|
), |
|
}, |
|
) |
|
|