"""
Core module for data visualization components.
"""
import streamlit as st
import plotly.express as px
import pandas as pd
from typing import Dict, List, Optional
def create_performance_plot(
    df: pd.DataFrame, metric: str, title: str, hover_data: Optional[List[str]] = None
):
"""Create a performance comparison plot"""
if df.empty:
return None
    if hover_data is None:
        # Default hover columns, kept only if present so px.bar does not
        # raise on a missing column
        hover_data = [
            col
            for col in (
                "CPU Cores",
                "Peak Memory (GB)",
                "performance_score",
                "quant_factor",
            )
            if col in df.columns
        ]
fig = px.bar(
df,
x="Device",
y=metric,
color="Platform",
title=title,
template="plotly_white",
barmode="group",
hover_data=hover_data,
)
fig.update_layout(
xaxis_title="Device",
yaxis_title="Token/sec" if "Token" in metric else metric,
legend_title="Platform",
plot_bgcolor="white",
height=400,
)
return fig
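# Standalone usage sketch (illustrative; the dataframe below is made up, and an
# empty hover_data sidesteps the default hover columns):
#
#   demo = pd.DataFrame(
#       {
#           "Device": ["Pixel 8", "iPhone 15 Pro"],
#           "Platform": ["Android", "iOS"],
#           "TG Avg (t/s)": [12.3, 18.7],
#       }
#   )
#   fig = create_performance_plot(demo, "TG Avg (t/s)", "Demo", hover_data=[])
#   if fig is not None:
#       fig.show()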
def filter_dataframe(df: pd.DataFrame, filters: Dict) -> pd.DataFrame:
"""Apply all filters to the dataframe"""
if df.empty:
return df
filtered_df = df.copy()
# Basic filters
if filters["model"] != "All":
filtered_df = filtered_df[filtered_df["Model ID"] == filters["model"]]
if filters["platform"] != "All":
filtered_df = filtered_df[filtered_df["Platform"] == filters["platform"]]
if filters["device"] != "All":
filtered_df = filtered_df[filtered_df["Device"] == filters["device"]]
# Flash Attention filter
if filters["flash_attn"] != "All":
filtered_df = filtered_df[filtered_df["flash_attn"] == filters["flash_attn"]]
# Cache Type filters
if filters["cache_type_k"] != "All":
filtered_df = filtered_df[
filtered_df["cache_type_k"] == filters["cache_type_k"]
]
if filters["cache_type_v"] != "All":
filtered_df = filtered_df[
filtered_df["cache_type_v"] == filters["cache_type_v"]
]
# Range filters
pp_min, pp_max = filters["pp_range"]
if pp_min is not None and pp_max is not None:
pp_values = filtered_df["PP Config"]
filtered_df = filtered_df[(pp_values >= pp_min) & (pp_values <= pp_max)]
tg_min, tg_max = filters["tg_range"]
if tg_min is not None and tg_max is not None:
tg_values = filtered_df["TG Config"]
filtered_df = filtered_df[(tg_values >= tg_min) & (tg_values <= tg_max)]
n_threads_min, n_threads_max = filters["n_threads"]
if n_threads_min is not None and n_threads_max is not None:
n_threads = filtered_df["n_threads"]
filtered_df = filtered_df[
(n_threads >= n_threads_min) & (n_threads <= n_threads_max)
]
n_gpu_layers_min, n_gpu_layers_max = filters["n_gpu_layers"]
if n_gpu_layers_min is not None and n_gpu_layers_max is not None:
n_gpu_layers = filtered_df["n_gpu_layers"]
filtered_df = filtered_df[
(n_gpu_layers >= n_gpu_layers_min) & (n_gpu_layers <= n_gpu_layers_max)
]
    # Version filter ("Version" is an optional key)
    version = filters.get("Version")
    if version and version != "All":
        filtered_df = filtered_df[filtered_df["Version"] == version]
return filtered_df
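# Expected shape of the `filters` dict, inferred from the lookups above and in
# render_leaderboard_table (the concrete values are illustrative):
#
#   filters = {
#       "model": "All",                # or a specific Model ID
#       "platform": "All",             # or e.g. "iOS" / "Android"
#       "device": "All",
#       "flash_attn": "All",
#       "cache_type_k": "All",
#       "cache_type_v": "All",
#       "pp_range": (0, 4096),         # bounds on "PP Config"
#       "tg_range": (0, 4096),         # bounds on "TG Config"
#       "n_threads": (1, 16),
#       "n_gpu_layers": (0, 99),
#       "Version": "All",              # optional key
#       "grouping": ["Model ID", "Device", "Platform"],
#       "visible_columns": [],         # user-facing labels, mapped below
#   }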
def render_performance_plots(df: pd.DataFrame, filters: Dict):
"""Render performance comparison plots"""
if df.empty:
st.warning("No data available for plotting.")
return
# Apply filters
filtered_df = filter_dataframe(df, filters)
if filtered_df.empty:
st.warning("No data matches the selected filters for plotting.")
return
# Build aggregation dictionary
agg_dict = {
"Prompt Processing": "mean",
"Token Generation": "mean",
"performance_score": "mean",
"quant_factor": "first",
}
# Include memory metrics if available
if "Memory Usage (%)" in filtered_df.columns:
agg_dict["Memory Usage (%)"] = "mean"
if "Peak Memory (GB)" in filtered_df.columns:
agg_dict["Peak Memory (GB)"] = "mean"
# Include device info if available
if "CPU Cores" in filtered_df.columns:
agg_dict["CPU Cores"] = "first"
# Include config values
agg_dict.update(
{
"PP Config": "first",
"TG Config": "first",
}
)
    # Group by device and platform for plotting
    plot_group = (
        filtered_df.groupby(["Device", "Platform"]).agg(agg_dict).reset_index()
    )
    # Single-function aggregations keep their flat column names, so only the
    # two throughput columns need display renames
    plot_group = plot_group.rename(
        columns={
            "Prompt Processing": "PP Avg (t/s)",
            "Token Generation": "TG Avg (t/s)",
        }
    )
    # Define hover data, keeping only columns that survived aggregation
    hover_data = [
        col
        for col in ("CPU Cores", "Peak Memory (GB)", "performance_score", "quant_factor")
        if col in plot_group.columns
    ]
# Create plots in tabs
tab1, tab2, tab3 = st.tabs(
["Token Generation", "Prompt Processing", "Overall Score"]
)
with tab1:
fig1 = create_performance_plot(
plot_group,
"TG Avg (t/s)",
f"Token Generation (TG: {plot_group['TG Config'].iloc[0]})",
hover_data=hover_data,
)
if fig1:
st.plotly_chart(fig1, use_container_width=True)
with tab2:
fig2 = create_performance_plot(
plot_group,
"PP Avg (t/s)",
f"Prompt Processing (PP: {plot_group['PP Config'].iloc[0]})",
hover_data=hover_data,
)
if fig2:
st.plotly_chart(fig2, use_container_width=True)
with tab3:
fig3 = create_performance_plot(
plot_group,
"performance_score",
"Overall Performance Score (Normalized)",
hover_data=hover_data,
)
if fig3:
st.plotly_chart(fig3, use_container_width=True)
def render_leaderboard_table(df: pd.DataFrame, filters: Dict):
"""Render the leaderboard table with grouped and formatted data"""
if df.empty:
st.warning("No data available for the selected filters.")
return
# Apply filters
filtered_df = filter_dataframe(df, filters)
if filtered_df.empty:
st.warning("No data matches the selected filters.")
return
# Define the preferred column order (grouped logically)
column_order = [
# Performance Score
"performance_score",
"quant_factor",
# Device Info
"Device",
"Platform",
"CPU Cores",
"Total Memory (GB)",
"Peak Memory (GB)",
"Memory Usage (%)",
# Benchmark Results
"PP Config",
"PP Avg (t/s)",
"PP Std (t/s)",
"TG Config",
"TG Avg (t/s)",
"TG Std (t/s)",
# Model Config
"Model ID",
"Model Size",
"n_threads",
"flash_attn",
"cache_type_k",
"cache_type_v",
"n_context",
"n_batch",
"n_ubatch",
"Version",
]
# Group by selected columns
grouping_cols = filters["grouping"]
if not grouping_cols:
grouping_cols = ["Model ID", "Device", "Platform"] # Default grouping
# Create aggregations (excluding grouping columns)
agg_dict = {
col: agg
for col, agg in {
"Prompt Processing": ["mean", "std"],
"Token Generation": ["mean", "std"],
"Peak Memory (GB)": "mean",
"Total Memory (GB)": "first",
"CPU Cores": "first",
"Model Size": "first",
"Version": lambda x: ", ".join(sorted(set(x))),
"n_gpu_layers": lambda x: ", ".join(sorted(set(str(x)))),
"performance_score": "mean",
"quant_factor": "first",
}.items()
if col not in grouping_cols
}
# Group and aggregate
grouped_df = filtered_df.groupby(grouping_cols).agg(agg_dict).reset_index()
# Flatten column names
grouped_df.columns = [
col[0] if col[1] == "" else f"{col[0]} ({col[1]})" for col in grouped_df.columns
]
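    # e.g. ("Prompt Processing", "mean") -> "Prompt Processing (mean)", while a
    # grouping key such as ("Device", "") stays "Device"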
# Rename columns for display
column_mapping = {
"Prompt Processing (mean)": "PP Avg (t/s)",
"Prompt Processing (std)": "PP Std (t/s)",
"Token Generation (mean)": "TG Avg (t/s)",
"Token Generation (std)": "TG Std (t/s)",
"Memory Usage (%) (mean)": "Memory Usage (%)",
"Peak Memory (GB) (mean)": "Peak Memory (GB)",
"PP Config (first)": "PP Config",
"TG Config (first)": "TG Config",
"Model Size (first)": "Model Size",
"CPU Cores (first)": "CPU Cores",
"Total Memory (GB) (first)": "Total Memory (GB)",
"n_threads (first)": "n_threads",
"flash_attn (first)": "flash_attn",
"cache_type_k (first)": "cache_type_k",
"cache_type_v (first)": "cache_type_v",
"n_context (first)": "n_context",
"n_batch (first)": "n_batch",
"n_ubatch (first)": "n_ubatch",
"Version (<lambda>)": "Version",
"performance_score (mean)": "Performance Score",
"quant_factor (first)": "Quant Factor",
}
grouped_df = grouped_df.rename(columns=column_mapping)
# Sort by performance score
grouped_df = grouped_df.sort_values("Performance Score", ascending=False)
# Filter visible columns
visible_cols = filters["visible_columns"]
if visible_cols:
        # Map user-facing labels to the post-aggregation column names; labels
        # that already match their column name fall through via .get(col, col)
        column_name_mapping = {
            "Prompt Processing (mean)": "PP Avg (t/s)",
            "Token Generation (mean)": "TG Avg (t/s)",
            "Prompt Processing (std)": "PP Std (t/s)",
            "Token Generation (std)": "TG Std (t/s)",
            "Model": "Model ID",
        }
# Convert visible columns and grouping columns to their mapped names
mapped_visible = {column_name_mapping.get(col, col) for col in visible_cols}
mapped_grouping = {
column_name_mapping.get(col, col) for col in filters["grouping"]
}
# Always include performance score and quant factor
mapped_visible.add("Performance Score")
mapped_visible.add("Quant Factor")
# Combine both sets to get unique columns
all_cols = mapped_visible | mapped_grouping
# Create final display columns list
display_cols = []
# Get all available columns we want to display
available_cols = set(all_cols)
        # Add columns in the predefined order
        for col in column_order:
            if col in available_cols:
                display_cols.append(col)
        # Add any remaining columns that weren't in our predefined order
        display_cols.extend(sorted(available_cols - set(display_cols)))
        # Drop selections that are not present after grouping and renaming,
        # so the indexing below cannot raise a KeyError
        display_cols = [col for col in display_cols if col in grouped_df.columns]
    else:
        # Default columns if none selected, restricted to those that exist
        # after grouping (raw names like "performance_score" in column_order
        # have been renamed by this point)
        default_cols = ["Performance Score", "Quant Factor"] + column_order[2:8]
        display_cols = [col for col in default_cols if col in grouped_df.columns]
# Display the filtered and grouped table
st.markdown("#### 📊 Benchmark Results")
st.dataframe(
grouped_df[display_cols],
use_container_width=True,
height=400,
)
def render_device_rankings(df: pd.DataFrame):
"""Render device rankings with detailed performance metrics."""
if df.empty:
st.warning("No data available for device rankings.")
return
def clean_device_id(device_id: str) -> str:
"""Extract clean device name from normalized ID by removing platform prefix"""
if device_id.startswith("iOS/"):
return device_id[4:] # Remove "iOS/"
return device_id
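    # e.g. clean_device_id("iOS/iPhone 15 Pro") -> "iPhone 15 Pro"; Android IDs
    # are assumed to carry no platform prefix, so they pass through unchanged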
# Create device summary
device_summary = (
df.groupby(["Normalized Device ID", "Platform"])
.agg(
{
"performance_score": "max", # Best score achieved
"Model Size": ["min", "max"], # Size range
"Token Generation": "max", # Best token generation speed
"Prompt Processing": "max", # Best prompt processing speed
"Model ID": lambda x: ", ".join(sorted(set(x))), # All models tested
"quant_factor": lambda x: sorted(set(x)), # Quantization levels tested
}
)
.reset_index()
)
# Flatten column names
device_summary.columns = [
"Device ID", # Normalized Device ID for grouping
"Platform",
"Best Score",
"Min Model Size",
"Max Model Size",
"Best TG Speed",
"Best PP Speed",
"Tested Models",
"Tested Quantizations",
]
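    # The positional renaming above assumes the agg spec's insertion order is
    # preserved in the output columns (true for dict-based .agg in modern
    # pandas, since Python dicts are ordered)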
# Add clean device name
device_summary["Device"] = device_summary["Device ID"].apply(clean_device_id)
# Create three tabs for different ranking views
rank_tab1, rank_tab2, rank_tab3 = st.tabs(
["Overall Rankings", "Rankings by Model Size", "Rankings by Quantization"]
)
with rank_tab1:
st.subheader("📱 Overall Device Rankings")
# Sort by best score
overall_rankings = device_summary.sort_values("Best Score", ascending=False)
# Format the display columns
display_df = overall_rankings.copy()
display_df["Best Score"] = display_df["Best Score"].round(2)
display_df["Best TG Speed"] = display_df["Best TG Speed"].round(2)
display_df["Best PP Speed"] = display_df["Best PP Speed"].round(2)
display_df["Model Size Range"] = display_df.apply(
lambda x: f"{x['Min Model Size']:.1f}B - {x['Max Model Size']:.1f}B", axis=1
)
# Select and reorder columns for display
display_cols = [
"Device",
"Platform",
"Best Score",
"Best TG Speed",
"Best PP Speed",
"Model Size Range",
]
st.dataframe(
display_df[display_cols],
use_container_width=True,
height=400,
column_config={
"Device": st.column_config.TextColumn(
"Device",
help="Device brand and model",
),
"Best Score": st.column_config.NumberColumn(
"Score", help="Overall performance score (0-100)", format="%.2f"
),
"Best TG Speed": st.column_config.NumberColumn(
"Best TG Speed (t/s)",
help="Best token generation speed",
format="%.2f",
),
"Best PP Speed": st.column_config.NumberColumn(
"Best PP Speed (t/s)",
help="Best prompt processing speed",
format="%.2f",
),
},
)
with rank_tab2:
st.subheader("📊 Rankings by Model Size")
# Define model size categories
def get_size_category(size):
if size < 1:
return "Tiny (<1B)"
elif size < 3:
return "Small (<3B)"
elif size < 7:
return "Medium (3-7B)"
elif size < 13:
return "Large (7-13B)"
else:
return "Extra Large (>13B)"
# Create size-based rankings
size_rankings = df.copy()
size_rankings["Size Category"] = size_rankings["Model Size"].apply(
get_size_category
)
size_summary = (
size_rankings.groupby(["Normalized Device ID", "Platform", "Size Category"])
.agg(
{
"performance_score": ["max", "mean"],
"Token Generation": "max",
"Prompt Processing": "max",
"Model ID": lambda x: ", ".join(sorted(set(x))),
}
)
.reset_index()
)
# Flatten and rename columns
size_summary.columns = [
"Device ID",
"Platform",
"Size Category",
"Best Score",
"Avg Score",
"Best TG Speed",
"Best PP Speed",
"Models",
]
# Add clean device name
size_summary["Device"] = size_summary["Device ID"].apply(clean_device_id)
        # Format and display each category, smallest first (a plain
        # alphabetical sort would put "Extra Large" before "Large")
        size_order = [
            "Tiny (<1B)",
            "Small (1-3B)",
            "Medium (3-7B)",
            "Large (7-13B)",
            "Extra Large (>13B)",
        ]
        present_categories = set(size_summary["Size Category"])
        for size_cat in [c for c in size_order if c in present_categories]:
st.markdown(f"##### {size_cat}")
cat_data = size_summary[size_summary["Size Category"] == size_cat].copy()
cat_data = cat_data.sort_values("Best Score", ascending=False)
# Format scores
cat_data["Best Score"] = cat_data["Best Score"].round(2)
cat_data["Avg Score"] = cat_data["Avg Score"].round(2)
cat_data["Best TG Speed"] = cat_data["Best TG Speed"].round(2)
cat_data["Best PP Speed"] = cat_data["Best PP Speed"].round(2)
display_cols = [
"Device",
"Platform",
"Best Score",
"Avg Score",
"Best TG Speed",
"Best PP Speed",
]
st.dataframe(
cat_data[display_cols],
use_container_width=True,
column_config={
"Device": st.column_config.TextColumn(
"Device",
help="Device brand and model",
),
"Best Score": st.column_config.NumberColumn(
"Best Score",
help="Best performance score achieved",
format="%.2f",
),
"Avg Score": st.column_config.NumberColumn(
"Avg Score", help="Average performance score", format="%.2f"
),
"Best TG Speed": st.column_config.NumberColumn(
"Best TG (t/s)",
help="Best token generation speed",
format="%.2f",
),
"Best PP Speed": st.column_config.NumberColumn(
"Best PP (t/s)",
help="Best prompt processing speed",
format="%.2f",
),
},
)
with rank_tab3:
st.subheader("🔍 Rankings by Quantization")
# Group by device and quantization level
quant_rankings = df.copy()
quant_summary = (
quant_rankings.groupby(["Normalized Device ID", "Platform", "quant_factor"])
.agg(
{
"performance_score": ["max", "mean"],
"Token Generation": "max",
"Prompt Processing": "max",
"Model ID": lambda x: ", ".join(sorted(set(x))),
}
)
.reset_index()
)
# Flatten and rename columns
quant_summary.columns = [
"Device ID",
"Platform",
"Quant Factor",
"Best Score",
"Avg Score",
"Best TG Speed",
"Best PP Speed",
"Models",
]
# Add clean device name
quant_summary["Device"] = quant_summary["Device ID"].apply(clean_device_id)
# Format and display for each quantization tier
for quant_level in sorted(quant_summary["Quant Factor"].unique(), reverse=True):
st.markdown(f"##### Quantization Level: {quant_level:.2f}")
quant_data = quant_summary[
quant_summary["Quant Factor"] == quant_level
].copy()
quant_data = quant_data.sort_values("Best Score", ascending=False)
# Format scores
quant_data["Best Score"] = quant_data["Best Score"].round(2)
quant_data["Avg Score"] = quant_data["Avg Score"].round(2)
quant_data["Best TG Speed"] = quant_data["Best TG Speed"].round(2)
quant_data["Best PP Speed"] = quant_data["Best PP Speed"].round(2)
display_cols = [
"Device",
"Platform",
"Best Score",
"Avg Score",
"Best TG Speed",
"Best PP Speed",
]
st.dataframe(
quant_data[display_cols],
use_container_width=True,
column_config={
"Device": st.column_config.TextColumn(
"Device",
help="Device brand and model",
),
"Best Score": st.column_config.NumberColumn(
"Best Score",
help="Best performance score achieved",
format="%.2f",
),
"Avg Score": st.column_config.NumberColumn(
"Avg Score", help="Average performance score", format="%.2f"
),
"Best TG Speed": st.column_config.NumberColumn(
"Best TG (t/s)",
help="Best token generation speed",
format="%.2f",
),
"Best PP Speed": st.column_config.NumberColumn(
"Best PP (t/s)",
help="Best prompt processing speed",
format="%.2f",
),
},
)
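# Usage sketch for the whole module (illustrative; the loader and filter-widget
# helpers named below are hypothetical and live outside this file):
#
#   df = load_benchmark_df()               # hypothetical data loader
#   filters = build_sidebar_filters(df)    # hypothetical st.sidebar widgets
#   render_leaderboard_table(df, filters)
#   render_performance_plots(df, filters)
#   render_device_rankings(df)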