""" | |
Core module for data visualization components. | |
""" | |
import streamlit as st | |
import plotly.express as px | |
import pandas as pd | |
from typing import Optional, Dict, List, Set | |
def create_performance_plot( | |
df: pd.DataFrame, metric: str, title: str, hover_data: List[str] = None | |
): | |
"""Create a performance comparison plot""" | |
if df.empty: | |
return None | |
if hover_data is None: | |
hover_data = [ | |
"CPU Cores", | |
"Peak Memory (GB)", | |
"performance_score", | |
"quant_factor", | |
] | |
    fig = px.bar(
        df,
        x="Device",
        y=metric,
        color="Platform",
        title=title,
        template="plotly_white",
        barmode="group",
        hover_data=hover_data,
    )
    fig.update_layout(
        xaxis_title="Device",
        yaxis_title="Token/sec" if "Token" in metric else metric,
        legend_title="Platform",
        plot_bgcolor="white",
        height=400,
    )
    return fig
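
# A minimal usage sketch for create_performance_plot (toy data, illustrative only;
# hover columns are skipped by passing an empty list so the toy frame stays small):
#
#   demo = pd.DataFrame({
#       "Device": ["Pixel 8", "iPhone 15"],
#       "Platform": ["Android", "iOS"],
#       "Token Generation": [12.3, 18.7],
#   })
#   fig = create_performance_plot(demo, "Token Generation", "TG demo", hover_data=[])
#   if fig is not None:
#       st.plotly_chart(fig, use_container_width=True)
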
def filter_dataframe(df: pd.DataFrame, filters: Dict) -> pd.DataFrame:
    """Apply all filters to the dataframe."""
    if df.empty:
        return df

    filtered_df = df.copy()

    # Basic filters
    if filters["model"] != "All":
        filtered_df = filtered_df[filtered_df["Model ID"] == filters["model"]]
    if filters["platform"] != "All":
        filtered_df = filtered_df[filtered_df["Platform"] == filters["platform"]]
    if filters["device"] != "All":
        filtered_df = filtered_df[filtered_df["Device"] == filters["device"]]

    # Flash Attention filter
    if filters["flash_attn"] != "All":
        filtered_df = filtered_df[filtered_df["flash_attn"] == filters["flash_attn"]]

    # Cache Type filters
    if filters["cache_type_k"] != "All":
        filtered_df = filtered_df[
            filtered_df["cache_type_k"] == filters["cache_type_k"]
        ]
    if filters["cache_type_v"] != "All":
        filtered_df = filtered_df[
            filtered_df["cache_type_v"] == filters["cache_type_v"]
        ]

    # Range filters
    pp_min, pp_max = filters["pp_range"]
    if pp_min is not None and pp_max is not None:
        pp_values = filtered_df["PP Config"]
        filtered_df = filtered_df[(pp_values >= pp_min) & (pp_values <= pp_max)]

    tg_min, tg_max = filters["tg_range"]
    if tg_min is not None and tg_max is not None:
        tg_values = filtered_df["TG Config"]
        filtered_df = filtered_df[(tg_values >= tg_min) & (tg_values <= tg_max)]

    n_threads_min, n_threads_max = filters["n_threads"]
    if n_threads_min is not None and n_threads_max is not None:
        n_threads = filtered_df["n_threads"]
        filtered_df = filtered_df[
            (n_threads >= n_threads_min) & (n_threads <= n_threads_max)
        ]

    n_gpu_layers_min, n_gpu_layers_max = filters["n_gpu_layers"]
    if n_gpu_layers_min is not None and n_gpu_layers_max is not None:
        n_gpu_layers = filtered_df["n_gpu_layers"]
        filtered_df = filtered_df[
            (n_gpu_layers >= n_gpu_layers_min) & (n_gpu_layers <= n_gpu_layers_max)
        ]

    # Version filter
    if filters.get("Version") != "All" and filters.get("Version"):
        filtered_df = filtered_df[filtered_df["Version"] == filters["Version"]]

    return filtered_df
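
# Expected shape of the ``filters`` dict consumed above (inferred from the keys this
# function reads; the concrete values come from the app's sidebar controls, which are
# outside this module):
#
#   filters = {
#       "model": "All",              # or a specific Model ID
#       "platform": "All",           # or e.g. "iOS" / "Android"
#       "device": "All",
#       "flash_attn": "All",
#       "cache_type_k": "All",
#       "cache_type_v": "All",
#       "pp_range": (None, None),    # (min, max) over the "PP Config" column
#       "tg_range": (None, None),    # (min, max) over the "TG Config" column
#       "n_threads": (None, None),
#       "n_gpu_layers": (None, None),
#       "Version": "All",            # optional key
#   }
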
def render_performance_plots(df: pd.DataFrame, filters: Dict):
    """Render performance comparison plots."""
    if df.empty:
        st.warning("No data available for plotting.")
        return

    # Apply filters
    filtered_df = filter_dataframe(df, filters)
    if filtered_df.empty:
        st.warning("No data matches the selected filters for plotting.")
        return

    # Build aggregation dictionary
    agg_dict = {
        "Prompt Processing": "mean",
        "Token Generation": "mean",
        "performance_score": "mean",
        "quant_factor": "first",
    }

    # Include memory metrics if available
    if "Memory Usage (%)" in filtered_df.columns:
        agg_dict["Memory Usage (%)"] = "mean"
    if "Peak Memory (GB)" in filtered_df.columns:
        agg_dict["Peak Memory (GB)"] = "mean"

    # Include device info if available
    if "CPU Cores" in filtered_df.columns:
        agg_dict["CPU Cores"] = "first"

    # Include config values
    agg_dict.update(
        {
            "PP Config": "first",
            "TG Config": "first",
        }
    )

    # Group by device and platform for plotting
    plot_group = filtered_df.groupby(["Device", "Platform"]).agg(agg_dict).reset_index()

    # Rename columns for display (single-function aggregations keep flat column
    # names, so only the unsuffixed keys below actually match; the suffixed keys
    # are no-ops here)
    column_mapping = {
        "Prompt Processing": "PP Avg (t/s)",
        "Token Generation": "TG Avg (t/s)",
        "Memory Usage (%) (mean)": "Memory Usage (%)",
        "Peak Memory (GB) (mean)": "Peak Memory (GB)",
        "PP Config (first)": "PP Config",
        "TG Config (first)": "TG Config",
        "Model Size (first)": "Model Size",
        "CPU Cores (first)": "CPU Cores",
        "Total Memory (GB) (first)": "Total Memory (GB)",
        "n_threads (first)": "n_threads",
        "flash_attn (first)": "flash_attn",
        "cache_type_k (first)": "cache_type_k",
        "cache_type_v (first)": "cache_type_v",
        "n_context (first)": "n_context",
        "n_batch (first)": "n_batch",
        "n_ubatch (first)": "n_ubatch",
        "performance_score (mean)": "Performance Score",
        "quant_factor (first)": "Quant Factor",
    }
    plot_group = plot_group.rename(columns=column_mapping)

    # Define hover data, keeping only columns that survived the aggregation
    hover_data = [
        "CPU Cores",
        "Peak Memory (GB)",
        "performance_score",
        "quant_factor",
    ]
    hover_data = [col for col in hover_data if col in plot_group.columns]

    # Create plots in tabs
    tab1, tab2, tab3 = st.tabs(
        ["Token Generation", "Prompt Processing", "Overall Score"]
    )

    with tab1:
        fig1 = create_performance_plot(
            plot_group,
            "TG Avg (t/s)",
            f"Token Generation (TG: {plot_group['TG Config'].iloc[0]})",
            hover_data=hover_data,
        )
        if fig1:
            st.plotly_chart(fig1, use_container_width=True)

    with tab2:
        fig2 = create_performance_plot(
            plot_group,
            "PP Avg (t/s)",
            f"Prompt Processing (PP: {plot_group['PP Config'].iloc[0]})",
            hover_data=hover_data,
        )
        if fig2:
            st.plotly_chart(fig2, use_container_width=True)

    with tab3:
        fig3 = create_performance_plot(
            plot_group,
            "performance_score",
            "Overall Performance Score (Normalized)",
            hover_data=hover_data,
        )
        if fig3:
            st.plotly_chart(fig3, use_container_width=True)
def render_leaderboard_table(df: pd.DataFrame, filters: Dict):
    """Render the leaderboard table with grouped and formatted data."""
    if df.empty:
        st.warning("No data available for the selected filters.")
        return

    # Apply filters
    filtered_df = filter_dataframe(df, filters)
    if filtered_df.empty:
        st.warning("No data matches the selected filters.")
        return

    # Define the preferred column order (grouped logically)
    column_order = [
        # Performance Score
        "performance_score",
        "quant_factor",
        # Device Info
        "Device",
        "Platform",
        "CPU Cores",
        "Total Memory (GB)",
        "Peak Memory (GB)",
        "Memory Usage (%)",
        # Benchmark Results
        "PP Config",
        "PP Avg (t/s)",
        "PP Std (t/s)",
        "TG Config",
        "TG Avg (t/s)",
        "TG Std (t/s)",
        # Model Config
        "Model ID",
        "Model Size",
        "n_threads",
        "flash_attn",
        "cache_type_k",
        "cache_type_v",
        "n_context",
        "n_batch",
        "n_ubatch",
        "Version",
    ]

    # Group by selected columns
    grouping_cols = filters["grouping"]
    if not grouping_cols:
        grouping_cols = ["Model ID", "Device", "Platform"]  # Default grouping

    # Create aggregations (excluding grouping columns)
    agg_dict = {
        col: agg
        for col, agg in {
            "Prompt Processing": ["mean", "std"],
            "Token Generation": ["mean", "std"],
            "Peak Memory (GB)": "mean",
            "Total Memory (GB)": "first",
            "CPU Cores": "first",
            "Model Size": "first",
            "Version": lambda x: ", ".join(sorted(set(x))),
            "n_gpu_layers": lambda x: ", ".join(sorted({str(v) for v in x})),
            "performance_score": "mean",
            "quant_factor": "first",
        }.items()
        if col not in grouping_cols
    }

    # Group and aggregate
    grouped_df = filtered_df.groupby(grouping_cols).agg(agg_dict).reset_index()

    # Flatten column names
    grouped_df.columns = [
        col[0] if col[1] == "" else f"{col[0]} ({col[1]})" for col in grouped_df.columns
    ]

    # Rename columns for display
    column_mapping = {
        "Prompt Processing (mean)": "PP Avg (t/s)",
        "Prompt Processing (std)": "PP Std (t/s)",
        "Token Generation (mean)": "TG Avg (t/s)",
        "Token Generation (std)": "TG Std (t/s)",
        "Memory Usage (%) (mean)": "Memory Usage (%)",
        "Peak Memory (GB) (mean)": "Peak Memory (GB)",
        "PP Config (first)": "PP Config",
        "TG Config (first)": "TG Config",
        "Model Size (first)": "Model Size",
        "CPU Cores (first)": "CPU Cores",
        "Total Memory (GB) (first)": "Total Memory (GB)",
        "n_threads (first)": "n_threads",
        "flash_attn (first)": "flash_attn",
        "cache_type_k (first)": "cache_type_k",
        "cache_type_v (first)": "cache_type_v",
        "n_context (first)": "n_context",
        "n_batch (first)": "n_batch",
        "n_ubatch (first)": "n_ubatch",
        "Version (<lambda>)": "Version",
        "performance_score (mean)": "Performance Score",
        "quant_factor (first)": "Quant Factor",
    }
    grouped_df = grouped_df.rename(columns=column_mapping)

    # Sort by performance score
    grouped_df = grouped_df.sort_values("Performance Score", ascending=False)

    # Filter visible columns
    visible_cols = filters["visible_columns"]
    if visible_cols:
        # Map the user-friendly names to actual column names
        column_name_mapping = {
            "Device": "Device",
            "Platform": "Platform",
            "CPU Cores": "CPU Cores",
            "Total Memory (GB)": "Total Memory (GB)",
            "Peak Memory (GB)": "Peak Memory (GB)",
            "Memory Usage (%)": "Memory Usage (%)",
            "PP Config": "PP Config",
            "TG Config": "TG Config",
            "Prompt Processing (mean)": "PP Avg (t/s)",
            "Token Generation (mean)": "TG Avg (t/s)",
            "Prompt Processing (std)": "PP Std (t/s)",
            "Token Generation (std)": "TG Std (t/s)",
            "Model": "Model ID",
            "Model Size": "Model Size",
            "Model ID": "Model ID",
            "n_threads": "n_threads",
            "flash_attn": "flash_attn",
            "cache_type_k": "cache_type_k",
            "cache_type_v": "cache_type_v",
            "n_context": "n_context",
            "n_batch": "n_batch",
            "n_ubatch": "n_ubatch",
            "Version": "Version",
            "Performance Score": "Performance Score",
            "Quant Factor": "Quant Factor",
        }

        # Convert visible columns and grouping columns to their mapped names
        mapped_visible = {column_name_mapping.get(col, col) for col in visible_cols}
        mapped_grouping = {
            column_name_mapping.get(col, col) for col in filters["grouping"]
        }

        # Always include performance score and quant factor
        mapped_visible.add("Performance Score")
        mapped_visible.add("Quant Factor")

        # Combine both sets to get unique columns
        all_cols = mapped_visible | mapped_grouping

        # Create final display columns list
        display_cols = []

        # Get all available columns we want to display
        available_cols = set(all_cols)

        # Add columns in the predefined order
        for col in column_order:
            if col in available_cols:
                display_cols.append(col)

        # Add any remaining columns that weren't in our predefined order
        remaining_cols = sorted(list(available_cols - set(display_cols)))
        display_cols.extend(remaining_cols)
    else:
        # Default columns if none selected
        display_cols = ["Performance Score", "Quant Factor"] + column_order[:8]

    # Drop requested columns that don't exist after grouping/renaming to avoid
    # KeyErrors (e.g. metrics that were excluded by the aggregation)
    display_cols = [col for col in display_cols if col in grouped_df.columns]

    # Display the filtered and grouped table
    st.markdown("#### 📊 Benchmark Results")
    st.dataframe(
        grouped_df[display_cols],
        use_container_width=True,
        height=400,
    )
def render_device_rankings(df: pd.DataFrame):
    """Render device rankings with detailed performance metrics."""
    if df.empty:
        st.warning("No data available for device rankings.")
        return

    def clean_device_id(device_id: str) -> str:
        """Extract clean device name from normalized ID by removing platform prefix."""
        if device_id.startswith("iOS/"):
            return device_id[4:]  # Remove "iOS/"
        return device_id

    # Create device summary
    device_summary = (
        df.groupby(["Normalized Device ID", "Platform"])
        .agg(
            {
                "performance_score": "max",  # Best score achieved
                "Model Size": ["min", "max"],  # Size range
                "Token Generation": "max",  # Best token generation speed
                "Prompt Processing": "max",  # Best prompt processing speed
                "Model ID": lambda x: ", ".join(sorted(set(x))),  # All models tested
                "quant_factor": lambda x: sorted(set(x)),  # Quantization levels tested
            }
        )
        .reset_index()
    )

    # Flatten column names
    device_summary.columns = [
        "Device ID",  # Normalized Device ID for grouping
        "Platform",
        "Best Score",
        "Min Model Size",
        "Max Model Size",
        "Best TG Speed",
        "Best PP Speed",
        "Tested Models",
        "Tested Quantizations",
    ]

    # Add clean device name
    device_summary["Device"] = device_summary["Device ID"].apply(clean_device_id)

    # Create three tabs for different ranking views
    rank_tab1, rank_tab2, rank_tab3 = st.tabs(
        ["Overall Rankings", "Rankings by Model Size", "Rankings by Quantization"]
    )

    with rank_tab1:
        st.subheader("📱 Overall Device Rankings")

        # Sort by best score
        overall_rankings = device_summary.sort_values("Best Score", ascending=False)

        # Format the display columns
        display_df = overall_rankings.copy()
        display_df["Best Score"] = display_df["Best Score"].round(2)
        display_df["Best TG Speed"] = display_df["Best TG Speed"].round(2)
        display_df["Best PP Speed"] = display_df["Best PP Speed"].round(2)
        display_df["Model Size Range"] = display_df.apply(
            lambda x: f"{x['Min Model Size']:.1f}B - {x['Max Model Size']:.1f}B", axis=1
        )

        # Select and reorder columns for display
        display_cols = [
            "Device",
            "Platform",
            "Best Score",
            "Best TG Speed",
            "Best PP Speed",
            "Model Size Range",
        ]
        st.dataframe(
            display_df[display_cols],
            use_container_width=True,
            height=400,
            column_config={
                "Device": st.column_config.TextColumn(
                    "Device",
                    help="Device brand and model",
                ),
                "Best Score": st.column_config.NumberColumn(
                    "Score", help="Overall performance score (0-100)", format="%.2f"
                ),
                "Best TG Speed": st.column_config.NumberColumn(
                    "Best TG Speed (t/s)",
                    help="Best token generation speed",
                    format="%.2f",
                ),
                "Best PP Speed": st.column_config.NumberColumn(
                    "Best PP Speed (t/s)",
                    help="Best prompt processing speed",
                    format="%.2f",
                ),
            },
        )

    with rank_tab2:
        st.subheader("📊 Rankings by Model Size")

        # Define model size categories
        def get_size_category(size):
            if size < 1:
                return "Tiny (<1B)"
            elif size < 3:
                return "Small (1-3B)"
            elif size < 7:
                return "Medium (3-7B)"
            elif size < 13:
                return "Large (7-13B)"
            else:
                return "Extra Large (>13B)"
        # Create size-based rankings
        size_rankings = df.copy()
        size_rankings["Size Category"] = size_rankings["Model Size"].apply(
            get_size_category
        )

        size_summary = (
            size_rankings.groupby(["Normalized Device ID", "Platform", "Size Category"])
            .agg(
                {
                    "performance_score": ["max", "mean"],
                    "Token Generation": "max",
                    "Prompt Processing": "max",
                    "Model ID": lambda x: ", ".join(sorted(set(x))),
                }
            )
            .reset_index()
        )

        # Flatten and rename columns
        size_summary.columns = [
            "Device ID",
            "Platform",
            "Size Category",
            "Best Score",
            "Avg Score",
            "Best TG Speed",
            "Best PP Speed",
            "Models",
        ]

        # Add clean device name
        size_summary["Device"] = size_summary["Device ID"].apply(clean_device_id)

        # Format and display each category
        for size_cat in sorted(size_summary["Size Category"].unique()):
            st.markdown(f"##### {size_cat}")
            cat_data = size_summary[size_summary["Size Category"] == size_cat].copy()
            cat_data = cat_data.sort_values("Best Score", ascending=False)

            # Format scores
            cat_data["Best Score"] = cat_data["Best Score"].round(2)
            cat_data["Avg Score"] = cat_data["Avg Score"].round(2)
            cat_data["Best TG Speed"] = cat_data["Best TG Speed"].round(2)
            cat_data["Best PP Speed"] = cat_data["Best PP Speed"].round(2)

            display_cols = [
                "Device",
                "Platform",
                "Best Score",
                "Avg Score",
                "Best TG Speed",
                "Best PP Speed",
            ]
            st.dataframe(
                cat_data[display_cols],
                use_container_width=True,
                column_config={
                    "Device": st.column_config.TextColumn(
                        "Device",
                        help="Device brand and model",
                    ),
                    "Best Score": st.column_config.NumberColumn(
                        "Best Score",
                        help="Best performance score achieved",
                        format="%.2f",
                    ),
                    "Avg Score": st.column_config.NumberColumn(
                        "Avg Score", help="Average performance score", format="%.2f"
                    ),
                    "Best TG Speed": st.column_config.NumberColumn(
                        "Best TG (t/s)",
                        help="Best token generation speed",
                        format="%.2f",
                    ),
                    "Best PP Speed": st.column_config.NumberColumn(
                        "Best PP (t/s)",
                        help="Best prompt processing speed",
                        format="%.2f",
                    ),
                },
            )

    with rank_tab3:
        st.subheader("🔍 Rankings by Quantization")

        # Group by device and quantization level
        quant_rankings = df.copy()
        quant_summary = (
            quant_rankings.groupby(["Normalized Device ID", "Platform", "quant_factor"])
            .agg(
                {
                    "performance_score": ["max", "mean"],
                    "Token Generation": "max",
                    "Prompt Processing": "max",
                    "Model ID": lambda x: ", ".join(sorted(set(x))),
                }
            )
            .reset_index()
        )

        # Flatten and rename columns
        quant_summary.columns = [
            "Device ID",
            "Platform",
            "Quant Factor",
            "Best Score",
            "Avg Score",
            "Best TG Speed",
            "Best PP Speed",
            "Models",
        ]

        # Add clean device name
        quant_summary["Device"] = quant_summary["Device ID"].apply(clean_device_id)

        # Format and display for each quantization tier
        for quant_level in sorted(quant_summary["Quant Factor"].unique(), reverse=True):
            st.markdown(f"##### Quantization Level: {quant_level:.2f}")
            quant_data = quant_summary[
                quant_summary["Quant Factor"] == quant_level
            ].copy()
            quant_data = quant_data.sort_values("Best Score", ascending=False)

            # Format scores
            quant_data["Best Score"] = quant_data["Best Score"].round(2)
            quant_data["Avg Score"] = quant_data["Avg Score"].round(2)
            quant_data["Best TG Speed"] = quant_data["Best TG Speed"].round(2)
            quant_data["Best PP Speed"] = quant_data["Best PP Speed"].round(2)

            display_cols = [
                "Device",
                "Platform",
                "Best Score",
                "Avg Score",
                "Best TG Speed",
                "Best PP Speed",
            ]
            st.dataframe(
                quant_data[display_cols],
                use_container_width=True,
                column_config={
                    "Device": st.column_config.TextColumn(
                        "Device",
                        help="Device brand and model",
                    ),
                    "Best Score": st.column_config.NumberColumn(
                        "Best Score",
                        help="Best performance score achieved",
                        format="%.2f",
                    ),
                    "Avg Score": st.column_config.NumberColumn(
                        "Avg Score", help="Average performance score", format="%.2f"
                    ),
                    "Best TG Speed": st.column_config.NumberColumn(
                        "Best TG (t/s)",
                        help="Best token generation speed",
                        format="%.2f",
                    ),
                    "Best PP Speed": st.column_config.NumberColumn(
                        "Best PP (t/s)",
                        help="Best prompt processing speed",
                        format="%.2f",
                    ),
                },
            )
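
# A minimal page sketch showing how these renderers could be wired together in the
# surrounding Streamlit app (the data loader and build_filters helper below are
# assumptions for illustration, not part of this module; render_leaderboard_table
# additionally expects "grouping" and "visible_columns" keys in the filters dict):
#
#   df = load_benchmark_results()   # hypothetical loader returning the results frame
#   filters = build_filters(df)     # hypothetical sidebar helper returning the dict
#   render_leaderboard_table(df, filters)
#   render_performance_plots(df, filters)
#   render_device_rankings(df)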