""" Core module for data visualization components. """ import streamlit as st import plotly.express as px import pandas as pd from typing import Optional, Dict, List, Set import plotly.graph_objects as go def clean_device_id(device_id: str) -> str: """Extract clean device name from normalized ID by removing platform prefix""" if device_id.startswith("iOS/"): return device_id[4:] # Remove "iOS/" return device_id def create_performance_plot( df: pd.DataFrame, metric: str, title: str, hover_data: List[str] = None ): """Create a performance comparison plot""" if df.empty: return None if hover_data is None: hover_data = [ "CPU Cores", "Peak Memory (GB)", "performance_score", "quant_factor", ] fig = px.bar( df, x="Device", y=metric, color="Platform", title=title, template="plotly_white", barmode="group", hover_data=hover_data, ) fig.update_layout( xaxis_title="Device", yaxis_title="Token/sec" if "Token" in metric else metric, legend_title="Platform", plot_bgcolor="white", height=400, ) return fig def filter_dataframe(df: pd.DataFrame, filters: Dict) -> pd.DataFrame: """Apply all filters to the dataframe""" if df.empty: return df filtered_df = df.copy() # Basic filters if filters["model"] != "All": filtered_df = filtered_df[filtered_df["Model ID"] == filters["model"]] if filters["platform"] != "All": filtered_df = filtered_df[filtered_df["Platform"] == filters["platform"]] if filters["device"] != "All": filtered_df = filtered_df[filtered_df["Device"] == filters["device"]] # Flash Attention filter if filters["flash_attn"] != "All": filtered_df = filtered_df[filtered_df["flash_attn"] == filters["flash_attn"]] # Cache Type filters if filters["cache_type_k"] != "All": filtered_df = filtered_df[ filtered_df["cache_type_k"] == filters["cache_type_k"] ] if filters["cache_type_v"] != "All": filtered_df = filtered_df[ filtered_df["cache_type_v"] == filters["cache_type_v"] ] # Range filters pp_min, pp_max = filters["pp_range"] if pp_min is not None and pp_max is not None: pp_values = filtered_df["PP Config"] filtered_df = filtered_df[(pp_values >= pp_min) & (pp_values <= pp_max)] tg_min, tg_max = filters["tg_range"] if tg_min is not None and tg_max is not None: tg_values = filtered_df["TG Config"] filtered_df = filtered_df[(tg_values >= tg_min) & (tg_values <= tg_max)] n_threads_min, n_threads_max = filters["n_threads"] if n_threads_min is not None and n_threads_max is not None: n_threads = filtered_df["n_threads"] filtered_df = filtered_df[ (n_threads >= n_threads_min) & (n_threads <= n_threads_max) ] n_gpu_layers_min, n_gpu_layers_max = filters["n_gpu_layers"] if n_gpu_layers_min is not None and n_gpu_layers_max is not None: n_gpu_layers = filtered_df["n_gpu_layers"] filtered_df = filtered_df[ (n_gpu_layers >= n_gpu_layers_min) & (n_gpu_layers <= n_gpu_layers_max) ] # Version filter if filters.get("Version") != "All" and filters.get("Version"): filtered_df = filtered_df[filtered_df["Version"] == filters["Version"]] return filtered_df def create_model_size_performance_plot(df: pd.DataFrame, device_id: str, title: str): """Create a plot showing model size vs performance metrics for a specific device""" if df.empty: return None # Filter for the selected device device_df = df[df["Normalized Device ID"] == device_id].copy() if device_df.empty: return None # Create a new figure with secondary y-axis fig = go.Figure() # Add Token Generation data (left y-axis) fig.add_trace( go.Scatter( x=device_df["Model Size"], y=device_df["Token Generation"], name="Token Generation", mode="markers", 
marker=dict(color="#2ecc71"), yaxis="y", ) ) # Add Prompt Processing data (right y-axis) fig.add_trace( go.Scatter( x=device_df["Model Size"], y=device_df["Prompt Processing"], name="Prompt Processing", mode="markers", marker=dict(color="#e74c3c"), yaxis="y2", ) ) # Add trend lines if enough points if len(device_df) > 2: # TG trend line tg_trend = px.scatter( device_df, x="Model Size", y="Token Generation", trendline="lowess" ).data[ 1 ] # Get the trend line trace tg_trend.update( line=dict(color="#2ecc71", dash="solid"), name="TG Trend", showlegend=False, yaxis="y", ) fig.add_trace(tg_trend) # PP trend line pp_trend = px.scatter( device_df, x="Model Size", y="Prompt Processing", trendline="lowess" ).data[ 1 ] # Get the trend line trace pp_trend.update( line=dict(color="#e74c3c", dash="solid"), name="PP Trend", showlegend=False, yaxis="y2", ) fig.add_trace(pp_trend) # Update layout with two y-axes fig.update_layout( title=title, xaxis=dict( title="Model Size (B)", gridcolor="lightgrey", range=[ 0, max(device_df["Model Size"]) * 1.05, ], # Start from 0, add 5% padding to max ), yaxis=dict( title="Token Generation (t/s)", titlefont=dict(color="#2ecc71"), tickfont=dict(color="#2ecc71"), gridcolor="lightgrey", side="left", range=[ 0, max(device_df["Token Generation"]) * 1.05, ], # Start from 0, add 5% padding to max ), yaxis2=dict( title="Prompt Processing (t/s)", titlefont=dict(color="#e74c3c"), tickfont=dict(color="#e74c3c"), anchor="x", overlaying="y", side="right", range=[ 0, max(device_df["Prompt Processing"]) * 1.05, ], # Start from 0, add 5% padding to max ), height=400, showlegend=True, plot_bgcolor="white", legend=dict( yanchor="middle", y=0.8, xanchor="right", x=0.99, bgcolor="rgba(255, 255, 255, 0.8)", # Semi-transparent white background bordercolor="lightgrey", borderwidth=1, ), ) return fig def render_model_size_performance(df: pd.DataFrame, filters: Dict): """Render the model size vs performance section independently""" if df.empty: st.warning("No data available for plotting.") return # Apply all filters from the table size_perf_df = filter_dataframe(df, filters) if size_perf_df.empty: st.warning("No data matches the selected filters.") return # Get the device with highest performance score top_device_id = size_perf_df.loc[size_perf_df["performance_score"].idxmax()][ "Normalized Device ID" ] device_ids = sorted(size_perf_df["Normalized Device ID"].unique()) default_index = device_ids.index(top_device_id) # Create mapping of normalized IDs to display names device_display_names = { device_id: clean_device_id(device_id) for device_id in device_ids } # Device selector for size vs performance plots selected_device_id = st.selectbox( "Select Device", options=device_ids, format_func=lambda x: device_display_names[ x ], # Display clean names in dropdown help="Select a device to view its performance across different model sizes", key="size_perf_device_selector", placeholder="Search for a device...", index=default_index, ) # Create and display the model size vs performance plot size_perf_fig = create_model_size_performance_plot( size_perf_df, selected_device_id, f"Model Size vs Performance Metrics for {device_display_names[selected_device_id]}", ) if size_perf_fig: st.plotly_chart(size_perf_fig, use_container_width=True) else: st.warning("No data available for the selected device.") def render_performance_plots(df: pd.DataFrame, filters: Dict): """Render performance comparison plots""" if df.empty: st.warning("No data available for plotting.") return # Apply filters filtered_df = 
    filtered_df = filter_dataframe(df, filters)
    if filtered_df.empty:
        st.warning("No data matches the selected filters for plotting.")
        return

    # Add the Model Size vs Performance section first
    st.markdown("### 📊 Model Size vs Performance")
    render_model_size_performance(df, filters)


def render_leaderboard_table(df: pd.DataFrame, filters: Dict):
    """Render the leaderboard table with grouped and formatted data."""
    if df.empty:
        st.warning("No data available for the selected filters.")
        return

    # Apply filters
    filtered_df = filter_dataframe(df, filters)
    if filtered_df.empty:
        st.warning("No data matches the selected filters.")
        return

    # Define the preferred column order (grouped logically)
    column_order = [
        # Performance score
        "Performance Score",
        "Quant Factor",
        # Device info
        "Device",
        "Platform",
        "CPU Cores",
        "Total Memory (GB)",
        "Peak Memory (GB)",
        "Memory Usage (%)",
        # Benchmark results
        "PP Config",
        "PP Avg (t/s)",
        "PP Std (t/s)",
        "TG Config",
        "TG Avg (t/s)",
        "TG Std (t/s)",
        # Model config
        "Model ID",
        "Model Size",
        "n_threads",
        "flash_attn",
        "cache_type_k",
        "cache_type_v",
        "n_context",
        "n_batch",
        "n_ubatch",
        "Version",
    ]

    # Group by the selected columns
    grouping_cols = filters["grouping"]
    if not grouping_cols:
        grouping_cols = ["Model ID", "Device", "Platform"]  # Default grouping

    # Create the aggregations (excluding grouping columns)
    agg_dict = {
        col: agg
        for col, agg in {
            "Prompt Processing": ["mean", "std"],
            "Token Generation": ["mean", "std"],
            "Peak Memory (GB)": "mean",
            "Total Memory (GB)": "first",
            "CPU Cores": "first",
            "Model Size": "first",
            "Version": lambda x: ", ".join(sorted(set(x))),
            "n_gpu_layers": lambda x: ", ".join(sorted({str(v) for v in x})),
            "performance_score": "mean",
            "quant_factor": "first",
        }.items()
        if col not in grouping_cols
    }

    # Group and aggregate
    grouped_df = filtered_df.groupby(grouping_cols).agg(agg_dict).reset_index()

    # Flatten column names
    grouped_df.columns = [
        col[0] if col[1] == "" else f"{col[0]} ({col[1]})" for col in grouped_df.columns
    ]

    # Rename columns for display
    column_mapping = {
        "Prompt Processing (mean)": "PP Avg (t/s)",
        "Prompt Processing (std)": "PP Std (t/s)",
        "Token Generation (mean)": "TG Avg (t/s)",
        "Token Generation (std)": "TG Std (t/s)",
        "Memory Usage (%) (mean)": "Memory Usage (%)",
        "Peak Memory (GB) (mean)": "Peak Memory (GB)",
        "PP Config (first)": "PP Config",
        "TG Config (first)": "TG Config",
        "Model Size (first)": "Model Size",
        "CPU Cores (first)": "CPU Cores",
        "Total Memory (GB) (first)": "Total Memory (GB)",
        "n_threads (first)": "n_threads",
        "flash_attn (first)": "flash_attn",
        "cache_type_k (first)": "cache_type_k",
        "cache_type_v (first)": "cache_type_v",
        "n_context (first)": "n_context",
        "n_batch (first)": "n_batch",
        "n_ubatch (first)": "n_ubatch",
        "Version (<lambda>)": "Version",
        "n_gpu_layers (<lambda>)": "n_gpu_layers",
        "performance_score (mean)": "Performance Score",
        "quant_factor (first)": "Quant Factor",
    }
    grouped_df = grouped_df.rename(columns=column_mapping)

    # Sort by performance score
    grouped_df = grouped_df.sort_values("Performance Score", ascending=False)

    # Filter visible columns
    visible_cols = filters["visible_columns"]
    if visible_cols:
        # Map the user-friendly names to actual column names
        column_name_mapping = {
            "Device": "Device",
            "Platform": "Platform",
            "CPU Cores": "CPU Cores",
            "Total Memory (GB)": "Total Memory (GB)",
            "Peak Memory (GB)": "Peak Memory (GB)",
            "Memory Usage (%)": "Memory Usage (%)",
            "PP Config": "PP Config",
            "TG Config": "TG Config",
            "Prompt Processing (mean)": "PP Avg (t/s)",
            "Token Generation (mean)": "TG Avg (t/s)",
            "Prompt Processing (std)": "PP Std (t/s)",
(t/s)", "Model": "Model ID", "Model Size": "Model Size", "Model ID": "Model ID", "n_threads": "n_threads", "flash_attn": "flash_attn", "cache_type_k": "cache_type_k", "cache_type_v": "cache_type_v", "n_context": "n_context", "n_batch": "n_batch", "n_ubatch": "n_ubatch", "Version": "Version", "Performance Score": "Performance Score", "Quant Factor": "Quant Factor", } # Convert visible columns and grouping columns to their mapped names mapped_visible = {column_name_mapping.get(col, col) for col in visible_cols} mapped_grouping = { column_name_mapping.get(col, col) for col in filters["grouping"] } # Always include performance score and quant factor mapped_visible.add("Performance Score") mapped_visible.add("Quant Factor") # Combine both sets to get unique columns all_cols = mapped_visible | mapped_grouping # Create final display columns list display_cols = [] # Get all available columns we want to display available_cols = set(all_cols) # Add columns in the predefined order for col in column_order: if col in available_cols: display_cols.append(col) # Add any remaining columns that weren't in our predefined order remaining_cols = sorted(list(available_cols - set(display_cols))) display_cols.extend(remaining_cols) else: # Default columns if none selected display_cols = ["Performance Score", "Quant Factor"] + column_order[:8] # Display the filtered and grouped table st.markdown("#### 📊 Benchmark Results") st.dataframe( grouped_df[display_cols], use_container_width=True, height=min( 400, (len(grouped_df) + 1) * 35 + 40 ), # Dynamic height based on content hide_index=False, column_config={ "Rank": st.column_config.NumberColumn( "Rank", help="Device ranking based on performance score", ), "Device": st.column_config.TextColumn( "Device", help="Device brand and model", ), "Best Score": st.column_config.NumberColumn( "Score", help="Overall performance score (0-100)", format="%.2f" ), "Best TG Speed": st.column_config.NumberColumn( "Best TG Speed (t/s)", help="Best token generation speed", format="%.2f", ), "Best PP Speed": st.column_config.NumberColumn( "Best PP Speed (t/s)", help="Best prompt processing speed", format="%.2f", ), }, ) def render_device_rankings(df: pd.DataFrame): """Render device rankings with detailed performance metrics.""" if df.empty: st.warning("No data available for device rankings.") return # Create device summary device_summary = ( df.groupby(["Normalized Device ID", "Platform"]) .agg( { "performance_score": "max", # Best score achieved "Model Size": ["min", "max"], # Size range "tg_score": "max", # Use normalized TG score "pp_score": "max", # Use normalized PP score "Model ID": lambda x: ", ".join(sorted(set(x))), # All models tested "quant_factor": lambda x: sorted(set(x)), # Quantization levels tested } ) .reset_index() ) # Flatten column names device_summary.columns = [ "Device ID", # Normalized Device ID for grouping "Platform", "Best Score", "Min Model Size", "Max Model Size", "TG Score", "PP Score", "Tested Models", "Tested Quantizations", ] # Add clean device name device_summary["Device"] = device_summary["Device ID"].apply(clean_device_id) # Create three tabs for different ranking views rank_tab1, rank_tab2, rank_tab3 = st.tabs( ["Overall Rankings", "Rankings by Model Size", "Rankings by Quantization"] ) with rank_tab1: st.subheader("📱 Overall Device Rankings") # Sort by best score overall_rankings = device_summary.sort_values("Best Score", ascending=False) # Add ranking column overall_rankings = overall_rankings.reset_index(drop=True) overall_rankings.index = 
        overall_rankings.index = overall_rankings.index + 1
        overall_rankings = overall_rankings.rename_axis("Rank")

        # Format the display columns
        display_df = overall_rankings.copy()
        display_df["Best Score"] = display_df["Best Score"].round(2)
        display_df["TG Score"] = display_df["TG Score"].round(2)
        display_df["PP Score"] = display_df["PP Score"].round(2)
        display_df["Model Size Range"] = display_df.apply(
            lambda x: f"{x['Min Model Size']:.1f}B - {x['Max Model Size']:.1f}B", axis=1
        )

        # Select and reorder the columns for display
        display_cols = [
            "Device",  # Use the clean device name for display
            "Platform",
            "Best Score",
            "TG Score",
            "PP Score",
            "Model Size Range",
        ]

        st.dataframe(
            display_df[display_cols],
            use_container_width=True,
            height=min(
                600, (len(display_df) + 1) * 35 + 40
            ),  # Dynamic height based on content
            hide_index=False,
            column_config={
                "Rank": st.column_config.NumberColumn(
                    "Rank",
                    help="Device ranking based on performance score",
                ),
                "Device": st.column_config.TextColumn(
                    "Device",
                    help="Device brand and model",
                ),
                "Best Score": st.column_config.NumberColumn(
                    "Score", help="Overall performance score (0-100)", format="%.2f"
                ),
                "TG Score": st.column_config.NumberColumn(
                    "TG Score",
                    help="Normalized Token Generation score (0-100)",
                    format="%.2f",
                ),
                "PP Score": st.column_config.NumberColumn(
                    "PP Score",
                    help="Normalized Prompt Processing score (0-100)",
                    format="%.2f",
                ),
            },
        )

    with rank_tab2:
        st.subheader("📊 Rankings by Model Size")

        # Define the model size categories
        def get_size_category(size):
            if size < 1:
                return "Tiny (<1B)"
            elif size < 2:
                return "Small (1-2B)"
            elif size < 4:
                return "Medium (2-4B)"
            elif size < 8:
                return "Large (4-8B)"
            else:
                return "Extra Large (>8B)"

        # Create the size-based rankings
        size_rankings = df.copy()
        size_rankings["Size Category"] = size_rankings["Model Size"].apply(
            get_size_category
        )

        size_summary = (
            size_rankings.groupby(["Normalized Device ID", "Platform", "Size Category"])
            .agg(
                {
                    "performance_score": ["max", "mean"],
                    "tg_score": "max",  # Use normalized scores
                    "pp_score": "max",  # Use normalized scores
                    "Model ID": lambda x: ", ".join(sorted(set(x))),
                }
            )
            .reset_index()
        )

        # Flatten and rename the columns
        size_summary.columns = [
            "Device ID",
            "Platform",
            "Size Category",
            "Best Score",
            "Avg Score",
            "TG Score",
            "PP Score",
            "Models",
        ]

        # Add the clean device name
        size_summary["Device"] = size_summary["Device ID"].apply(clean_device_id)

        # Format and display each category
        for size_cat in sorted(size_summary["Size Category"].unique()):
            st.markdown(f"##### {size_cat}")

            cat_data = size_summary[size_summary["Size Category"] == size_cat].copy()
            cat_data = cat_data.sort_values("Best Score", ascending=False)

            # Add the ranking column
            cat_data = cat_data.reset_index(drop=True)
            cat_data.index = cat_data.index + 1
            cat_data = cat_data.rename_axis("Rank")

            # Format the scores
            cat_data["Best Score"] = cat_data["Best Score"].round(2)
            cat_data["Avg Score"] = cat_data["Avg Score"].round(2)
            cat_data["TG Score"] = cat_data["TG Score"].round(2)
            cat_data["PP Score"] = cat_data["PP Score"].round(2)

            display_cols = [
                "Device",  # Use the clean device name for display
                "Platform",
                "Best Score",
                "Avg Score",
                "TG Score",
                "PP Score",
            ]

            st.dataframe(
                cat_data[display_cols],
                use_container_width=True,
                height=min(
                    300, (len(cat_data) + 1) * 35 + 40
                ),  # Slightly smaller for category tables
                hide_index=False,
                column_config={
                    "Rank": st.column_config.NumberColumn(
                        "Rank",
                        help="Device ranking within this size category",
                    ),
                    "Device": st.column_config.TextColumn(
                        "Device",
                        help="Device brand and model",
                    ),
                    "Best Score": st.column_config.NumberColumn(
                        "Best Score",
help="Best performance score achieved", format="%.2f", ), "Avg Score": st.column_config.NumberColumn( "Avg Score", help="Average performance score", format="%.2f" ), "TG Score": st.column_config.NumberColumn( "TG Score", help="Normalized Token Generation score (0-100)", format="%.2f", ), "PP Score": st.column_config.NumberColumn( "PP Score", help="Normalized Prompt Processing score (0-100)", format="%.2f", ), }, ) with rank_tab3: st.subheader("🔍 Rankings by Quantization") # Helper function to get quantization name from factor def get_quant_name(factor: float) -> str: if factor >= 1.0: return "No Quantization (F16/F32)" quant_map = { 0.8: "[i]Q8_x", 0.6: "[i]Q6_x", 0.5: "[i]Q5_x", 0.4: "[i]Q4_x", 0.3: "[i]Q3_x", 0.2: "[i]Q2_x", 0.1: "[i]Q1_x", } return quant_map.get(factor, f"Q{int(factor*10)}_x") # Group by device and quantization level quant_rankings = df.copy() quant_summary = ( quant_rankings.groupby(["Normalized Device ID", "Platform", "quant_factor"]) .agg( { "performance_score": ["max", "mean"], "tg_score": "max", "pp_score": "max", "Model ID": lambda x: ", ".join(sorted(set(x))), } ) .reset_index() ) # Flatten and rename columns quant_summary.columns = [ "Device ID", "Platform", "Quant Factor", "Best Score", "Avg Score", "TG Score", "PP Score", "Models", ] # Add clean device name quant_summary["Device"] = quant_summary["Device ID"].apply(clean_device_id) # Format and display for each quantization tier for quant_level in sorted(quant_summary["Quant Factor"].unique(), reverse=True): quant_name = get_quant_name(quant_level) st.markdown(f"##### Quantization Level: {quant_name}") quant_data = quant_summary[ quant_summary["Quant Factor"] == quant_level ].copy() quant_data = quant_data.sort_values("Best Score", ascending=False) # Add ranking column quant_data = quant_data.reset_index(drop=True) quant_data.index = quant_data.index + 1 quant_data = quant_data.rename_axis("Rank") # Format scores quant_data["Best Score"] = quant_data["Best Score"].round(2) quant_data["Avg Score"] = quant_data["Avg Score"].round(2) quant_data["TG Score"] = quant_data["TG Score"].round(2) quant_data["PP Score"] = quant_data["PP Score"].round(2) display_cols = [ "Device", "Platform", "Best Score", "Avg Score", "TG Score", "PP Score", ] st.dataframe( quant_data[display_cols], use_container_width=True, height=min( 300, (len(quant_data) + 1) * 35 + 40 ), # Slightly smaller for quantization tables hide_index=False, column_config={ "Rank": st.column_config.NumberColumn( "Rank", help="Device ranking within this quantization level", ), "Device": st.column_config.TextColumn( "Device", help="Device brand and model", ), "Best Score": st.column_config.NumberColumn( "Best Score", help="Best performance score achieved", format="%.2f", ), "Avg Score": st.column_config.NumberColumn( "Avg Score", help="Average performance score", format="%.2f" ), "TG Score": st.column_config.NumberColumn( "TG Score", help="Normalized Token Generation score (0-100)", format="%.2f", ), "PP Score": st.column_config.NumberColumn( "PP Score", help="Normalized Prompt Processing score (0-100)", format="%.2f", ), }, )