Lisa Dunlap commited on
Commit
4862c84
Β·
0 Parent(s):
Files changed (41) hide show
  1. .gitattributes +39 -0
  2. .gitignore +7 -0
  3. data/aci_bench/cluster_scores.json +3 -0
  4. data/aci_bench/cluster_scores_df.jsonl +3 -0
  5. data/aci_bench/clustered_results_lightweight.jsonl +3 -0
  6. data/aci_bench/clusters.json +3 -0
  7. data/aci_bench/model_cluster_scores.json +3 -0
  8. data/aci_bench/model_cluster_scores_df.jsonl +3 -0
  9. data/aci_bench/model_scores.json +3 -0
  10. data/aci_bench/model_scores_df.jsonl +3 -0
  11. data/aci_bench/model_stats.json +3 -0
  12. data/aci_bench/parsed_properties.jsonl +3 -0
  13. data/aci_bench/parsing_error_summary.json +3 -0
  14. data/aci_bench/parsing_failures.jsonl +3 -0
  15. data/aci_bench/parsing_stats.json +3 -0
  16. data/aci_bench/summary.txt +33 -0
  17. data/aci_bench/summary_table.json +3 -0
  18. data/aci_bench/summary_table.jsonl +3 -0
  19. data/aci_bench/validated_properties.jsonl +3 -0
  20. data/aci_bench/validation_stats.json +3 -0
  21. lmmvibes/__init__.py +0 -0
  22. lmmvibes/metrics/plotting.py +616 -0
  23. lmmvibes/utils/__init__.py +1 -0
  24. lmmvibes/utils/persistent_storage.py +80 -0
  25. lmmvibes/vis_gradio/__init__.py +13 -0
  26. lmmvibes/vis_gradio/app.py +697 -0
  27. lmmvibes/vis_gradio/clusters_tab.py +199 -0
  28. lmmvibes/vis_gradio/conversation_display.py +507 -0
  29. lmmvibes/vis_gradio/data_loader.py +189 -0
  30. lmmvibes/vis_gradio/debug_tab.py +83 -0
  31. lmmvibes/vis_gradio/demo.py +73 -0
  32. lmmvibes/vis_gradio/examples_tab.py +129 -0
  33. lmmvibes/vis_gradio/frequency_tab.py +307 -0
  34. lmmvibes/vis_gradio/launcher.py +122 -0
  35. lmmvibes/vis_gradio/load_data_tab.py +147 -0
  36. lmmvibes/vis_gradio/metrics_adapter.py +46 -0
  37. lmmvibes/vis_gradio/overview_tab.py +82 -0
  38. lmmvibes/vis_gradio/plots_tab.py +284 -0
  39. lmmvibes/vis_gradio/side_by_side_display.py +202 -0
  40. lmmvibes/vis_gradio/state.py +27 -0
  41. lmmvibes/vis_gradio/utils.py +1673 -0
.gitattributes ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.jsonl filter=lfs diff=lfs merge=lfs -text
37
+ *.json filter=lfs diff=lfs merge=lfs -text
38
+ *.png filter=lfs diff=lfs merge=lfs -text
39
+ results/**/plots/*.png -filter -merge -diff -text
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+
3
+ # Ignore generated plot images
4
+ results/**/plots/*.png
5
+
6
+ # Ignore large results directories (data now tracked with LFS)
7
+ results/**
data/aci_bench/cluster_scores.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9800cfb95cb3992d39649d61d01d326f7cd57fdc1e6253cd7a21b83be007762
3
+ size 35290231
data/aci_bench/cluster_scores_df.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da1c901319ffa8aa23f4e53cfd7bf8f81bf1013c30369e589adb3383136a88cb
3
+ size 33773423
data/aci_bench/clustered_results_lightweight.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:194429736798d0857962dd4b719c23608ae29606137ecd5d0fd979cacb1deb4a
3
+ size 92743484
data/aci_bench/clusters.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a00c7a0b16723d80fd3490ef658c913b1384f8eb68c8a549e8b50251c4bdf60
3
+ size 447437
data/aci_bench/model_cluster_scores.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4af5a4765a37b003e115b808a09ec4e95ebce3e302854957893f9b563b3cdc1e
3
+ size 35639398
data/aci_bench/model_cluster_scores_df.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cabadb1369aae14d6dbe08dbc4dee6d701891fe9426fbe52588bbc477a1b5995
3
+ size 33839755
data/aci_bench/model_scores.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93968fdf5604d473d031a4731127603eb3a6f27eba041e7564e52df85dc987f5
3
+ size 35279538
data/aci_bench/model_scores_df.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d2c024085528e9afeda447a975da35099b9f323a57db7e6695e444f6021dd13
3
+ size 33766092
data/aci_bench/model_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e7b3e1831735691cb43135355719f8d822deda3b64af9baeb02eb403cfb1546
3
+ size 127543
data/aci_bench/parsed_properties.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db2a42c37fd60ecd830569cb776973e16da4acbd4ff9581d8a064239e702e66d
3
+ size 2441177
data/aci_bench/parsing_error_summary.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2915c2fa4df41abe202b65cb7f84c1824fd64bad5a993d88c9349e25352b47ff
3
+ size 27
data/aci_bench/parsing_failures.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b10b336d6d58227d03a5f83fa8e0cbbefaadeb73a497363b67e68e3a01cf742
3
+ size 3665
data/aci_bench/parsing_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00197f1b62199cf7d8265acb34f073f0938c694b7230827a67086cd901c3f32e
3
+ size 219
data/aci_bench/summary.txt ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LMM-Vibes Results Summary
2
+ ==================================================
3
+
4
+ Total conversations: 720
5
+ Total properties: 4146
6
+ Models analyzed: 1
7
+
8
+ Output files:
9
+ - raw_properties.jsonl: Raw LLM responses
10
+ - extraction_stats.json: Extraction statistics
11
+ - extraction_samples.jsonl: Sample inputs/outputs
12
+ - parsed_properties.jsonl: Parsed property objects
13
+ - parsing_stats.json: Parsing statistics
14
+ - parsing_failures.jsonl: Failed parsing attempts
15
+ - validated_properties.jsonl: Validated properties
16
+ - validation_stats.json: Validation statistics
17
+ - clustered_results.jsonl: Complete clustered data
18
+ - embeddings.parquet: Embeddings data
19
+ - clustered_results_lightweight.jsonl: Data without embeddings
20
+ - summary_table.jsonl: Clustering summary
21
+ - model_cluster_scores.json: Per model-cluster combination metrics
22
+ - cluster_scores.json: Per cluster metrics (aggregated across models)
23
+ - model_scores.json: Per model metrics (aggregated across clusters)
24
+ - full_dataset.json: Complete PropertyDataset (JSON format)
25
+ - full_dataset.parquet: Complete PropertyDataset (parquet format, or .jsonl if mixed data types)
26
+
27
+ Model Rankings (by average quality score):
28
+ 1. openai/gpt-4o: 0.833
29
+ 2. google/gemini-1.5-pro-001: 0.828
30
+ 3. openai/gpt-4o-mini: 0.828
31
+ 4. meta/llama-3.3-70b-instruct: 0.827
32
+ 5. qwen/qwen2.5-7b-instruct: 0.818
33
+ 6. microsoft/phi-3.5-mini-instruct: 0.806
data/aci_bench/summary_table.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dca91c6976f8751e65d262c12e42451e9880386ae51fe93a62e53e355ac9ba9f
3
+ size 58069
data/aci_bench/summary_table.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:098126fa13c7dd247263c87866cbacbcd583229470a34411022d5af130967d52
3
+ size 56818
data/aci_bench/validated_properties.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db2a42c37fd60ecd830569cb776973e16da4acbd4ff9581d8a064239e702e66d
3
+ size 2441177
data/aci_bench/validation_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ba5d5c25ab20c2a8bfa51202ebc7a4c59af49af68fbe385ac0aca9c2960c4ce
3
+ size 137
lmmvibes/__init__.py ADDED
File without changes
lmmvibes/metrics/plotting.py ADDED
@@ -0,0 +1,616 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Plotting functionality for functional metrics.
3
+
4
+ This module provides comprehensive visualization of metrics from functional_metrics.py,
5
+ """
6
+
7
+ import json
8
+ import pandas as pd
9
+ import numpy as np
10
+ from pathlib import Path
11
+ from typing import Dict, Any, List, Optional
12
+ import warnings
13
+
14
+ import plotly.graph_objects as go
15
+ import plotly.express as px
16
+ from plotly.subplots import make_subplots
17
+ import plotly.io as pio
18
+
19
+ # Set plotly template
20
+ pio.templates.default = "plotly_white"
21
+ warnings.filterwarnings('ignore')
22
+
23
+
24
def create_model_cluster_dataframe(model_cluster_scores: Dict[str, Any]) -> pd.DataFrame:
    """Convert model-cluster scores to a tidy dataframe.

    Flattens the nested ``{model: {cluster: metrics}}`` mapping into one row
    per (model, cluster) pair, expanding confidence intervals and per-metric
    quality scores into flat columns. The synthetic "No properties" cluster
    is excluded.
    """

    def _flatten_ci(prefix: str, ci: Dict[str, Any]) -> Dict[str, Any]:
        # Expand a CI dict into flat lower/upper/mean columns.
        return {
            f'{prefix}_ci_lower': ci.get('lower', 0),
            f'{prefix}_ci_upper': ci.get('upper', 0),
            f'{prefix}_ci_mean': ci.get('mean', 0),
        }

    records = []
    for model_name, cluster_map in model_cluster_scores.items():
        for cluster_name, stats in cluster_map.items():
            # Skip the bucket for conversations without extracted properties.
            if cluster_name == "No properties":
                continue

            record = {
                'model': model_name,
                'cluster': cluster_name,
                'size': stats.get('size', 0),
                'proportion': stats.get('proportion', 0),
                'proportion_delta': stats.get('proportion_delta', 0),
            }

            # Optional confidence intervals on proportion metrics.
            if 'proportion_ci' in stats:
                record.update(_flatten_ci('proportion', stats['proportion_ci']))
            if 'proportion_delta_ci' in stats:
                record.update(_flatten_ci('proportion_delta', stats['proportion_delta_ci']))

            record['proportion_delta_significant'] = stats.get('proportion_delta_significant', False)

            # Per-metric quality scores, deltas, significance flags and CIs.
            quality = stats.get('quality', {})
            quality_delta = stats.get('quality_delta', {})
            quality_ci = stats.get('quality_ci', {})
            quality_delta_ci = stats.get('quality_delta_ci', {})
            sig_flags = stats.get('quality_delta_significant', {})

            for name, value in quality.items():
                record[f'quality_{name}'] = value
                record[f'quality_delta_{name}'] = quality_delta.get(name, 0)
                record[f'quality_delta_{name}_significant'] = sig_flags.get(name, False)
                if name in quality_ci:
                    record.update(_flatten_ci(f'quality_{name}', quality_ci[name]))
                if name in quality_delta_ci:
                    record.update(_flatten_ci(f'quality_delta_{name}', quality_delta_ci[name]))

            records.append(record)

    return pd.DataFrame(records)
92
+
93
+
94
def create_cluster_dataframe(cluster_scores: Dict[str, Any]) -> pd.DataFrame:
    """Convert cluster scores to a tidy dataframe.

    One row per cluster (aggregated across models); confidence intervals and
    per-metric quality scores are expanded into flat columns. The synthetic
    "No properties" cluster is excluded.
    """

    def _flatten_ci(prefix: str, ci: Dict[str, Any]) -> Dict[str, Any]:
        # Expand a CI dict into flat lower/upper/mean columns.
        return {
            f'{prefix}_ci_lower': ci.get('lower', 0),
            f'{prefix}_ci_upper': ci.get('upper', 0),
            f'{prefix}_ci_mean': ci.get('mean', 0),
        }

    records = []
    for cluster_name, stats in cluster_scores.items():
        # Skip the bucket for conversations without extracted properties.
        if cluster_name == "No properties":
            continue

        record = {
            'cluster': cluster_name,
            'size': stats.get('size', 0),
            'proportion': stats.get('proportion', 0),
        }

        if 'proportion_ci' in stats:
            record.update(_flatten_ci('proportion', stats['proportion_ci']))

        # Per-metric quality scores, deltas, significance flags and CIs.
        quality = stats.get('quality', {})
        quality_delta = stats.get('quality_delta', {})
        quality_ci = stats.get('quality_ci', {})
        quality_delta_ci = stats.get('quality_delta_ci', {})
        sig_flags = stats.get('quality_delta_significant', {})

        for name, value in quality.items():
            record[f'quality_{name}'] = value
            record[f'quality_delta_{name}'] = quality_delta.get(name, 0)
            record[f'quality_delta_{name}_significant'] = sig_flags.get(name, False)
            if name in quality_ci:
                record.update(_flatten_ci(f'quality_{name}', quality_ci[name]))
            if name in quality_delta_ci:
                record.update(_flatten_ci(f'quality_delta_{name}', quality_delta_ci[name]))

        records.append(record)

    return pd.DataFrame(records)
148
+
149
+
150
def create_model_dataframe(model_scores: Dict[str, Any]) -> pd.DataFrame:
    """Convert model scores to a tidy dataframe.

    One row per model (aggregated across clusters); confidence intervals and
    per-metric quality scores are expanded into flat columns.
    """

    def _flatten_ci(prefix: str, ci: Dict[str, Any]) -> Dict[str, Any]:
        # Expand a CI dict into flat lower/upper/mean columns.
        return {
            f'{prefix}_ci_lower': ci.get('lower', 0),
            f'{prefix}_ci_upper': ci.get('upper', 0),
            f'{prefix}_ci_mean': ci.get('mean', 0),
        }

    records = []
    for model_name, stats in model_scores.items():
        record = {
            'model': model_name,
            'size': stats.get('size', 0),
            'proportion': stats.get('proportion', 0),
        }

        if 'proportion_ci' in stats:
            record.update(_flatten_ci('proportion', stats['proportion_ci']))

        # Per-metric quality scores, deltas, significance flags and CIs.
        quality = stats.get('quality', {})
        quality_delta = stats.get('quality_delta', {})
        quality_ci = stats.get('quality_ci', {})
        quality_delta_ci = stats.get('quality_delta_ci', {})
        sig_flags = stats.get('quality_delta_significant', {})

        for name, value in quality.items():
            record[f'quality_{name}'] = value
            record[f'quality_delta_{name}'] = quality_delta.get(name, 0)
            record[f'quality_delta_{name}_significant'] = sig_flags.get(name, False)
            if name in quality_ci:
                record.update(_flatten_ci(f'quality_{name}', quality_ci[name]))
            if name in quality_delta_ci:
                record.update(_flatten_ci(f'quality_delta_{name}', quality_delta_ci[name]))

        records.append(record)

    return pd.DataFrame(records)
200
+
201
+
202
def get_quality_metrics(df: pd.DataFrame) -> List[str]:
    """Extract quality metric names from dataframe columns.

    Returns the names of all ``quality_*`` columns with the prefix stripped,
    excluding the derived CI-bound and significance columns.
    """
    derived_suffixes = ('_ci_lower', '_ci_upper', '_ci_mean', '_significant')
    names = []
    for column in df.columns:
        if column.startswith('quality_') and not column.endswith(derived_suffixes):
            names.append(column.replace('quality_', ''))
    return names
206
+
207
+
208
def create_interactive_cluster_plot(cluster_df: pd.DataFrame, model_cluster_df: pd.DataFrame,
                                   metric_col: str, title: str,
                                   ci_lower_col: Optional[str] = None, ci_upper_col: Optional[str] = None,
                                   significant_col: Optional[str] = None) -> go.Figure:
    """Create an interactive cluster plot with dropdown for view mode.

    Builds a bar chart of ``metric_col`` per cluster with two switchable views:
    an aggregated view (one bar per cluster, from ``cluster_df``) and a
    per-model view (one bar series per model, from ``model_cluster_df``).

    Args:
        cluster_df: Per-cluster aggregated metrics (one row per cluster, or a
            frame indexed by cluster).
        model_cluster_df: Per (model, cluster) metrics; must have 'model' and
            'cluster' columns.
        metric_col: Column plotted on the y-axis.
        title: Figure/subplot title.
        ci_lower_col: Optional CI lower-bound column; with ``ci_upper_col``,
            rendered as asymmetric error bars.
        ci_upper_col: Optional CI upper-bound column.
        significant_col: Optional boolean column; truthy clusters get a red
            "*" annotation in the aggregated view.

    Returns:
        A plotly Figure with an updatemenu dropdown toggling the two views.

    NOTE(review): trace insertion order below (aggregated trace first, then
    one trace per model) must stay in sync with the visibility lists built
    for the dropdown buttons at the end of this function.
    """

    # Create the figure with subplots
    fig = make_subplots(
        rows=1, cols=1,
        specs=[[{"secondary_y": False}]],
        subplot_titles=[title]
    )

    # Prepare cluster_df - reset index if cluster is the index
    if 'cluster' not in cluster_df.columns and cluster_df.index.name == 'cluster':
        cluster_df = cluster_df.reset_index()

    # Sort clusters by metric value in descending order for consistent ordering
    cluster_df = cluster_df.sort_values(metric_col, ascending=False)

    # Add aggregated view (default) - using cluster_df
    if ci_lower_col and ci_upper_col and ci_lower_col in cluster_df.columns and ci_upper_col in cluster_df.columns:
        fig.add_trace(
            go.Bar(
                x=cluster_df['cluster'],
                y=cluster_df[metric_col],
                name='Aggregated (All Models)',
                # Asymmetric error bars derived from the CI bound columns.
                error_y=dict(
                    type='data',
                    array=cluster_df[ci_upper_col] - cluster_df[metric_col],
                    arrayminus=cluster_df[metric_col] - cluster_df[ci_lower_col],
                    visible=True
                ),
                visible=True
            )
        )
    else:
        fig.add_trace(
            go.Bar(
                x=cluster_df['cluster'],
                y=cluster_df[metric_col],
                name='Aggregated (All Models)',
                visible=True
            )
        )

    # Grouped by model view - using model_cluster_df (traces start hidden)
    for model in model_cluster_df['model'].unique():
        model_df = model_cluster_df[model_cluster_df['model'] == model]
        # Sort model_df to match the cluster order of the aggregated view
        model_df = model_df.set_index('cluster').reindex(cluster_df['cluster']).reset_index()
        if ci_lower_col and ci_upper_col and ci_lower_col in model_cluster_df.columns and ci_upper_col in model_cluster_df.columns:
            fig.add_trace(
                go.Bar(
                    x=model_df['cluster'],
                    y=model_df[metric_col],
                    name=f'Model: {model}',
                    error_y=dict(
                        type='data',
                        array=model_df[ci_upper_col] - model_df[metric_col],
                        arrayminus=model_df[metric_col] - model_df[ci_lower_col],
                        visible=False
                    ),
                    visible=False
                )
            )
        else:
            fig.add_trace(
                go.Bar(
                    x=model_df['cluster'],
                    y=model_df[metric_col],
                    name=f'Model: {model}',
                    visible=False
                )
            )

    # Add significance markers if available (for aggregated view)
    # Red asterisks (*) indicate clusters with statistically significant quality delta values
    # (confidence intervals that do not contain 0)
    if significant_col and significant_col in cluster_df.columns:
        for i, (cluster, is_sig) in enumerate(zip(cluster_df['cluster'], cluster_df[significant_col])):
            if is_sig:
                fig.add_annotation(
                    x=cluster,
                    y=cluster_df[cluster_df['cluster'] == cluster][metric_col].iloc[0],
                    text="*",
                    showarrow=False,
                    font=dict(size=16, color="red"),
                    yshift=10
                )

    # Update layout; the footer annotation explains the significance marker.
    fig.update_layout(
        title=title,
        xaxis_title="Cluster",
        yaxis_title=metric_col.replace('_', ' ').title(),
        barmode='group',
        height=500,
        showlegend=True,
        annotations=[
            dict(
                text="* = Statistically significant (CI does not contain 0)",
                showarrow=False,
                xref="paper", yref="paper",
                x=0.01, y=0.01,
                xanchor="left", yanchor="bottom",
                font=dict(size=10, color="red")
            )
        ] if significant_col and significant_col in cluster_df.columns else []
    )

    # Add dropdown for view selection - only 2 options
    buttons = []

    # Aggregated view button (all models combined): show trace 0, hide the rest
    visibility = [True] + [False] * len(model_cluster_df['model'].unique())
    buttons.append(
        dict(
            label="Aggregated (All Models)",
            method="update",
            args=[{"visible": visibility, "barmode": "group"}]
        )
    )

    # Grouped by model view: hide trace 0, show one trace per model
    visibility = [False] + [True] * len(model_cluster_df['model'].unique())
    buttons.append(
        dict(
            label="Grouped by Model",
            method="update",
            args=[{"visible": visibility, "barmode": "group"}]
        )
    )

    fig.update_layout(
        updatemenus=[
            dict(
                buttons=buttons,
                direction="down",
                showactive=True,
                x=0.95,
                xanchor="right",
                y=1.25,
                yanchor="top"
            )
        ]
    )

    return fig
357
+
358
+
359
def create_interactive_heatmap(df: pd.DataFrame, value_col: str, title: str,
                              pivot_index: str = 'model', pivot_columns: str = 'cluster',
                              significant_col: Optional[str] = None) -> go.Figure:
    """Create an interactive heatmap with hover information.

    Args:
        df: Tidy frame with ``pivot_index``, ``pivot_columns`` and
            ``value_col`` columns (e.g. from create_model_cluster_dataframe).
        value_col: Column providing cell values.
        title: Figure title.
        pivot_index: Column used as the pivot index (rows before transpose).
        pivot_columns: Column used as the pivot columns.
        significant_col: Optional boolean column; truthy cells get a red "*".

    Returns:
        A plotly Figure with models on the x-axis and clusters on the y-axis.
    """

    # Create pivot table (missing model/cluster combinations become NaN)
    pivot_df = df.pivot(index=pivot_index, columns=pivot_columns, values=value_col)

    # Sort by mean values for consistent ordering
    if pivot_index == 'model':
        # Sort models by their mean values across clusters
        model_means = pivot_df.mean(axis=1).sort_values(ascending=False)
        pivot_df = pivot_df.reindex(model_means.index)
    else:
        # Sort clusters by their mean values across models
        cluster_means = pivot_df.mean(axis=0).sort_values(ascending=False)
        pivot_df = pivot_df.reindex(columns=cluster_means.index)

    # Transpose the data for more intuitive visualization (models on x-axis, clusters on y-axis)
    pivot_df = pivot_df.T

    # Create heatmap: diverging scale centered at 0 for delta metrics,
    # sequential scale otherwise.
    fig = go.Figure(data=go.Heatmap(
        z=pivot_df.values,
        x=pivot_df.columns,  # Models
        y=pivot_df.index,  # Clusters
        colorscale='RdBu_r' if 'delta' in value_col else 'Viridis',
        zmid=0 if 'delta' in value_col else None,
        text=pivot_df.values.round(3),
        texttemplate="%{text}",
        textfont={"size": 10},
        hoverongaps=False
    ))

    # Add significance markers if available
    if significant_col and significant_col in df.columns:
        sig_pivot = df.pivot(index=pivot_index, columns=pivot_columns, values=significant_col)
        # Apply same sorting as the main pivot
        if pivot_index == 'model':
            sig_pivot = sig_pivot.reindex(model_means.index)
        else:
            sig_pivot = sig_pivot.reindex(columns=cluster_means.index)
        sig_pivot = sig_pivot.T  # Transpose to match the main heatmap
        for i, cluster in enumerate(pivot_df.index):
            for j, model in enumerate(pivot_df.columns):
                flag = sig_pivot.loc[cluster, model]
                # BUGFIX: missing (model, cluster) combinations pivot to NaN,
                # and a scalar NaN is truthy as a float — which previously drew
                # spurious "*" markers on absent cells. Require a non-null,
                # truthy flag before annotating.
                if pd.notna(flag) and bool(flag):
                    fig.add_annotation(
                        x=model,
                        y=cluster,
                        text="*",
                        showarrow=False,
                        font=dict(size=16, color="red"),
                        xshift=10,
                        yshift=10
                    )

    fig.update_layout(
        title=title,
        xaxis_title="Model",
        yaxis_title="Cluster",
        height=500,
        annotations=[
            dict(
                text="* = Statistically significant (CI does not contain 0)",
                showarrow=False,
                xref="paper", yref="paper",
                x=0.01, y=0.01,
                xanchor="left", yanchor="bottom",
                font=dict(size=10, color="red")
            )
        ] if significant_col and significant_col in df.columns else []
    )

    return fig
433
+
434
+
435
def create_interactive_model_plot(model_df: pd.DataFrame, model_cluster_df: pd.DataFrame,
                                 metric_col: str, title: str,
                                 ci_lower_col: Optional[str] = None, ci_upper_col: Optional[str] = None,
                                 significant_col: Optional[str] = None) -> go.Figure:
    """Create an interactive model plot with dropdown for view mode.

    Builds a bar chart of ``metric_col`` per model with two switchable views:
    an aggregated view (one bar per model, from ``model_df``) and a
    per-cluster view (one bar series per cluster, from ``model_cluster_df``).

    Args:
        model_df: Per-model aggregated metrics (one row per model, or a frame
            indexed by model).
        model_cluster_df: Per (model, cluster) metrics; must have 'model' and
            'cluster' columns.
        metric_col: Column plotted on the y-axis.
        title: Figure/subplot title.
        ci_lower_col: Optional CI lower-bound column; with ``ci_upper_col``,
            rendered as asymmetric error bars.
        ci_upper_col: Optional CI upper-bound column.
        significant_col: Optional boolean column; truthy models get a red "*"
            annotation in the aggregated view.

    Returns:
        A plotly Figure with an updatemenu dropdown toggling the two views.

    NOTE(review): trace insertion order (aggregated trace first, then one
    trace per cluster) must stay in sync with the dropdown visibility lists
    built at the end of this function.
    """

    # Create the figure with subplots
    fig = make_subplots(
        rows=1, cols=1,
        specs=[[{"secondary_y": False}]],
        subplot_titles=[title]
    )

    # Prepare model_df - reset index if model is the index
    if 'model' not in model_df.columns and model_df.index.name == 'model':
        model_df = model_df.reset_index()

    # Add aggregated view (default) - using model_df
    if ci_lower_col and ci_upper_col and ci_lower_col in model_df.columns and ci_upper_col in model_df.columns:
        fig.add_trace(
            go.Bar(
                x=model_df['model'],
                y=model_df[metric_col],
                name='Aggregated (All Clusters)',
                # Asymmetric error bars derived from the CI bound columns.
                error_y=dict(
                    type='data',
                    array=model_df[ci_upper_col] - model_df[metric_col],
                    arrayminus=model_df[metric_col] - model_df[ci_lower_col],
                    visible=True
                ),
                visible=True
            )
        )
    else:
        fig.add_trace(
            go.Bar(
                x=model_df['model'],
                y=model_df[metric_col],
                name='Aggregated (All Clusters)',
                visible=True
            )
        )

    # Grouped by cluster view - using model_cluster_df (traces start hidden)
    for cluster in model_cluster_df['cluster'].unique():
        cluster_df = model_cluster_df[model_cluster_df['cluster'] == cluster]
        if ci_lower_col and ci_upper_col and ci_lower_col in cluster_df.columns and ci_upper_col in cluster_df.columns:
            fig.add_trace(
                go.Bar(
                    x=cluster_df['model'],
                    y=cluster_df[metric_col],
                    name=f'Cluster: {cluster}',
                    error_y=dict(
                        type='data',
                        array=cluster_df[ci_upper_col] - cluster_df[metric_col],
                        arrayminus=cluster_df[metric_col] - cluster_df[ci_lower_col],
                        visible=False
                    ),
                    visible=False
                )
            )
        else:
            fig.add_trace(
                go.Bar(
                    x=cluster_df['model'],
                    y=cluster_df[metric_col],
                    name=f'Cluster: {cluster}',
                    visible=False
                )
            )

    # Add significance markers if available (for aggregated view)
    if significant_col and significant_col in model_df.columns:
        for i, (model, is_sig) in enumerate(zip(model_df['model'], model_df[significant_col])):
            if is_sig:
                fig.add_annotation(
                    x=model,
                    y=model_df[model_df['model'] == model][metric_col].iloc[0],
                    text="*",
                    showarrow=False,
                    font=dict(size=16, color="red"),
                    yshift=10
                )

    # Update layout
    fig.update_layout(
        title=title,
        xaxis_title="Model",
        yaxis_title=metric_col.replace('_', ' ').title(),
        barmode='group',
        height=500,
        showlegend=True
    )

    # Add dropdown for view selection - only 2 options
    buttons = []

    # Aggregated view button (all clusters combined): show trace 0 only
    visibility = [True] + [False] * len(model_cluster_df['cluster'].unique())
    buttons.append(
        dict(
            label="Aggregated (All Clusters)",
            method="update",
            args=[{"visible": visibility, "barmode": "group"}]
        )
    )

    # Grouped by cluster view: hide trace 0, show one trace per cluster
    visibility = [False] + [True] * len(model_cluster_df['cluster'].unique())
    buttons.append(
        dict(
            label="Grouped by Cluster",
            method="update",
            args=[{"visible": visibility, "barmode": "group"}]
        )
    )

    fig.update_layout(
        updatemenus=[
            dict(
                buttons=buttons,
                direction="down",
                showactive=True,
                x=0.95,
                xanchor="right",
                y=1.25,
                yanchor="top"
            )
        ]
    )

    return fig
567
+
568
+
569
def create_interactive_model_cluster_plot(df: pd.DataFrame, metric_col: str, title: str,
                                         ci_lower_col: Optional[str] = None, ci_upper_col: Optional[str] = None,
                                         significant_col: Optional[str] = None) -> go.Figure:
    """Create an interactive model-cluster plot with grouped bars.

    Args:
        df: Tidy frame with 'cluster', 'model' and ``metric_col`` columns.
        metric_col: Column plotted on the y-axis.
        title: Figure title.
        ci_lower_col: Optional CI lower-bound column; with ``ci_upper_col``,
            rendered as asymmetric error bars.
        ci_upper_col: Optional CI upper-bound column.
        significant_col: Optional boolean column; truthy rows get a red "*".

    Returns:
        A plotly Figure with one bar per (cluster, model), grouped by cluster.
    """
    has_ci = bool(
        ci_lower_col and ci_upper_col
        and ci_lower_col in df.columns and ci_upper_col in df.columns
    )

    if has_ci:
        # Asymmetric error bars derived from the CI bound columns.
        fig = px.bar(
            df,
            x='cluster',
            y=metric_col,
            color='model',
            error_y=df[ci_upper_col] - df[metric_col],
            error_y_minus=df[metric_col] - df[ci_lower_col],
            title=title,
            barmode='group'
        )
    else:
        fig = px.bar(
            df,
            x='cluster',
            y=metric_col,
            color='model',
            title=title,
            barmode='group'
        )

    # Overlay a red "*" on statistically significant rows.
    if significant_col and significant_col in df.columns:
        for _, record in df.iterrows():
            if record[significant_col]:
                fig.add_annotation(
                    x=record['cluster'],
                    y=record[metric_col],
                    text="*",
                    showarrow=False,
                    font=dict(size=16, color="red"),
                    yshift=10
                )

    fig.update_layout(
        height=500,
        xaxis_title="Cluster",
        yaxis_title=metric_col.replace('_', ' ').title()
    )

    return fig
616
+
lmmvibes/utils/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Utilities package for LMM-Vibes."""
lmmvibes/utils/persistent_storage.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utilities for persistent storage in Hugging Face Spaces.
3
+ """
4
+ import os
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+
9
def get_persistent_data_dir() -> Optional[Path]:
    """Get the persistent data directory if available.

    Returns:
        Path to persistent storage directory if available, None otherwise.
    """
    # /data is only present on Hugging Face Spaces with persistent storage.
    if not os.path.isdir("/data"):
        return None
    app_dir = Path("/data/app_data")
    app_dir.mkdir(exist_ok=True)
    return app_dir
20
+
21
+
22
def get_cache_dir() -> Path:
    """Get the appropriate cache directory (persistent if available, temp otherwise).

    Returns:
        Path to cache directory.
    """
    # Prefer the persistent /data volume when it exists (HF Spaces).
    if os.path.isdir("/data"):
        persistent_cache = Path("/data/.cache")
        persistent_cache.mkdir(exist_ok=True)
        return persistent_cache

    # Fallback to temp directory (note: not created here).
    import tempfile
    return Path(tempfile.gettempdir()) / "app_cache"
36
+
37
+
38
def save_uploaded_file(uploaded_file, filename: str) -> Optional[Path]:
    """Save an uploaded file to persistent storage.

    Args:
        uploaded_file: Gradio uploaded file object
        filename: Name to save the file as

    Returns:
        Path to saved file if successful, None otherwise.
    """
    destination_root = get_persistent_data_dir()
    if not (destination_root and uploaded_file):
        # No persistent volume, or nothing was uploaded.
        return None

    destination = destination_root / filename
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Copy the uploaded file (with metadata) into persistent storage.
    import shutil
    shutil.copy2(uploaded_file, destination)
    return destination
58
+
59
+
60
def is_persistent_storage_available() -> bool:
    """Check if persistent storage is available.

    Returns:
        True if persistent storage is available, False otherwise.
    """
    # Same probe as the other helpers: the /data mount marks persistence.
    return os.path.isdir("/data")
67
+
68
+
69
def get_persistent_results_dir() -> Optional[Path]:
    """Get the persistent results directory for storing pipeline results.

    Returns:
        Path to persistent results directory if available, None otherwise.
    """
    base_dir = get_persistent_data_dir()
    if base_dir is None:
        return None
    results_dir = base_dir / "results"
    results_dir.mkdir(exist_ok=True)
    return results_dir
lmmvibes/vis_gradio/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gradio-based visualization for LMM-Vibes pipeline results.
2
+
3
+ This module provides a Gradio interface for exploring model performance,
4
+ cluster analysis, and detailed examples from pipeline output.
5
+
6
+ Usage:
7
+ from lmmvibes.vis_gradio import launch_app
8
+ launch_app(results_dir="path/to/results")
9
+ """
10
+
11
+ from .app import launch_app, create_app
12
+
13
+ __all__ = ["launch_app", "create_app"]
lmmvibes/vis_gradio/app.py ADDED
@@ -0,0 +1,697 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Main Gradio application for LMM-Vibes pipeline results visualization.
3
+
4
+ This module creates a comprehensive Gradio interface for exploring model performance,
5
+ cluster analysis, and detailed examples from pipeline output.
6
+ """
7
+
8
+ import gradio as gr
9
+ import pandas as pd
10
+ import numpy as np
11
+ import plotly.graph_objects as go
12
+ from pathlib import Path
13
+ from typing import Dict, List, Any, Optional, Tuple
14
+ import os
15
+
16
+ from .data_loader import (
17
+ load_pipeline_results,
18
+ load_property_examples,
19
+ scan_for_result_subfolders,
20
+ validate_results_directory,
21
+ get_available_models
22
+ )
23
+ from .utils import (
24
+ compute_model_rankings,
25
+ create_model_summary_card,
26
+ format_cluster_dataframe,
27
+ create_frequency_comparison_table,
28
+ create_frequency_comparison_plots,
29
+ search_clusters_by_text,
30
+ get_top_clusters_for_model,
31
+ create_interactive_cluster_viewer,
32
+ get_cluster_statistics,
33
+ get_unique_values_for_dropdowns,
34
+ get_example_data,
35
+ format_examples_display,
36
+ get_total_clusters_count
37
+ )
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # NEW: centralised state + logic split into per-tab modules
41
+ # ---------------------------------------------------------------------------
42
+ from .state import app_state, BASE_RESULTS_DIR
43
+
44
+ # Tab-specific logic (moved out of this file)
45
+ from .load_data_tab import (
46
+ load_data,
47
+ get_available_experiments,
48
+ get_experiment_choices,
49
+ refresh_experiment_dropdown,
50
+ load_experiment_data,
51
+ )
52
+ from .overview_tab import create_overview
53
+ from .clusters_tab import view_clusters_interactive, view_clusters_table
54
+ from .examples_tab import (
55
+ get_dropdown_choices,
56
+ update_example_dropdowns,
57
+ view_examples,
58
+ )
59
+ # Frequency and debug remain
60
+ from .frequency_tab import create_frequency_comparison, create_frequency_plots
61
+ from .debug_tab import debug_data_structure
62
+ from .plots_tab import create_plots_tab, create_plot_with_toggle, update_quality_metric_dropdown, update_quality_metric_visibility
63
+
64
+ # app_state and BASE_RESULTS_DIR now come from vis_gradio.state
65
+
66
+
67
def update_top_n_slider_maximum():
    """Recompute the "Top N Clusters" slider bounds from the loaded metrics."""
    from .state import app_state

    metrics = app_state.get("metrics")
    if not metrics:
        # No data loaded yet: return a plain default slider.
        return gr.Slider(minimum=1, maximum=10, value=3, step=1)

    total_clusters = get_total_clusters_count(metrics)
    # Keep at least 10 positions on the slider even for tiny datasets.
    upper = max(10, total_clusters)
    return gr.Slider(
        label="Top N Clusters per Model",
        minimum=1,
        maximum=upper,
        value=min(3, upper),
        step=1,
        info=f"Number of top clusters to show per model (max: {total_clusters})"
    )
85
+
86
+
87
def create_app() -> gr.Blocks:
    """Create the main Gradio application.

    Builds the sidebar (data loading + model selection) and the analysis tabs,
    then wires every event handler. Returns the un-launched Blocks instance.
    """

    # Custom CSS for reduced margins and better sidebar layout
    custom_css = """
    .main-container {
        max-width: 100% !important;
        margin: 0 !important;
        padding: 0 !important;
    }
    .gradio-container {
        max-width: 100% !important;
        margin: 0 !important;
        padding: 0 10px !important;
    }
    .tabs {
        margin: 0 !important;
        padding: 0 !important;
    }
    .tab-nav {
        margin: 0 !important;
        padding: 0 !important;
    }
    .tab-content {
        margin: 0 !important;
        padding: 10px !important;
    }
    .sidebar {
        border-right: 1px solid #e0e0e0;
        background-color: #f8f9fa;
        padding: 15px !important;
    }
    .main-content {
        padding: 10px !important;
    }
    """

    with gr.Blocks(title="LMM-Vibes Pipeline Results Explorer", theme=gr.themes.Soft(), css=custom_css) as app:
        gr.Markdown("""
        **Comprehensive analysis of model behavioral properties and performance**

        Upload your pipeline results directory to explore model performance, cluster analysis, and detailed examples.
        """)

        with gr.Row():
            # Sidebar for data loading and model selection
            with gr.Column(scale=1, min_width=300, elem_classes=["sidebar"]):
                gr.Markdown("### Load Data")
                if BASE_RESULTS_DIR:
                    gr.Markdown(f"**Base Results Directory:** `{BASE_RESULTS_DIR}`")
                    gr.Markdown("**WARNING: this might take a while to load**")
                    gr.Markdown("Select an experiment from the dropdown below to load its results.")
                else:
                    gr.Markdown("Provide the path to your pipeline results directory containing either:")
                    gr.Markdown("• **Legacy format**: `model_stats.json` + `clustered_results.jsonl`")
                    gr.Markdown("• **Functional format**: `model_cluster_scores.json` + `cluster_scores.json` + `model_scores.json` + `clustered_results.jsonl`")
                    gr.Markdown("*The app will automatically detect which format you're using.*")

                if BASE_RESULTS_DIR:
                    experiment_dropdown = gr.Dropdown(
                        label="Select Experiment",
                        choices=get_experiment_choices(),
                        value="Select an experiment...",
                        info="Choose an experiment to load its results"
                    )
                else:
                    results_dir_input = gr.Textbox(
                        label="Results Directory Path",
                        placeholder="/path/to/your/results/directory",
                        info="Directory containing pipeline results (legacy or functional format)"
                    )

                load_btn = gr.Button("Load Data", variant="primary")

                data_status = gr.Markdown("")
                models_info = gr.Markdown("")

                # Model selection (choices are populated after data loads)
                selected_models = gr.CheckboxGroup(
                    label="Select Models for Analysis",
                    choices=[],
                    value=[],
                    info="Choose which models to include in comparisons"
                )

            # Main content area with reduced margins
            with gr.Column(scale=4, elem_classes=["main-content"]):
                with gr.Tabs():
                    # Tab 1: Overview
                    with gr.TabItem("📊 Overview"):
                        with gr.Row():
                            min_cluster_size = gr.Slider(
                                label="Minimum Cluster Size",
                                minimum=1, maximum=50, value=5, step=1,
                                info="Hide clusters with fewer than this many examples"
                            )
                            score_significant_only = gr.Checkbox(
                                label="Show Only Frequency Significant Clusters",
                                value=False,
                                info="Only show clusters where the distinctiveness score is statistically significant"
                            )
                            quality_significant_only = gr.Checkbox(
                                label="Show Only Quality Significant Clusters",
                                value=False,
                                info="Only show clusters where the quality score is statistically significant"
                            )

                        with gr.Row():
                            sort_by = gr.Dropdown(
                                label="Sort Clusters By",
                                choices=[
                                    ("Proportion Delta (Descending)", "salience_desc"),
                                    ("Proportion Delta (Ascending)", "salience_asc"),
                                    ("Quality (Ascending)", "quality_asc"),
                                    ("Quality (Descending)", "quality_desc"),
                                    ("Frequency (Descending)", "frequency_desc"),
                                    ("Frequency (Ascending)", "frequency_asc")
                                ],
                                value="quality_asc",
                                info="How to sort clusters within each model card"
                            )
                            top_n_overview = gr.Slider(
                                label="Top N Clusters per Model",
                                minimum=1, maximum=10, value=3, step=1,
                                info="Number of top clusters to show per model"
                            )

                        overview_display = gr.HTML(label="Model Overview")

                        refresh_overview_btn = gr.Button("Refresh Overview")

                    # Tab 2: View Clusters
                    with gr.TabItem("📋 View Clusters"):
                        gr.Markdown("### Interactive Cluster Viewer")
                        gr.Markdown("Explore clusters with detailed property descriptions. Click on clusters to expand and view all properties within each cluster.")

                        with gr.Row():
                            search_clusters = gr.Textbox(
                                label="Search Clusters",
                                placeholder="Search in cluster descriptions...",
                                info="Search for specific terms in cluster descriptions only"
                            )

                        clusters_display = gr.HTML(
                            label="Interactive Cluster Viewer",
                            value="<p style='color: #666; padding: 20px;'>Load data and select models to view clusters</p>"
                        )

                        refresh_clusters_btn = gr.Button("Refresh Clusters")

                    # Tab 3: View Examples
                    with gr.TabItem("📋 View Examples"):
                        with gr.Row():
                            search_examples = gr.Textbox(
                                label="Search Clusters",
                                placeholder="Search in cluster descriptions...",
                                info="Search for specific terms in cluster descriptions to filter examples"
                            )

                        with gr.Row():
                            with gr.Column(scale=1):
                                example_prompt_dropdown = gr.Dropdown(
                                    label="Select Prompt",
                                    choices=["All Prompts"],
                                    value="All Prompts",
                                    info="Choose a specific prompt or 'All Prompts'"
                                )
                            with gr.Column(scale=1):
                                example_model_dropdown = gr.Dropdown(
                                    label="Select Model",
                                    choices=["All Models"],
                                    value="All Models",
                                    info="Choose a specific model or 'All Models'"
                                )
                            with gr.Column(scale=1):
                                example_property_dropdown = gr.Dropdown(
                                    label="Select Cluster (Optional)",
                                    choices=["All Clusters"],
                                    value="All Clusters",
                                    info="Choose a specific cluster or 'All Clusters'"
                                )

                        with gr.Row():
                            max_examples_slider = gr.Slider(
                                label="Max Examples",
                                minimum=1, maximum=20, value=5, step=1,
                                info="Maximum number of examples to display"
                            )
                            use_accordion_checkbox = gr.Checkbox(
                                label="Use Accordion for System/Info Messages",
                                value=True,
                                info="Group system and info messages in collapsible sections"
                            )
                            pretty_print_checkbox = gr.Checkbox(
                                label="Pretty-print dictionaries",
                                value=True,
                                info="Format embedded dictionaries for readability"
                            )
                            show_unexpected_behavior_checkbox = gr.Checkbox(
                                label="Show Unexpected Behavior Only",
                                value=False,
                                info="Filter to show only examples with unexpected behavior"
                            )
                        view_examples_btn = gr.Button("View Examples", variant="primary")

                        examples_display = gr.HTML(
                            label="Examples",
                            value="<p style='color: #666; padding: 20px;'>Load data and select filters to view examples</p>"
                        )

                    # Tab 4: Functional metrics tables
                    with gr.TabItem("📈 Functional Metrics Tables"):
                        gr.Markdown("View the three tables created by the functional metrics pipeline:")
                        gr.Markdown("• **Model-Cluster Scores**: Per model-cluster combination metrics")
                        gr.Markdown("• **Cluster Scores**: Per cluster metrics (aggregated across all models)")
                        gr.Markdown("• **Model Scores**: Per model metrics (aggregated across all clusters)")

                        frequency_table_info = gr.Markdown("")

                        # Three separate tables for the functional metrics
                        gr.Markdown("### Model-Cluster Scores")
                        gr.Markdown("Per model-cluster combination metrics")
                        model_cluster_table = gr.Dataframe(
                            label="Model-Cluster Scores",
                            interactive=False,
                            wrap=True,
                            max_height=600,
                            elem_classes=["frequency-comparison-table"],
                            show_search="search",
                            pinned_columns=2
                        )

                        gr.Markdown("### Cluster Scores")
                        gr.Markdown("Per cluster metrics (aggregated across all models)")
                        cluster_table = gr.Dataframe(
                            label="Cluster Scores",
                            interactive=False,
                            wrap=True,
                            max_height=600,
                            elem_classes=["frequency-comparison-table"],
                            show_search="search",
                            pinned_columns=2
                        )

                        gr.Markdown("### Model Scores")
                        gr.Markdown("Per model metrics (aggregated across all clusters)")
                        model_table = gr.Dataframe(
                            label="Model Scores",
                            interactive=False,
                            wrap=True,
                            max_height=600,
                            elem_classes=["frequency-comparison-table"],
                            show_search="search"
                        )

                    # Tab 5: Plots
                    with gr.TabItem("📊 Plots"):
                        plot_display, plot_info, show_ci_checkbox, plot_type_dropdown, quality_metric_dropdown = create_plots_tab()

                    # Tab 6: Debug Data
                    with gr.TabItem("🐛 Debug Data"):
                        gr.Markdown("### Data Structure Debug")
                        gr.Markdown("If tables aren't loading correctly, use this tab to inspect your data structure and identify issues.")

                        debug_display = gr.HTML(
                            label="Debug Information",
                            value="<p style='color: #666; padding: 20px;'>Load data to see debug information</p>"
                        )

                        debug_btn = gr.Button("Show Debug Info", variant="secondary")

        # ------------------------------------------------------------------
        # Event handlers
        # ------------------------------------------------------------------
        # Shared input/output lists so every handler stays consistent. The
        # original repeated these lists at each call site, which is how the
        # refresh-clusters wiring bug below crept in.
        example_inputs = [
            example_prompt_dropdown,
            example_model_dropdown,
            example_property_dropdown,
            max_examples_slider,
            use_accordion_checkbox,
            pretty_print_checkbox,
            search_examples,
            show_unexpected_behavior_checkbox,
        ]
        overview_inputs = [selected_models, top_n_overview, score_significant_only,
                           quality_significant_only, sort_by, min_cluster_size]
        # view_clusters_interactive(selected_models, cluster_level, search_term):
        # the cluster level must be supplied explicitly (always "fine" here).
        cluster_inputs = [selected_models, gr.State("fine"), search_clusters]
        plot_inputs = [plot_type_dropdown, quality_metric_dropdown, show_ci_checkbox]
        freq_outputs = [model_cluster_table, cluster_table, model_table, frequency_table_info]

        def _wire_post_load(load_event):
            """Chain the refresh steps that run after data is (re)loaded."""
            (load_event.then(
                fn=update_example_dropdowns,
                outputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown]
            ).then(
                fn=view_examples,
                inputs=example_inputs,
                outputs=[examples_display]
            ).then(
                fn=update_top_n_slider_maximum,
                outputs=[top_n_overview]
            ).then(
                fn=create_frequency_comparison,
                inputs=[selected_models],
                outputs=freq_outputs
            ).then(
                fn=create_plot_with_toggle,
                inputs=plot_inputs,
                outputs=[plot_display, plot_info]
            ).then(
                fn=update_quality_metric_dropdown,
                outputs=[quality_metric_dropdown]
            ))

        if BASE_RESULTS_DIR:
            # Experiment dropdown drives loading (defensive locals() check kept
            # because the component only exists on this branch).
            if 'experiment_dropdown' in locals():
                _wire_post_load(experiment_dropdown.change(
                    fn=load_experiment_data,
                    inputs=[experiment_dropdown],
                    outputs=[data_status, models_info, selected_models]
                ))
        else:
            # Manual path entry drives loading.
            if 'load_btn' in locals() and 'results_dir_input' in locals():
                _wire_post_load(load_btn.click(
                    fn=load_data,
                    inputs=[results_dir_input],
                    outputs=[data_status, models_info, selected_models]
                ))

        refresh_overview_btn.click(
            fn=create_overview,
            inputs=overview_inputs,
            outputs=[overview_display]
        )

        # Bug fix: this handler previously passed only
        # [selected_models, search_clusters], so the search text was bound to
        # view_clusters_interactive's `cluster_level` parameter. Use the same
        # three-element input list as the auto-refresh handlers below.
        refresh_clusters_btn.click(
            fn=view_clusters_interactive,
            inputs=cluster_inputs,
            outputs=[clusters_display]
        )

        # View Examples: explicit button plus auto-refresh on any filter change.
        view_examples_btn.click(
            fn=view_examples,
            inputs=example_inputs,
            outputs=[examples_display]
        )
        for _control in (example_prompt_dropdown, example_model_dropdown,
                         example_property_dropdown, search_examples,
                         show_unexpected_behavior_checkbox):
            _control.change(
                fn=view_examples,
                inputs=example_inputs,
                outputs=[examples_display]
            )

        # Functional metrics tables refresh with the model selection.
        selected_models.change(fn=create_frequency_comparison, inputs=[selected_models], outputs=freq_outputs)

        debug_btn.click(
            fn=debug_data_structure,
            outputs=[debug_display]
        )

        # Plots tab: redraw on any toggle; plot type also updates which
        # quality-metric options are visible before redrawing.
        show_ci_checkbox.change(
            fn=create_plot_with_toggle,
            inputs=plot_inputs,
            outputs=[plot_display, plot_info]
        )
        quality_metric_dropdown.change(
            fn=create_plot_with_toggle,
            inputs=plot_inputs,
            outputs=[plot_display, plot_info]
        )
        plot_type_dropdown.change(
            fn=update_quality_metric_visibility,
            inputs=[plot_type_dropdown],
            outputs=[quality_metric_dropdown]
        ).then(
            fn=create_plot_with_toggle,
            inputs=plot_inputs,
            outputs=[plot_display, plot_info]
        )

        # Overview auto-refresh: any relevant control change redraws the cards.
        for _control in (selected_models, score_significant_only, quality_significant_only,
                         sort_by, top_n_overview, min_cluster_size):
            _control.change(
                fn=create_overview,
                inputs=overview_inputs,
                outputs=[overview_display]
            )

        # Cluster viewer auto-refresh on model selection or search change.
        for _control in (selected_models, search_clusters):
            _control.change(
                fn=view_clusters_interactive,
                inputs=cluster_inputs,
                outputs=[clusters_display]
            )

    return app
596
+
597
+
598
def launch_app(results_dir: Optional[str] = None,
               share: bool = False,
               server_name: str = "127.0.0.1",
               server_port: int = 7860,
               **kwargs) -> None:
    """Launch the Gradio application.

    Args:
        results_dir: Optional path to base results directory containing experiment subfolders
        share: Whether to create a public link
        server_name: Server address
        server_port: Server port
        **kwargs: Additional arguments for gr.Blocks.launch()

    Side effects: mutates the module-level BASE_RESULTS_DIR and, when exactly
    one experiment is found, pre-populates the shared app_state dict before
    blocking in app.launch().
    """
    global BASE_RESULTS_DIR
    # NOTE(review): BASE_RESULTS_DIR was imported with `from .state import ...`,
    # so this rebinds only app.py's copy of the name; code reading
    # state.BASE_RESULTS_DIR directly will not see the update — confirm intended.

    # Set the global base results directory
    if results_dir:
        BASE_RESULTS_DIR = results_dir
        print(f"📁 Base results directory set to: {results_dir}")

        # Check if it's a valid directory
        if not os.path.exists(results_dir):
            print(f"⚠️ Warning: Base results directory does not exist: {results_dir}")
            BASE_RESULTS_DIR = None
        else:
            # Scan for available experiments
            experiments = get_available_experiments(results_dir)
            print(f"🔍 Found {len(experiments)} experiments: {experiments}")

    app = create_app()

    # Auto-load data if results_dir is provided and contains a single experiment
    # (re-scans the directory; could reuse `experiments` from above)
    if results_dir and os.path.exists(results_dir):
        experiments = get_available_experiments(results_dir)
        if len(experiments) == 1:
            # Auto-load the single experiment
            experiment_path = os.path.join(results_dir, experiments[0])
            try:
                clustered_df, model_stats, model_cluster_df, results_path = load_pipeline_results(experiment_path)
                app_state['clustered_df'] = clustered_df
                app_state['model_stats'] = model_stats
                app_state['model_cluster_df'] = model_cluster_df
                app_state['results_path'] = results_path
                app_state['available_models'] = get_available_models(model_stats)
                app_state['current_results_dir'] = experiment_path
                print(f"✅ Auto-loaded data from: {experiment_path}")
            except Exception as e:
                # Best-effort: a failed auto-load leaves the app usable via the UI.
                print(f"❌ Failed to auto-load data: {e}")
        elif len(experiments) > 1:
            print(f"📋 Multiple experiments found. Please select one from the dropdown.")

    print(f"🚀 Launching Gradio app on {server_name}:{server_port}")
    print(f"Share mode: {share}")
    print(f"🔧 Additional kwargs: {kwargs}")

    try:
        app.launch(
            share=share,
            server_name=server_name,
            server_port=server_port,
            show_error=True,  # Show detailed error messages
            quiet=False,  # Show more verbose output
            **kwargs
        )
    except Exception as e:
        print(f"❌ Failed to launch on port {server_port}: {e}")
        print("🔄 Trying alternative port configuration...")

        # Try with a port range instead of port 0
        try:
            # Try ports in a reasonable range
            for alt_port in [8080, 8081, 8082, 8083, 8084, 8085, 8086, 8087, 8088, 8089]:
                try:
                    print(f"🔄 Trying port {alt_port}...")
                    app.launch(
                        share=share,
                        server_name=server_name,
                        server_port=alt_port,
                        show_error=True,
                        quiet=False,
                        **kwargs
                    )
                    break  # If successful, break out of the loop
                except Exception as port_error:
                    # Only a busy-port error moves on to the next candidate;
                    # anything else is re-raised immediately.
                    if "Cannot find empty port" in str(port_error):
                        print(f"   Port {alt_port} is busy, trying next...")
                        continue
                    else:
                        raise port_error
            else:
                # If we get here, all ports in our range were busy
                raise Exception("All attempted ports (8080-8089) are busy")

        except Exception as e2:
            print(f"❌ Failed to launch with alternative ports: {e2}")
            print("💡 Try specifying a different port manually:")
            print(f"   python -m lmmvibes.vis_gradio.launcher --port 9000")
            print(f"   python -m lmmvibes.vis_gradio.launcher --auto_port")
            raise e2
lmmvibes/vis_gradio/clusters_tab.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Helpers for the **View Clusters** tab – both the interactive HTML and
2
+ fallback dataframe view."""
3
+ from typing import List
4
+
5
+ import pandas as pd
6
+
7
+ from .state import app_state
8
+ from .utils import (
9
+ search_clusters_by_text,
10
+ search_clusters_only,
11
+ create_interactive_cluster_viewer,
12
+ get_cluster_statistics,
13
+ format_cluster_dataframe,
14
+ )
15
+
16
+ __all__ = ["view_clusters_interactive", "view_clusters_table"]
17
+
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # Interactive HTML view
21
+ # ---------------------------------------------------------------------------
22
+
23
def view_clusters_interactive(
    selected_models: List[str],
    cluster_level: str,
    search_term: str = "",
) -> str:
    """Build the interactive HTML cluster viewer for the "View Clusters" tab.

    Args:
        selected_models: Models to include in the viewer and statistics.
        cluster_level: "fine" or "coarse" clustering granularity.
        search_term: Optional substring filter over cluster descriptions.

    Returns:
        A self-contained HTML string: a stats banner, optional filter chips,
        then the expandable cluster viewer — or an error paragraph when no
        data is loaded / no clusters are available.
    """
    if app_state["clustered_df"] is None:
        return (
            "<p style='color: #e74c3c; padding: 20px;'>❌ Please load data first "
            "using the 'Load Data' tab</p>"
        )

    # Rows without a property description cannot be clustered meaningfully.
    df = app_state["clustered_df"].dropna(subset=["property_description"]).copy()

    # Apply search filter first
    if search_term and search_term.strip():
        df = search_clusters_only(df, search_term.strip(), cluster_level)

    # Build interactive viewer
    cluster_html = create_interactive_cluster_viewer(df, selected_models, cluster_level)

    # Statistics summary at the top
    stats = get_cluster_statistics(df, selected_models)
    if not stats:
        return (
            "<p style='color: #e74c3c; padding: 20px;'>❌ No cluster data available</p>"
        )

    # Get additional metrics from cluster_scores
    cluster_scores = app_state.get("metrics", {}).get("cluster_scores", {})

    # Calculate average quality scores and frequency
    total_frequency = 0  # NOTE(review): accumulated below but never used — dead code?
    quality_scores_list = []
    metric_names = set()

    for cluster_name, cluster_data in cluster_scores.items():
        total_frequency += cluster_data.get("proportion", 0) * 100
        quality_scores = cluster_data.get("quality", {})
        if quality_scores:
            # Flatten every per-metric quality value into one pool for averaging.
            quality_scores_list.extend(quality_scores.values())
            metric_names.update(quality_scores.keys())

    avg_quality = sum(quality_scores_list) / len(quality_scores_list) if quality_scores_list else 0
    metrics_suffix = f" ({', '.join(sorted(metric_names))})" if metric_names else ""

    # Gradient banner with headline statistics.
    stats_html = f"""
    <div style="
        background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
        color: white;
        padding: 20px;
        border-radius: 8px;
        margin-bottom: 20px;
        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
    ">
        <h3 style="margin: 0 0 15px 0;">Cluster Statistics</h3>
        <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
            <div>
                <div style="font-size: 24px; font-weight: bold;">{stats['total_properties']:,}</div>
                <div style="opacity: 0.9;">Total Properties</div>
            </div>
            <div>
                <div style="font-size: 24px; font-weight: bold;">{stats['total_models']}</div>
                <div style="opacity: 0.9;">Models</div>
            </div>
            <div>
                <div style="font-size: 24px; font-weight: bold;">{avg_quality:.3f}</div>
                <div style="opacity: 0.9;">Avg Quality{metrics_suffix}</div>
            </div>
    """

    # Level-specific counters; keys are only present when that level exists.
    if cluster_level == "fine" and "fine_clusters" in stats:
        stats_html += f"""
            <div>
                <div style="font-size: 24px; font-weight: bold;">{stats['fine_clusters']}</div>
                <div style="opacity: 0.9;">Fine Clusters</div>
            </div>
            <div>
                <div style="font-size: 24px; font-weight: bold;">{stats['avg_properties_per_fine_cluster']:.1f}</div>
                <div style="opacity: 0.9;">Avg Properties/Cluster</div>
            </div>
        """
    elif cluster_level == "coarse" and "coarse_clusters" in stats:
        stats_html += f"""
            <div>
                <div style="font-size: 24px; font-weight: bold;">{stats['coarse_clusters']}</div>
                <div style="opacity: 0.9;">Coarse Clusters</div>
            </div>
            <div>
                <div style="font-size: 24px; font-weight: bold;">{stats['avg_properties_per_coarse_cluster']:.1f}</div>
                <div style="opacity: 0.9;">Avg Properties/Cluster</div>
            </div>
        """

    stats_html += """
        </div>
    </div>
    """

    # Add a note if coarse clusters were requested but not available
    if cluster_level == "coarse" and "coarse_clusters" not in stats and "fine_clusters" in stats:
        stats_html += """
        <div style="
            background: #fff3cd;
            border-left: 4px solid #ffc107;
            padding: 10px 15px;
            margin-bottom: 15px;
            border-radius: 4px;
        ">
            ⚠️ <strong>Note:</strong> Coarse clusters not available in this dataset. Showing fine clusters instead.
        </div>
        """

    # Additional filter chips
    filter_info = ""
    if search_term and search_term.strip():
        filter_info += f"""
        <div style="
            background: #e3f2fd;
            border-left: 4px solid #2196f3;
            padding: 10px 15px;
            margin-bottom: 15px;
            border-radius: 4px;
        ">
            🔍 <strong>Search Filter:</strong> "{search_term}"
        </div>
        """

    if selected_models:
        filter_info += f"""
        <div style="
            background: #f3e5f5;
            border-left: 4px solid #9c27b0;
            padding: 10px 15px;
            margin-bottom: 15px;
            border-radius: 4px;
        ">
            🎯 <strong>Selected Models:</strong> {', '.join(selected_models)}
        </div>
        """

    return stats_html + filter_info + cluster_html
164
+
165
+
166
+ # ---------------------------------------------------------------------------
167
+ # Dataframe fallback view
168
+ # ---------------------------------------------------------------------------
169
+
170
def view_clusters_table(
    selected_models: List[str],
    cluster_level: str,
    search_term: str = "",
) -> pd.DataFrame:
    """Tabular fallback view of the clusters.

    Mirrors ``view_clusters_interactive`` but returns a plain DataFrame;
    every error/empty case is reported as a one-row "Message" frame.
    """
    def _message(text: str) -> pd.DataFrame:
        # Uniform shape for all informational / error results.
        return pd.DataFrame({"Message": [text]})

    source = app_state["clustered_df"]
    if source is None:
        return _message("Please load data first using the 'Load Data' tab")

    df = source.copy()
    query = search_term.strip() if search_term else ""
    if query:
        df = search_clusters_only(df, query, cluster_level)

    formatted_df = format_cluster_dataframe(df, selected_models, cluster_level)
    if not formatted_df.empty:
        return formatted_df

    # Empty result: explain the most likely cause.
    if query:
        return _message(f"No results found for search term '{search_term}'. Try a different search term.")
    if selected_models:
        available_models = df["model"].unique().tolist() if "model" in df.columns else []
        return _message(
            f"No data found for selected models: {', '.join(selected_models)}. "
            f"Available models: {', '.join(available_models)}"
        )
    return _message("No data available. Please check your data files and try reloading.")
lmmvibes/vis_gradio/conversation_display.py ADDED
@@ -0,0 +1,507 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ """Conversation display helpers for vis_gradio.
4
+
5
+ This module encapsulates everything related to:
6
+ β€’ safely parsing model responses (lists / dicts / JSON strings)
7
+ β€’ pretty-printing embedded dictionaries for readability
8
+ β€’ converting multiple conversation formats to the OpenAI chat list format
9
+ β€’ rendering that list as HTML (including accordion grouping + raw JSON viewer).
10
+
11
+ Moving this logic out of utils.py keeps the latter lean and focussed on general
12
+ analytics utilities.
13
+ """
14
+
15
+ from typing import List, Dict, Any
16
+ import ast
17
+ import json
18
+ import html
19
+ import markdown
20
+ import re
21
+
22
+ __all__: List[str] = [
23
+ "convert_to_openai_format",
24
+ "display_openai_conversation_html",
25
+ "pretty_print_embedded_dicts",
26
+ ]
27
+
28
+ # ---------------------------------------------------------------------------
29
+ # Pretty-printing helpers
30
+ # ---------------------------------------------------------------------------
31
+
32
+ def _find_balanced_spans(text: str):
33
+ """Return (start, end) spans of balanced {...} or [...] regions in *text*."""
34
+ spans, stack = [], []
35
+ for i, ch in enumerate(text):
36
+ if ch in "{[":
37
+ stack.append((ch, i))
38
+ elif ch in "]}" and stack:
39
+ opener, start = stack.pop()
40
+ if (opener, ch) in {("{", "}"), ("[", "]")} and not stack:
41
+ spans.append((start, i + 1))
42
+ return spans
43
+
44
+
45
+ def _try_parse_slice(slice_: str):
46
+ """Attempt to parse *slice_* into a Python object; return None on failure."""
47
+ try:
48
+ return ast.literal_eval(slice_)
49
+ except Exception:
50
+ try:
51
+ return json.loads(slice_)
52
+ except Exception:
53
+ return None
54
+
55
+
56
+ def _find_code_spans(text: str) -> List[tuple]:
57
+ """Return spans for markdown code regions to be preserved as-is.
58
+
59
+ Includes:
60
+ - fenced code blocks delimited by ``` ... ```
61
+ - inline code segments delimited by `...`
62
+ """
63
+ spans: List[tuple] = []
64
+
65
+ # Fenced blocks ``` ... ``` (language spec allowed after opening fence)
66
+ idx = 0
67
+ while True:
68
+ start = text.find("```", idx)
69
+ if start == -1:
70
+ break
71
+ # Find the end fence
72
+ end = text.find("```", start + 3)
73
+ if end == -1:
74
+ # Unclosed fence: treat rest of string as code
75
+ spans.append((start, len(text)))
76
+ break
77
+ spans.append((start, end + 3))
78
+ idx = end + 3
79
+
80
+ # Inline code `...`
81
+ for m in re.finditer(r"`[^`]*`", text, flags=re.DOTALL):
82
+ spans.append((m.start(), m.end()))
83
+
84
+ # Sort and merge overlapping spans
85
+ spans.sort()
86
+ merged: List[tuple] = []
87
+ for s, e in spans:
88
+ if not merged or s > merged[-1][1]:
89
+ merged.append((s, e))
90
+ else:
91
+ merged[-1] = (merged[-1][0], max(merged[-1][1], e))
92
+ return merged
93
+
94
+
95
+ def _is_inside_any_span(start: int, end: int, spans: List[tuple]) -> bool:
96
+ for s, e in spans:
97
+ if start >= s and end <= e:
98
+ return True
99
+ return False
100
+
101
+
102
def pretty_print_embedded_dicts(text: str) -> str:
    """Replace dicts or list-of-dicts with a ``<pre>`` block, except inside code.

    Dict-like regions inside markdown code spans (inline backticks or fenced
    blocks) are left untouched so code examples render verbatim.  Text outside
    the replaced regions is HTML-escaped (quotes preserved).
    """
    if not text:
        return text

    protected = _find_code_spans(text)

    pieces: List[str] = []
    cursor = 0
    for begin, finish in _find_balanced_spans(text):
        obj = _try_parse_slice(text[begin:finish])
        looks_like_data = isinstance(obj, dict) or (
            isinstance(obj, list) and bool(obj) and all(isinstance(item, dict) for item in obj)
        )
        if not looks_like_data or _is_inside_any_span(begin, finish, protected):
            # Not a renderable dict region, or it lives inside code: skip.
            continue
        pieces.append(html.escape(text[cursor:begin], quote=False))
        rendered = json.dumps(obj, indent=2, ensure_ascii=False)
        pieces.append(
            f"<pre style='background:#f8f9fa;padding:10px;border-radius:4px;overflow-x:auto;'>{rendered}</pre>"
        )
        cursor = finish
    pieces.append(html.escape(text[cursor:], quote=False))
    return "".join(pieces)
129
+
130
+ # ---------------------------------------------------------------------------
131
+ # Format conversion
132
+ # ---------------------------------------------------------------------------
133
+
134
def convert_to_openai_format(response_data: Any):
    """Normalise *response_data* into an OpenAI-style chat message list.

    Lists pass through untouched.  Strings are parsed first as Python
    literals (tolerates single quotes), then as JSON; a string that does not
    decode to a list becomes a single assistant message.  Any other type is
    stringified into an assistant message.
    """
    if isinstance(response_data, list):
        return response_data

    if isinstance(response_data, str):
        for parse, errors in (
            (ast.literal_eval, (ValueError, SyntaxError)),
            (json.loads, (json.JSONDecodeError,)),
        ):
            try:
                candidate = parse(response_data)
            except errors:
                continue
            if isinstance(candidate, list):
                return candidate
        # Fallback plain-text assistant message
        return [{"role": "assistant", "content": response_data}]

    # Fallback for any other type
    return [{"role": "assistant", "content": str(response_data)}]
157
+
158
+ # ---------------------------------------------------------------------------
159
+ # HTML rendering
160
+ # ---------------------------------------------------------------------------
161
+
162
def _markdown(text: str, *, pretty_print_dicts: bool = True) -> str:
    """Render *text* as markdown HTML for embedding in the Gradio UI.

    When *pretty_print_dicts* is true, embedded dict / list-of-dict literals
    are first replaced with formatted ``<pre>`` blocks via
    :func:`pretty_print_embedded_dicts`; otherwise the raw text is
    HTML-escaped (quotes preserved) before markdown conversion.
    """
    processed = pretty_print_embedded_dicts(text) if pretty_print_dicts else html.escape(text, quote=False)

    # Configure extensions for proper code block handling
    extensions = ["fenced_code"]
    extension_configs = {}

    # Syntax highlighting is opt-in: codehilite is only enabled when the
    # optional Pygments package is importable.
    try:
        import pygments
        extensions.append("codehilite")
        extension_configs['codehilite'] = {
            'css_class': 'highlight',
            'use_pygments': True,
            'guess_lang': True,
            'linenums': False
        }
    except ImportError:
        pass

    # Convert newlines to <br> only outside of code blocks
    # Process fenced code blocks first, then handle line breaks
    result = markdown.markdown(processed, extensions=extensions, extension_configs=extension_configs)

    # Add line breaks for non-code content (simple approach)
    # This replaces single newlines with <br> but preserves code blocks
    import re  # NOTE: shadows the module-level import; kept for byte-compat

    # Split by code blocks to avoid affecting them.  re.split with a single
    # capturing group alternates non-code / code parts, so even indices are
    # always outside <pre>/<code>.
    code_block_pattern = r'(<pre[^>]*>.*?</pre>|<code[^>]*>.*?</code>)'
    parts = re.split(code_block_pattern, result, flags=re.DOTALL)

    for i in range(0, len(parts), 2):  # Process non-code parts only
        if i < len(parts):
            # Replace single newlines with <br>, but not double newlines (paragraphs)
            parts[i] = re.sub(r'(?<!\n)\n(?!\n)', '<br>\n', parts[i])

    return ''.join(parts)
200
+
201
+
202
def display_openai_conversation_html(conversation_data: List[Dict[str, Any]], *, use_accordion: bool = True, pretty_print_dicts: bool = True) -> str:
    """Convert an OpenAI-style conversation list into styled HTML for Gradio.

    Args:
        conversation_data: List of message dicts with ``role`` / ``content``
            keys.  Non-dict entries are silently skipped.  A ``content`` dict
            containing a ``"text"`` key is unwrapped to that text.
        use_accordion: When True, system and info messages are grouped into
            collapsible ``<details>`` sections; all other roles render inline.
        pretty_print_dicts: When True, dict/list-of-dict content renders as an
            indented JSON ``<pre>`` block; otherwise as a one-line ``<code>``.

    Returns:
        A self-contained HTML fragment: a ``<style>`` block followed by a
        collapsed raw-JSON viewer and the formatted messages.
    """

    if not conversation_data:
        return "<p>No conversation data available</p>"

    # Collapsed raw JSON section for debugging
    raw_json = json.dumps(conversation_data, indent=2, ensure_ascii=False)
    html_out = f"""
    <details style="margin: 8px 0;">
        <summary style="cursor: pointer; font-weight: 600;">
            Click to see raw response ({len(conversation_data)})
        </summary>
        <div style="padding: 8px 15px;">
            <pre style="white-space: pre-wrap; word-wrap: break-word; background: #f8f9fa; padding: 10px; border-radius: 4px; overflow-x: auto;">{raw_json}</pre>
        </div>
    </details>
    """

    # Accent colour per role; unknown roles fall back to grey (see below).
    role_colors = {
        "system": "#ff6b6b",
        "info": "#4ecdc4",
        "assistant": "#45b7d1",
        "tool": "#96ceb4",
        "user": "#feca57",
    }

    def _format_msg(role: str, content: Any) -> str:
        # Render one message as a colour-accented card.
        if isinstance(content, dict) or (isinstance(content, list) and content and all(isinstance(d, dict) for d in content)):
            if pretty_print_dicts:
                content_html = (
                    f"<pre style='background: #f8f9fa; padding: 10px; border-radius: 4px; overflow-x: auto;'>{json.dumps(content, indent=2, ensure_ascii=False)}</pre>"
                )
            else:
                content_html = f"<code>{html.escape(json.dumps(content, ensure_ascii=False))}</code>"
        elif isinstance(content, str):
            content_html = _markdown(content, pretty_print_dicts=pretty_print_dicts)
        elif content is None:
            content_html = "<em>(No content)</em>"
        else:
            content_html = str(content)
        color = role_colors.get(role.lower(), "#95a5a6")
        return (
            f"<div style='border-left: 4px solid {color}; margin: 8px 0; background-color: #ffffff; padding: 12px; border-radius: 0 8px 8px 0;'>"
            f"<div style='font-weight: 600; color: {color}; margin-bottom: 8px; text-transform: capitalize; font-size: 14px;'>{role}</div>"
            f"<div style='color: #333; line-height: 1.6; font-family: \"Segoe UI\", Tahoma, Geneva, Verdana, sans-serif;'>{content_html}</div>"
            "</div>"
        )

    if use_accordion:
        # Bucket messages: system and info get collapsed, the rest stay inline.
        system_msgs, info_msgs, other_msgs = [], [], []
        for m in conversation_data:
            if not isinstance(m, dict):
                continue
            role = m.get("role", "unknown").lower()
            content = m.get("content", "")
            if isinstance(content, dict) and "text" in content:
                content = content["text"]
            if role == "system":
                system_msgs.append((role, content))
            elif role == "info":
                info_msgs.append((role, content))
            else:
                other_msgs.append((role, content))

        def _accordion(title: str, items: List):
            # Wrap *items* in a <details> block; empty buckets emit nothing.
            if not items:
                return ""
            inner = "".join(_format_msg(r, c) for r, c in items)
            return (
                f"<details style='margin: 8px 0;'>"
                f"<summary style='cursor: pointer; font-weight: 600;'>"
                f"{html.escape(title)} ({len(items)})"  # e.g. "Click to see system messages (3)"
                f"</summary>"
                f"<div style='padding: 8px 15px;'>{inner}</div>"
                "</details>"
            )

        html_out += _accordion("Click to see system messages", system_msgs)
        html_out += _accordion("Click to see info messages", info_msgs)
        for r, c in other_msgs:
            html_out += _format_msg(r, c)
    else:
        # No accordion: just render everything
        for m in conversation_data:
            if not isinstance(m, dict):
                continue
            role = m.get("role", "unknown").lower()
            content = m.get("content", "")
            if isinstance(content, dict) and "text" in content:
                content = content["text"]
            html_out += _format_msg(role, content)

    # CSS for proper code block styling and summary hover effects
    css_styles = """
    <style>
    :root {
        /* Code block color palette - GitHub Light inspired */
        --code-bg: #f6f8fa;
        --code-text: #24292f;
        --code-comment: #6a737d;
        --code-keyword: #d73a49;
        --code-string: #032f62;
        --code-number: #005cc5;
        --code-operator: #24292f;
        --code-function: #6f42c1;
        --code-border: #d0d7de;

        /* Inline code colors - same light theme */
        --inline-code-bg: #f3f4f6;
        --inline-code-text: #24292f;
        --inline-code-border: #d1d5db;

        /* Code block structure */
        --code-border-radius: 8px;
        --code-padding: 16px;
        --code-font-size: 14px;
        --code-line-height: 1.5;
        --code-font-family: 'JetBrains Mono', 'Fira Code', 'Cascadia Code', 'SF Mono', Consolas, 'Liberation Mono', Menlo, Courier, monospace;
    }

    /* Base code styling */
    pre, code {
        font-family: var(--code-font-family) !important;
        font-size: var(--code-font-size) !important;
        line-height: var(--code-line-height) !important;
        font-variant-ligatures: normal !important;
        -webkit-font-smoothing: antialiased !important;
        -moz-osx-font-smoothing: grayscale !important;
    }

    /* Fenced code blocks - light theme */
    .highlight, .codehilite, pre.highlight, pre.codehilite,
    .language-python, .language-text, .language-bash {
        background: var(--code-bg) !important;
        color: var(--code-text) !important;
        border: 1px solid var(--code-border) !important;
        border-radius: var(--code-border-radius) !important;
        padding: var(--code-padding) !important;
        margin: 12px 0 !important;
        overflow-x: auto !important;
        box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05) !important;
        position: relative !important;
        white-space: pre !important;
        display: block !important;
    }

    .highlight pre, .codehilite pre {
        background: transparent !important;
        color: inherit !important;
        margin: 0 !important;
        padding: 0 !important;
        border: none !important;
        border-radius: 0 !important;
        overflow: visible !important;
        white-space: pre !important;
        display: block !important;
    }

    /* Ensure code blocks preserve formatting */
    .highlight code, .codehilite code {
        white-space: pre !important;
        display: block !important;
        padding: 0 !important;
        margin: 0 !important;
        background: transparent !important;
        border: none !important;
        font-size: inherit !important;
        line-height: inherit !important;
    }

    /* Add language label for fenced blocks */
    .highlight::before, .codehilite::before {
        content: 'python';
        position: absolute;
        top: 8px;
        right: 12px;
        background: rgba(0, 0, 0, 0.05);
        color: #586069;
        padding: 2px 8px;
        border-radius: 4px;
        font-size: 11px;
        font-weight: 500;
        text-transform: uppercase;
        letter-spacing: 0.5px;
    }

    /* Syntax highlighting for Python - Light theme */
    .highlight .k, .codehilite .k, /* keywords */
    .highlight .kn, .codehilite .kn, /* keyword.namespace */
    .highlight .kp, .codehilite .kp, /* keyword.pseudo */
    .highlight .kr, .codehilite .kr, /* keyword.reserved */
    .highlight .kt, .codehilite .kt /* keyword.type */
    {
        color: var(--code-keyword) !important;
        font-weight: 600 !important;
    }

    .highlight .s, .codehilite .s, /* strings */
    .highlight .s1, .codehilite .s1, /* string.single */
    .highlight .s2, .codehilite .s2, /* string.double */
    .highlight .se, .codehilite .se /* string.escape */
    {
        color: var(--code-string) !important;
    }

    .highlight .c, .codehilite .c, /* comments */
    .highlight .c1, .codehilite .c1, /* comment.single */
    .highlight .cm, .codehilite .cm /* comment.multiline */
    {
        color: var(--code-comment) !important;
        font-style: italic !important;
    }

    .highlight .m, .codehilite .m, /* numbers */
    .highlight .mi, .codehilite .mi, /* number.integer */
    .highlight .mf, .codehilite .mf, /* number.float */
    .highlight .mo, .codehilite .mo /* number.octal */
    {
        color: var(--code-number) !important;
        font-weight: 600 !important;
    }

    .highlight .nf, .codehilite .nf, /* function names */
    .highlight .fm, .codehilite .fm /* function.magic */
    {
        color: var(--code-function) !important;
        font-weight: 600 !important;
    }

    .highlight .o, .codehilite .o, /* operators */
    .highlight .ow, .codehilite .ow /* operator.word */
    {
        color: var(--code-operator) !important;
    }

    /* Inline code - light theme */
    p code, li code, div code, span code,
    h1 code, h2 code, h3 code, h4 code, h5 code, h6 code {
        background: var(--inline-code-bg) !important;
        color: var(--inline-code-text) !important;
        border: 1px solid var(--inline-code-border) !important;
        padding: 2px 6px !important;
        border-radius: 4px !important;
        font-size: 0.9em !important;
        font-weight: 600 !important;
        white-space: nowrap !important;
        box-shadow: none !important;
        display: inline !important;
    }

    /* Code blocks inside paragraphs should not be treated as inline */
    p pre, li pre, div pre {
        background: var(--code-bg) !important;
        color: var(--code-text) !important;
        border: 1px solid var(--code-border) !important;
        border-radius: var(--code-border-radius) !important;
        padding: var(--code-padding) !important;
        margin: 8px 0 !important;
        white-space: pre !important;
        overflow-x: auto !important;
        display: block !important;
    }

    /* Scrollbar styling for code blocks - light theme */
    .highlight::-webkit-scrollbar, .codehilite::-webkit-scrollbar,
    pre::-webkit-scrollbar {
        height: 8px !important;
        background: #f1f3f4 !important;
        border-radius: 4px !important;
    }

    .highlight::-webkit-scrollbar-thumb, .codehilite::-webkit-scrollbar-thumb,
    pre::-webkit-scrollbar-thumb {
        background: #c1c8cd !important;
        border-radius: 4px !important;
    }

    .highlight::-webkit-scrollbar-thumb:hover, .codehilite::-webkit-scrollbar-thumb:hover,
    pre::-webkit-scrollbar-thumb:hover {
        background: #a8b3ba !important;
    }
    """

    if use_accordion:
        # Extra rules to suppress the native disclosure marker / hover chrome
        # on the <details> accordions created above.
        css_styles += """
        /* Accordion styling */
        details > summary {
            list-style: none !important;
            cursor: pointer !important;
        }
        details > summary:hover {
            background-color: transparent !important;
            box-shadow: none !important;
            transform: none !important;
        }
        details > summary::-webkit-details-marker,
        details > summary::marker {
            display: none !important;
        }
        """

    css_styles += "</style>"
    # Prepend the stylesheet so the fragment is self-contained.
    html_out = css_styles + html_out

    return html_out
lmmvibes/vis_gradio/data_loader.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data loading functionality for the LMM-Vibes Gradio app.
3
+
4
+ This module handles loading pipeline results and converting them to formats
5
+ suitable for the Gradio interface.
6
+ """
7
+
8
+ import json
9
+ import pandas as pd
10
+ from pathlib import Path
11
+ from typing import Dict, List, Any, Tuple, Optional
12
+ import os
13
+
14
+ from .state import app_state
15
+ from lmmvibes.metrics.plotting import create_model_cluster_dataframe
16
+
17
+
18
class DataCache:
    """Tiny in-process, class-level cache so repeated loads are free.

    Values are shared across the whole process; there is no eviction policy,
    only an explicit :meth:`clear`.
    """
    _cache = {}  # shared key -> value store

    @classmethod
    def get(cls, key: str):
        """Return the cached value for *key*, or ``None`` when absent."""
        try:
            return cls._cache[key]
        except KeyError:
            return None

    @classmethod
    def set(cls, key: str, value: Any):
        """Store *value* under *key*, overwriting any previous entry."""
        cls._cache[key] = value

    @classmethod
    def clear(cls):
        """Drop every cached entry."""
        cls._cache.clear()
33
+
34
+
35
def scan_for_result_subfolders(base_dir: str) -> List[str]:
    """Names of immediate subfolders of *base_dir* that hold pipeline results.

    A subfolder qualifies only when it contains all four FunctionalMetrics
    output files.  A missing *base_dir* yields an empty list.
    """
    root = Path(base_dir)
    if not root.exists():
        return []

    # Every valid results folder must contain all of these files.
    expected = (
        "model_cluster_scores.json",
        "cluster_scores.json",
        "model_scores.json",
        "clustered_results_lightweight.jsonl",
    )

    return [
        entry.name
        for entry in root.iterdir()
        if entry.is_dir() and all((entry / name).exists() for name in expected)
    ]
56
+
57
+
58
def validate_results_directory(results_dir: str) -> Tuple[bool, str]:
    """Check that *results_dir* looks like a pipeline output folder.

    Returns ``(ok, error_message)``; the message is empty when validation
    passes, otherwise it names the problem (missing path, not a directory,
    or the list of missing required files).
    """
    root = Path(results_dir)

    if not root.exists():
        return False, f"Directory does not exist: {results_dir}"
    if not root.is_dir():
        return False, f"Path is not a directory: {results_dir}"

    # FunctionalMetrics score files, then the clustered conversations dump
    # (order preserved so the error message lists them consistently).
    expected = [
        "model_cluster_scores.json",
        "cluster_scores.json",
        "model_scores.json",
        "clustered_results_lightweight.jsonl",
    ]
    missing = [name for name in expected if not (root / name).exists()]

    if missing:
        return False, f"Missing required files: {', '.join(missing)}"
    return True, ""
88
+
89
+
90
def get_available_models(metrics: Dict[str, Any]) -> List[str]:
    """Model names present in *metrics* (keys of ``model_cluster_scores``)."""
    return list(metrics.get("model_cluster_scores", {}))
94
+
95
+
96
def get_all_models(metrics: Dict[str, Any]) -> List[str]:
    """Get all available models from metrics data.

    Thin alias for :func:`get_available_models`, kept so callers have a
    uniformly named entry point.
    """
    return get_available_models(metrics)
99
+
100
+
101
def load_pipeline_results(results_dir: str) -> Tuple[pd.DataFrame, Dict[str, Any], pd.DataFrame, Path]:
    """Load pipeline outputs (FunctionalMetrics format only).

    Results are memoised in :class:`DataCache` keyed by *results_dir*.

    Returns:
        clustered_df: DataFrame of per-conversation data loaded from clustered_results.jsonl
        metrics: Dict containing the three FunctionalMetrics score dictionaries
        model_cluster_df: DataFrame created from model_cluster_scores for plotting/analysis
        results_path: Path to the results directory

    Raises:
        FileNotFoundError: if the directory or any required file is missing.
        ValueError: if the clustered results JSONL cannot be parsed.
    """
    cache_key = f"pipeline_results_{results_dir}"
    cached = DataCache.get(cache_key)
    # Truthiness check is safe here: a cached value is always a non-empty
    # 4-tuple, never an empty container.
    if cached:
        return cached

    results_path = Path(results_dir)
    if not results_path.exists():
        raise FileNotFoundError(f"Results directory does not exist: {results_dir}")

    # ------------------------------------------------------------------
    # 1. Load FunctionalMetrics score files (must ALL be present)
    # ------------------------------------------------------------------
    required_files = [
        "model_cluster_scores.json",
        "cluster_scores.json",
        "model_scores.json",
    ]
    missing = [f for f in required_files if not (results_path / f).exists()]
    if missing:
        raise FileNotFoundError(
            f"Missing required metrics files in {results_dir}: {', '.join(missing)}"
        )

    with open(results_path / "model_cluster_scores.json") as f:
        model_cluster_scores = json.load(f)
    with open(results_path / "cluster_scores.json") as f:
        cluster_scores = json.load(f)
    with open(results_path / "model_scores.json") as f:
        model_scores = json.load(f)

    metrics = {
        "model_cluster_scores": model_cluster_scores,
        "cluster_scores": cluster_scores,
        "model_scores": model_scores,
    }

    # ------------------------------------------------------------------
    # 2. Load clustered conversation data (JSON-Lines)
    # ------------------------------------------------------------------
    clustered_path = results_path / "clustered_results_lightweight.jsonl"
    if not clustered_path.exists():
        raise FileNotFoundError(f"clustered_results_lightweight.jsonl not found in {results_dir}")

    try:
        clustered_df = pd.read_json(clustered_path, lines=True)
    except Exception as e:
        raise ValueError(f"Could not load clustered results: {e}")

    # ------------------------------------------------------------------
    # 3. Create model_cluster_df from metrics for plotting/analysis
    # ------------------------------------------------------------------
    model_cluster_df = create_model_cluster_dataframe(model_cluster_scores)

    result = (clustered_df, metrics, model_cluster_df, results_path)
    DataCache.set(cache_key, result)
    return result
165
+
166
+
167
def load_property_examples(results_path: Path, property_ids: List[str]) -> pd.DataFrame:
    """Fetch clustered-result rows whose ``id`` appears in *property_ids*.

    Results are memoised per (path, sorted ids) pair.  Raises
    FileNotFoundError when the clustered results file is absent and
    ValueError when it cannot be read or filtered.
    """
    if not property_ids:
        return pd.DataFrame()

    # Sorted tuple so the cache key is order-insensitive.
    cache_key = f"examples_{results_path}_{hash(tuple(sorted(property_ids)))}"
    cached = DataCache.get(cache_key)
    if cached is not None:
        return cached

    # Load full dataset to get prompt/response details
    data_file = results_path / "clustered_results_lightweight.jsonl"
    if not data_file.exists():
        raise FileNotFoundError("Could not load example data - clustered_results_lightweight.jsonl not found")

    try:
        all_rows = pd.read_json(data_file, lines=True)
        matches = all_rows[all_rows['id'].isin(property_ids)]
        DataCache.set(cache_key, matches)
        return matches
    except Exception as e:
        raise ValueError(f"Failed to load examples: {e}")
lmmvibes/vis_gradio/debug_tab.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Logic for the **Debug Data** tab."""
2
+ from __future__ import annotations
3
+
4
+ from .state import app_state
5
+
6
+ __all__ = ["debug_data_structure"]
7
+
8
+
9
def debug_data_structure() -> str:
    """Render an HTML debug report about the loaded clustered DataFrame.

    Shows row/column counts, whether fine/coarse cluster columns exist
    (under either naming convention), the full column list with dtypes, and
    the first few rows as an HTML table.  Returns an error snippet when no
    data has been loaded yet.
    """
    if app_state["clustered_df"] is None:
        return "<p style='color: #e74c3c;'>❌ No data loaded</p>"

    df = app_state["clustered_df"]

    n_rows = len(df)
    n_cols = len(df.columns)

    # Check for both naming patterns
    has_fine_clusters = ("property_description_fine_cluster_id" in df.columns or
                         "fine_cluster_id" in df.columns)
    has_coarse_clusters = ("property_description_coarse_cluster_id" in df.columns or
                           "coarse_cluster_id" in df.columns)

    # Cap the preview at three rows (fewer if the frame is smaller).
    sample_rows = min(3, len(df))
    sample_data = df.head(sample_rows).to_html(
        escape=False,
        classes="table table-striped",
        table_id="debug-table",
    )

    html = f"""
    <div style="max-width: 1200px; margin: 0 auto;">
        <h3>🐛 Data Structure Debug Info</h3>

        <div style="background: #f8f9fa; padding: 15px; border-radius: 8px; margin: 15px 0;">
            <h4>Basic Statistics</h4>
            <ul>
                <li><strong>Rows:</strong> {n_rows:,}</li>
                <li><strong>Columns:</strong> {n_cols}</li>
                <li><strong>Fine Clusters Available:</strong> {'✅ Yes' if has_fine_clusters else '❌ No'}</li>
                <li><strong>Coarse Clusters Available:</strong> {'✅ Yes' if has_coarse_clusters else '❌ No'}</li>
            </ul>
        </div>

        <div style="background: #f8f9fa; padding: 15px; border-radius: 8px; margin: 15px 0;">
            <h4>Available Columns</h4>
            <div style="max-height: 200px; overflow-y: auto; background: white; padding: 10px; border-radius: 4px;">
                <ul>
    """

    # One <li> per column; unique counts only make sense for object dtype.
    for col in sorted(df.columns):
        unique_values = df[col].nunique() if df[col].dtype == "object" else "N/A"
        html += f"<li><code>{col}</code> - {df[col].dtype} (unique values: {unique_values})</li>"

    # Close the column list, append the sample table and the table CSS
    # (braces doubled because this is an f-string).
    html += f"""
                </ul>
            </div>
        </div>

        <div style="background: #f8f9fa; padding: 15px; border-radius: 8px; margin: 15px 0;">
            <h4>Sample Data (First {sample_rows} rows)</h4>
            <div style="max-height: 400px; overflow: auto; background: white; padding: 10px; border-radius: 4px;">
                {sample_data}
            </div>
        </div>
    </div>

    <style>
    #debug-table {{
        font-size: 12px;
        width: 100%;
    }}
    #debug-table th, #debug-table td {{
        padding: 4px 8px;
        border: 1px solid #ddd;
    }}
    #debug-table th {{
        background: #f1f1f1;
    }}
    </style>
    """

    return html
lmmvibes/vis_gradio/demo.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Demo script showing different ways to use the LMM-Vibes Gradio visualization.
3
+
4
+ This demonstrates the Python API for launching the Gradio app.
5
+ """
6
+
7
+ import argparse
8
+ from pathlib import Path
9
+ from lmmvibes.vis_gradio import launch_app, create_app
10
+
11
+
12
def demo_basic_launch():
    """Demo: start the UI empty; users load results through the Load Data tab."""
    print("🚀 Demo: Basic launch - data can be loaded through the UI")
    launch_app()
16
+
17
+
18
def demo_preload_data(results_dir: str):
    """Demo: start the app with *results_dir* already loaded."""
    banner = f"🚀 Demo: Launch with pre-loaded data from {results_dir}"
    print(banner)
    launch_app(results_dir=results_dir)
22
+
23
+
24
def demo_custom_settings(results_dir: str = None):
    """Demo: launch with a public link, LAN binding, and a custom port."""
    print("🚀 Demo: Launch with custom settings")
    options = {
        "results_dir": results_dir,
        "share": True,             # create public shareable link
        "server_name": "0.0.0.0",  # allow access from other machines
        "server_port": 8080,       # custom port
    }
    launch_app(**options)
33
+
34
+
35
def demo_programmatic_access():
    """Demo: build the app object first, then launch it explicitly.

    Useful when the caller wants to tweak the app (e.g. its title) before
    serving it.
    """
    print("🚀 Demo: Programmatic app creation")

    # Build without launching; customisations would go here, e.g.:
    # demo_app.title = "My Custom Title"
    demo_app = create_app()

    print("Launching app...")
    demo_app.launch(share=False, server_port=7861)
48
+
49
+
50
def main():
    """CLI entry point: parse arguments and dispatch to one demo flavour."""
    parser = argparse.ArgumentParser(description="LMM-Vibes Gradio Visualization Demo")
    parser.add_argument("--results_dir", help="Path to results directory for demos")
    parser.add_argument(
        "--demo",
        choices=["basic", "preload", "custom", "programmatic"],
        default="basic",
        help="Which demo to run",
    )
    args = parser.parse_args()

    if args.demo == "basic":
        demo_basic_launch()
    elif args.demo == "preload":
        # Preloading is the only demo that hard-requires a results directory.
        if not args.results_dir:
            print("❌ Error: --results_dir required for preload demo")
            return
        demo_preload_data(args.results_dir)
    elif args.demo == "custom":
        demo_custom_settings(args.results_dir)
    else:  # "programmatic" (argparse restricts choices)
        demo_programmatic_access()
70
+
71
+
72
+ if __name__ == "__main__":
73
+ main()
lmmvibes/vis_gradio/examples_tab.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Logic for the **View Examples** tab – dropdown population + example renderer."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Any, List, Tuple
5
+
6
+ import gradio as gr
7
+
8
+ from .state import app_state
9
+ from .utils import (
10
+ get_unique_values_for_dropdowns,
11
+ get_example_data,
12
+ format_examples_display,
13
+ search_clusters_only,
14
+ )
15
+
16
+ __all__: List[str] = [
17
+ "get_dropdown_choices",
18
+ "update_example_dropdowns",
19
+ "view_examples",
20
+ "get_filter_options",
21
+ "update_filter_dropdowns",
22
+ ]
23
+
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Dropdown helpers
27
+ # ---------------------------------------------------------------------------
28
+
29
def get_dropdown_choices() -> Tuple[List[str], List[str], List[str]]:
    """Return (prompt, model, cluster) choices for the example dropdowns.

    Each list is prefixed with its "All …" catch-all entry; all three are
    empty when no data has been loaded yet.
    """
    df = app_state["clustered_df"]
    if df is None:
        return [], [], []

    unique = get_unique_values_for_dropdowns(df)
    return (
        ["All Prompts"] + unique["prompts"],
        ["All Models"] + unique["models"],
        ["All Clusters"] + unique["properties"],
    )
38
+
39
+
40
def update_example_dropdowns() -> Tuple[Any, Any, Any]:
    """Build ``gr.update`` payloads refreshing the three example dropdowns."""
    prompts, models, clusters = get_dropdown_choices()

    def _refresh(choices: List[str], default: str) -> Any:
        # Pre-select the catch-all entry whenever there are any choices.
        return gr.update(choices=choices, value=default if choices else None)

    return (
        _refresh(prompts, "All Prompts"),
        _refresh(models, "All Models"),
        _refresh(clusters, "All Clusters"),
    )
47
+
48
+
49
+ # ---------------------------------------------------------------------------
50
+ # Example viewer
51
+ # ---------------------------------------------------------------------------
52
+
53
def view_examples(
    selected_prompt: str,
    selected_model: str,
    selected_property: str,
    max_examples: int = 5,
    use_accordion: bool = True,
    pretty_print_dicts: bool = True,
    search_term: str = "",
    show_unexpected_behavior: bool = False,
) -> str:
    """Render the HTML for the View Examples tab.

    Args:
        selected_prompt / selected_model / selected_property: Dropdown
            selections; the "All …" sentinel values mean "no filter".
        max_examples: Maximum number of examples passed to the fetcher.
        use_accordion: Forwarded to the display formatter.
        pretty_print_dicts: Forwarded to the display formatter.
        search_term: Optional free-text cluster search applied before
            the dropdown filters.
        show_unexpected_behavior: Forwarded to the example fetcher.

    Returns:
        An HTML string (error banners are also returned as HTML).
    """
    if app_state["clustered_df"] is None:
        return (
            "<p style='color: #e74c3c; padding: 20px;'>❌ Please load data first "
            "using the 'Load Data' tab</p>"
        )

    # Apply search filter first if search term is provided
    df = app_state["clustered_df"]
    if search_term and isinstance(search_term, str) and search_term.strip():
        df = search_clusters_only(df, search_term.strip(), 'fine')  # Default to fine clusters
        if df.empty:
            return f"<p style='color: #e74c3c; padding: 20px;'>❌ No clusters found matching '{search_term}'</p>"

    # "All …" sentinels are translated to None (= no filter) for the fetcher.
    examples = get_example_data(
        df,
        selected_prompt if selected_prompt != "All Prompts" else None,
        selected_model if selected_model != "All Models" else None,
        selected_property if selected_property != "All Clusters" else None,
        max_examples,
        show_unexpected_behavior=show_unexpected_behavior,
        # Randomize only when nothing at all is filtered, so filtered views
        # stay deterministic.
        randomize=(
            (selected_prompt == "All Prompts") and
            (selected_model == "All Models") and
            (selected_property == "All Clusters") and
            (not search_term or not str(search_term).strip())
        ),
    )

    return format_examples_display(
        examples,
        selected_prompt,
        selected_model,
        selected_property,
        use_accordion=use_accordion,
        pretty_print_dicts=pretty_print_dicts,
    )
99
+
100
+
101
+ # ---------------------------------------------------------------------------
102
+ # Filter dropdown helpers for frequency comparison
103
+ # ---------------------------------------------------------------------------
104
+
105
def get_filter_options() -> Tuple[List[str], List[str]]:
    """Return (model choices, quality-metric choices) for the filter dropdowns.

    Falls back to just the "All …" sentinels when no stats are loaded.
    """
    stats = app_state["model_stats"]
    if not stats:
        return ["All Models"], ["All Metrics"]

    metric_names = set()
    for model_data in stats.values():
        # Walk both cluster levels; entries may carry a per-metric
        # quality_score dict whose keys are the metric names.
        for cluster in model_data.get("fine", []) + model_data.get("coarse", []):
            score = cluster.get("quality_score", {})
            if isinstance(score, dict):
                metric_names |= set(score.keys())

    return (
        ["All Models"] + list(stats.keys()),
        ["All Metrics"] + sorted(metric_names),
    )
122
+
123
+
124
def update_filter_dropdowns() -> Tuple[Any, Any]:
    """Build ``gr.update`` payloads refreshing the model/metric filters."""
    model_choices, metric_choices = get_filter_options()
    model_default = "All Models" if model_choices else None
    metric_default = "All Metrics" if metric_choices else None
    return (
        gr.update(choices=model_choices, value=model_default),
        gr.update(choices=metric_choices, value=metric_default),
    )
lmmvibes/vis_gradio/frequency_tab.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Logic for the **Frequency Comparison** tab."""
2
+ from typing import List, Tuple, Dict, Any
3
+
4
+ import pandas as pd
5
+
6
+ from .state import app_state
7
+
8
+
9
+ # ---------------------------------------------------------------------------
10
+ # NOTE: app_state currently stores metrics under the legacy key 'model_stats'.
11
+ # During later cleanup this module will switch to 'metrics'. For now we treat
12
+ # the value as already being the new FunctionalMetrics dict.
13
+ # ---------------------------------------------------------------------------
14
+
15
+ __all__ = ["create_frequency_comparison", "create_frequency_plots"]
16
+
17
+
18
def create_frequency_comparison(
    selected_models: List[str],
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
    """Create frequency comparison tables for the 3 functional metrics tables.

    Returns:
        (model_cluster_df, cluster_df, model_df, info_text) where the info
        text is a Markdown row-count summary.  On missing data or an empty
        selection, all three frames carry a single "Message" column.
    """
    if not app_state["model_stats"]:
        empty_df = pd.DataFrame({"Message": ["Please load data first"]})
        return empty_df, empty_df, empty_df, ""

    if not selected_models:
        empty_df = pd.DataFrame({"Message": ["Please select at least one model"]})
        return empty_df, empty_df, empty_df, ""

    # Get the functional metrics data
    # NOTE(review): 'model_stats' is a legacy alias; it already holds the
    # FunctionalMetrics dict (see module-level note above).
    metrics_data = app_state["model_stats"]

    # Debug: Print data structure info
    print(f"DEBUG: Creating frequency comparison tables")
    print(f" - Selected models: {selected_models}")
    print(f" - Available keys in metrics_data: {list(metrics_data.keys())}")

    if "model_cluster_scores" in metrics_data:
        model_cluster_scores = metrics_data["model_cluster_scores"]
        print(f" - Model cluster scores keys: {list(model_cluster_scores.keys())}")
        for model in selected_models:
            if model in model_cluster_scores:
                clusters = model_cluster_scores[model]
                print(f" - {model}: {len(clusters)} clusters")
            else:
                print(f" - {model}: NOT FOUND in model_cluster_scores")

    if "cluster_scores" in metrics_data:
        cluster_scores = metrics_data["cluster_scores"]
        print(f" - Cluster scores: {len(cluster_scores)} clusters")

    if "model_scores" in metrics_data:
        model_scores = metrics_data["model_scores"]
        print(f" - Model scores: {list(model_scores.keys())}")

    # Create the three tables
    model_cluster_df = create_model_cluster_table(metrics_data, selected_models)
    cluster_df = create_cluster_table(metrics_data, selected_models)
    model_df = create_model_table(metrics_data, selected_models)

    print(f" - Created tables with rows: Model-Cluster={len(model_cluster_df)}, Cluster={len(cluster_df)}, Model={len(model_df)}")

    info_text = f"**Model-Cluster Scores:** {len(model_cluster_df)} rows | **Cluster Scores:** {len(cluster_df)} rows | **Model Scores:** {len(model_df)} rows"
    return model_cluster_df, cluster_df, model_df, info_text
65
+
66
+
67
def create_model_cluster_table(metrics_data: Dict[str, Any], selected_models: List[str]) -> pd.DataFrame:
    """Create table for model-cluster scores.

    One row per (model, cluster) pair for models in *selected_models*,
    with size, proportion (+delta) percentages, per-metric quality
    columns, confidence intervals and significance flags.
    """
    model_cluster_scores = metrics_data.get("model_cluster_scores", {})

    print(f"DEBUG: Creating model-cluster table")
    print(f" - Available models in model_cluster_scores: {list(model_cluster_scores.keys())}")
    print(f" - Selected models: {selected_models}")

    rows = []
    for model_name, clusters in model_cluster_scores.items():
        if model_name not in selected_models:
            print(f" - Skipping {model_name} (not in selected_models)")
            continue

        print(f" - Processing {model_name} with {len(clusters)} clusters")
        for cluster_name, metrics in clusters.items():
            # Filter out "No properties" clusters
            if cluster_name == "No properties":
                continue

            # Basic metrics
            size = metrics.get("size", 0)
            proportion = metrics.get("proportion", 0) * 100  # Convert to percentage
            proportion_delta = metrics.get("proportion_delta", 0) * 100  # Convert to percentage

            # Quality metrics - show each metric separately
            quality = metrics.get("quality", {})
            quality_delta = metrics.get("quality_delta", {})

            # Create base row
            row = {
                "Model": model_name,
                "Cluster": cluster_name,
                "Size": size,
                "Proportion (%)": f"{proportion:.1f}",
                "Proportion Delta (%)": f"{proportion_delta:.1f}",
                # "Examples": len(metrics.get("examples", []))
            }

            # Add quality metrics for each individual metric
            for metric_name, quality_val in quality.items():
                row[f"Quality_{metric_name.title()}"] = f"{quality_val:.3f}"

            for metric_name, delta_val in quality_delta.items():
                row[f"Quality_Delta_{metric_name.title()}"] = f"{delta_val:+.3f}"

            # Confidence intervals
            proportion_ci = metrics.get("proportion_ci", {})
            proportion_delta_ci = metrics.get("proportion_delta_ci", {})

            # Significance flags
            proportion_delta_significant = metrics.get("proportion_delta_significant", False)
            quality_delta_significant = metrics.get("quality_delta_significant", {})

            # Format confidence intervals
            proportion_ci_str = format_ci(proportion_ci)
            proportion_delta_ci_str = format_ci(proportion_delta_ci)

            # Add confidence intervals and significance
            row.update({
                "Proportion CI": proportion_ci_str,
                "Proportion Delta CI": proportion_delta_ci_str,
                "Proportion Delta Significant": "Yes" if proportion_delta_significant else "No",
            })

            # Add quality delta significance for each metric
            for metric_name, is_significant in quality_delta_significant.items():
                row[f"Quality_Delta_{metric_name.title()}_Significant"] = "Yes" if is_significant else "No"

            rows.append(row)

    print(f" - Created {len(rows)} rows for model-cluster table")
    return pd.DataFrame(rows)
140
+
141
+
142
def create_cluster_table(metrics_data: Dict[str, Any], selected_models: List[str]) -> pd.DataFrame:
    """Create table for cluster scores (aggregated across all models).

    NOTE(review): *selected_models* is accepted for signature parity with
    the sibling table builders but is not used here — cluster scores are
    already aggregated across models.
    """
    cluster_scores = metrics_data.get("cluster_scores", {})

    print(f"DEBUG: Creating cluster table")
    print(f" - Available clusters: {list(cluster_scores.keys())}")
    print(f" - Number of clusters: {len(cluster_scores)}")

    rows = []
    for cluster_name, metrics in cluster_scores.items():
        # Filter out "No properties" clusters
        if cluster_name == "No properties":
            continue

        # Basic metrics
        size = metrics.get("size", 0)
        proportion = metrics.get("proportion", 0) * 100  # Convert to percentage

        # Quality metrics - show each metric separately
        quality = metrics.get("quality", {})
        quality_delta = metrics.get("quality_delta", {})

        # Create base row
        row = {
            "Cluster": cluster_name,
            "Size": size,
            "Proportion (%)": f"{proportion:.1f}",
            # "Examples": len(metrics.get("examples", []))
        }

        # Add quality metrics for each individual metric
        for metric_name, quality_val in quality.items():
            row[f"Quality_{metric_name.title()}"] = f"{quality_val:.3f}"

        for metric_name, delta_val in quality_delta.items():
            row[f"Quality_Delta_{metric_name.title()}"] = f"{delta_val:+.3f}"

        # Confidence intervals
        proportion_ci = metrics.get("proportion_ci", {})
        quality_ci = metrics.get("quality_ci", {})
        quality_delta_ci = metrics.get("quality_delta_ci", {})

        # Significance flags
        quality_delta_significant = metrics.get("quality_delta_significant", {})

        # Format confidence intervals
        # NOTE(review): quality_ci_str / quality_delta_ci_str are computed
        # but never used — per-metric CIs are formatted individually below.
        proportion_ci_str = format_ci(proportion_ci)
        quality_ci_str = format_ci(quality_ci)
        quality_delta_ci_str = format_ci(quality_delta_ci)

        # Add confidence intervals and significance
        row.update({
            "Proportion CI": proportion_ci_str,
        })

        # Add quality CI and significance for each metric
        for metric_name in quality.keys():
            if metric_name in quality_ci:
                ci = quality_ci[metric_name]
                row[f"Quality_{metric_name.title()}_CI"] = format_ci(ci)

        for metric_name in quality_delta.keys():
            if metric_name in quality_delta_ci:
                ci = quality_delta_ci[metric_name]
                row[f"Quality_Delta_{metric_name.title()}_CI"] = format_ci(ci)
                row[f"Quality_Delta_{metric_name.title()}_Significant"] = "Yes" if quality_delta_significant.get(metric_name, False) else "No"

        rows.append(row)

    print(f" - Created {len(rows)} rows for cluster table")
    return pd.DataFrame(rows)
213
+
214
+
215
def create_model_table(metrics_data: Dict[str, Any], selected_models: List[str]) -> pd.DataFrame:
    """Create table for model scores (aggregated across all clusters).

    Args:
        metrics_data: FunctionalMetrics dict; only the ``model_scores``
            section is read.
        selected_models: Models to include; all others are skipped.

    Returns:
        A DataFrame with one row per selected model: size, one
        ``Quality_<Metric>`` column per quality metric, the proportion CI
        and one ``Quality_<Metric>_CI`` column per metric with a CI.
    """
    model_scores = metrics_data.get("model_scores", {})

    print(f"DEBUG: Creating model table")
    print(f" - Available models in model_scores: {list(model_scores.keys())}")
    print(f" - Selected models: {selected_models}")

    rows = []
    for model_name, metrics in model_scores.items():
        # Filter by selected models
        if model_name not in selected_models:
            print(f" - Skipping {model_name} (not in selected_models)")
            continue

        print(f" - Processing {model_name}")
        # Quality metrics - one column per individual metric
        quality = metrics.get("quality", {})

        row = {
            "Model": model_name,
            "Size": metrics.get("size", 0),
        }

        for metric_name, quality_val in quality.items():
            row[f"Quality_{metric_name.title()}"] = f"{quality_val:.3f}"

        # Confidence intervals (proportion first, then per-metric quality)
        row["Proportion CI"] = format_ci(metrics.get("proportion_ci", {}))

        quality_ci = metrics.get("quality_ci", {})
        for metric_name in quality.keys():
            if metric_name in quality_ci:
                row[f"Quality_{metric_name.title()}_CI"] = format_ci(quality_ci[metric_name])

        rows.append(row)

    print(f" - Created {len(rows)} rows for model table")
    return pd.DataFrame(rows)
286
+
287
+
288
def format_ci(ci_dict: Dict[str, Any]) -> str:
    """Render a confidence-interval dict as a short display string.

    Prefers ``[lower, upper]`` when both bounds are present, falls back
    to ``Mean: x`` when only a mean is available, otherwise ``N/A``.
    """
    if not isinstance(ci_dict, dict) or not ci_dict:
        return "N/A"

    lower, upper = ci_dict.get("lower"), ci_dict.get("upper")
    if lower is not None and upper is not None:
        return f"[{lower:.3f}, {upper:.3f}]"

    mean = ci_dict.get("mean")
    return f"Mean: {mean:.3f}" if mean is not None else "N/A"
303
+
304
+
305
def create_frequency_plots(*_args, **_kwargs):
    """Stub kept for backward compatibility – plotting was removed.

    Accepts and ignores any arguments; always yields an empty pair so
    existing callers expecting two figures keep working.
    """
    return (None, None)
lmmvibes/vis_gradio/launcher.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ CLI launcher for LMM-Vibes Gradio visualization app.
4
+
5
+ Usage:
6
+ python -m lmmvibes.vis_gradio.launcher --results_dir path/to/results
7
+
8
+ Or directly:
9
+ python lmmvibes/vis_gradio/launcher.py --results_dir path/to/results
10
+ """
11
+
12
+ import argparse
13
+ import sys
14
+ from pathlib import Path
15
+
16
def main():
    """Parse launcher CLI flags, validate inputs, and start the Gradio app."""
    parser = argparse.ArgumentParser(
        description="Launch LMM-Vibes Gradio visualization app",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Launch with auto-loaded data from a base results directory
python -m lmmvibes.vis_gradio.launcher --results_dir /path/to/results

# Launch with public sharing enabled
python -m lmmvibes.vis_gradio.launcher --results_dir /path/to/results --share

# Launch on specific port
python -m lmmvibes.vis_gradio.launcher --results_dir /path/to/results --port 8080

# Launch with automatic port selection
python -m lmmvibes.vis_gradio.launcher --results_dir /path/to/results --auto_port

# Launch without auto-loading (manual selection in app)
python -m lmmvibes.vis_gradio.launcher
"""
    )

    parser.add_argument(
        "--results_dir",
        type=str,
        help="Path to base results directory containing experiment subfolders (optional - can be loaded in the app)"
    )

    parser.add_argument(
        "--share",
        action="store_true",
        help="Create a public shareable link"
    )

    parser.add_argument(
        "--server_name",
        type=str,
        default="127.0.0.1",
        help="Server address (default: 127.0.0.1)"
    )

    parser.add_argument(
        "--port",
        type=int,
        default=7860,
        help="Server port (default: 7860). Use --auto_port to automatically find an available port."
    )

    parser.add_argument(
        "--auto_port",
        action="store_true",
        help="Automatically find an available port by trying ports 8080-8089"
    )

    parser.add_argument(
        "--debug",
        action="store_true",
        help="Enable debug mode"
    )

    args = parser.parse_args()

    # Handle auto_port option
    # NOTE(review): only the starting port is set here; the actual
    # 8080-8089 scan is presumably performed inside launch_app — confirm.
    if args.auto_port:
        # Use a high port range for auto-port mode
        args.port = 8080
        print("πŸ” Auto-port mode enabled - will try ports 8080-8089")

    # Validate results directory if provided
    if args.results_dir:
        results_path = Path(args.results_dir)
        if not results_path.exists():
            print(f"❌ Error: Results directory does not exist: {args.results_dir}")
            sys.exit(1)
        if not results_path.is_dir():
            print(f"❌ Error: Path is not a directory: {args.results_dir}")
            sys.exit(1)

    # Import and launch the app
    # The import is kept inside the try so a missing gradio dependency
    # produces the friendly hint below instead of a raw traceback.
    try:
        from .app import launch_app

        print("πŸš€ Launching LMM-Vibes Gradio Visualization App...")
        print(f"🌐 Server: http://{args.server_name}:{args.port}")
        if args.share:
            print("πŸ”— Public sharing enabled")

        launch_app(
            results_dir=args.results_dir,
            share=args.share,
            server_name=args.server_name,
            server_port=args.port,
            debug=args.debug
        )

    except ImportError as e:
        print(f"❌ Error: Failed to import required modules: {e}")
        print("πŸ’‘ Make sure you have gradio installed: pip install gradio")
        sys.exit(1)
    except Exception as e:
        print(f"❌ Error launching app: {e}")
        sys.exit(1)
+
120
+
121
+ if __name__ == "__main__":
122
+ main()
lmmvibes/vis_gradio/load_data_tab.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utilities for the "Load Data" tab – loading pipeline results and scanning for
3
+ available experiment folders.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import os
8
+ from pathlib import Path
9
+ from typing import List, Tuple
10
+
11
+ import gradio as gr
12
+
13
+ # ---------------------------------------------------------------------------
14
+ # Loading utilities updated for FunctionalMetrics
15
+ # ---------------------------------------------------------------------------
16
+
17
+ from .state import app_state, BASE_RESULTS_DIR
18
+ from .data_loader import (
19
+ load_pipeline_results,
20
+ scan_for_result_subfolders,
21
+ validate_results_directory,
22
+ )
23
+
24
+ # Metrics helpers
25
+ from .metrics_adapter import get_all_models
26
+
27
+ __all__ = [
28
+ "load_data",
29
+ "get_available_experiments",
30
+ "get_experiment_choices",
31
+ "refresh_experiment_dropdown",
32
+ "load_experiment_data",
33
+ ]
34
+
35
+
36
def load_data(results_dir: str) -> Tuple[str, str, str]:
    """Load pipeline results from *results_dir* and update the shared *app_state*.

    Returns a tuple of (summary_markdown, models_info_markdown, models_checkbox_update).
    NOTE(review): the third element is actually a ``gr.update`` object on
    success/error paths, not a plain str — confirm annotation downstream.
    """
    try:
        # 1. Validate directory structure
        is_valid, error_msg = validate_results_directory(results_dir)
        if not is_valid:
            return "", f"❌ Error: {error_msg}", ""

        # 2. Handle optional sub-folder selection (first match for now)
        subfolders = scan_for_result_subfolders(results_dir)
        final_dir = results_dir
        if subfolders and "." not in subfolders:
            final_dir = str(Path(results_dir) / subfolders[0])

        # 3. Load results into memory
        clustered_df, metrics, model_cluster_df, results_path = load_pipeline_results(final_dir)

        # 4. Stash in global state so other tabs can use it
        app_state["clustered_df"] = clustered_df
        app_state["metrics"] = metrics
        app_state["model_cluster_df"] = model_cluster_df
        # Temporary alias for legacy modules
        app_state["model_stats"] = metrics
        app_state["results_path"] = results_path
        app_state["available_models"] = get_all_models(metrics)
        app_state["current_results_dir"] = final_dir

        # 5. Compose status messages
        n_models = len(metrics.get("model_cluster_scores", {}))
        n_properties = len(clustered_df)

        summary = f"""
βœ… **Successfully loaded pipeline results!**

**Data Summary:**
- **Models:** {n_models}
- **Properties:** {n_properties:,}
- **Results Directory:** {Path(final_dir).name}
"""
        # Check for both naming patterns for fine clusters
        if ("fine_cluster_id" in clustered_df.columns or
            "property_description_fine_cluster_id" in clustered_df.columns):
            fine_id_col = ("fine_cluster_id" if "fine_cluster_id" in clustered_df.columns
                           else "property_description_fine_cluster_id")
            n_fine_clusters = clustered_df[fine_id_col].nunique()
            summary += f"\n- **Fine Clusters:** {n_fine_clusters}"

        # Check for both naming patterns for coarse clusters
        if ("coarse_cluster_id" in clustered_df.columns or
            "property_description_coarse_cluster_id" in clustered_df.columns):
            coarse_id_col = ("coarse_cluster_id" if "coarse_cluster_id" in clustered_df.columns
                             else "property_description_coarse_cluster_id")
            n_coarse_clusters = clustered_df[coarse_id_col].nunique()
            summary += f"\n- **Coarse Clusters:** {n_coarse_clusters}"

        model_choices = app_state["available_models"]
        models_info = f"Available models: {', '.join(model_choices)}"

        # Gradio update object for the CheckboxGroup
        return summary, models_info, gr.update(choices=model_choices, value=model_choices)

    except Exception as e:
        error_msg = f"❌ Error loading results: {e}"
        return "", error_msg, gr.update(choices=[], value=[])
+
104
+
105
+ def get_available_experiments(base_dir: str) -> List[str]:
106
+ """Return experiment sub-directories that contain the expected result files."""
107
+ if not base_dir or not os.path.exists(base_dir):
108
+ return []
109
+
110
+ experiments: List[str] = []
111
+ try:
112
+ for item in os.listdir(base_dir):
113
+ item_path = os.path.join(base_dir, item)
114
+ if os.path.isdir(item_path):
115
+ if (
116
+ os.path.exists(os.path.join(item_path, "model_stats.json"))
117
+ or os.path.exists(os.path.join(item_path, "clustered_results_lightweight.jsonl"))
118
+ ):
119
+ experiments.append(item)
120
+ except Exception as e:
121
+ print(f"Error scanning experiments: {e}")
122
+
123
+ return sorted(experiments)
124
+
125
+
126
def get_experiment_choices() -> List[str]:
    """Dropdown choices: a placeholder entry followed by experiment names."""
    if not BASE_RESULTS_DIR:
        return []
    return ["Select an experiment..."] + get_available_experiments(BASE_RESULTS_DIR)
132
+
133
+
134
def refresh_experiment_dropdown():
    """Gradio helper to refresh the experiment dropdown choices.

    Returns:
        A ``gr.update`` payload (a dict) resetting the dropdown to the
        placeholder entry.  The previous ``-> gr.update`` annotation named
        a function rather than a type, so it was removed.
    """
    choices = get_experiment_choices()
    return gr.update(choices=choices, value="Select an experiment...")
138
+
139
+
140
def load_experiment_data(experiment_name: str):
    """Wrapper used by Gradio events to load a *selected* experiment.

    Returns the same (summary_markdown, info_markdown, checkbox_update)
    triple as ``load_data``.  The third element is a ``gr.update`` object,
    so the previous ``Tuple[str, str, str]`` annotation was incorrect and
    has been removed.
    """
    if not BASE_RESULTS_DIR or experiment_name == "Select an experiment...":
        return "", "Please select a valid experiment", gr.update(choices=[], value=[])

    experiment_path = os.path.join(BASE_RESULTS_DIR, experiment_name)
    print(f"πŸ” Loading experiment: {experiment_name} from {experiment_path}")
    return load_data(experiment_path)
lmmvibes/vis_gradio/metrics_adapter.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Lightweight access helpers for FunctionalMetrics score dictionaries.
2
+
3
+ The Gradio UI now receives the *raw* FunctionalMetrics output as a
4
+ ```
5
+ metrics = {
6
+ "model_cluster_scores": {...},
7
+ "cluster_scores": {...},
8
+ "model_scores": {...},
9
+ }
10
+ ```
11
+ This module centralises the most common look-ups so that the rest of the
12
+ codebase does *not* need to know the exact key names. If the format
13
+ changes again we only need to update these helpers.
14
+ """
15
+ from typing import Dict, Any, List
16
+
17
+ __all__ = [
18
+ "get_model_clusters",
19
+ "get_all_models",
20
+ "get_all_clusters",
21
+ ]
22
+
23
def get_model_clusters(metrics: Dict[str, Any], model_name: str) -> Dict[str, Any]:
    """Return the per-cluster score dict for *model_name*.

    The special name ``"all"`` selects the cross-model aggregate stored
    under ``cluster_scores``; any other name is looked up under
    ``model_cluster_scores``.  Missing keys yield an empty dict.
    """
    if model_name == "all":
        return metrics.get("cluster_scores", {})
    return metrics.get("model_cluster_scores", {}).get(model_name, {})
35
+
36
+
37
def get_all_models(metrics: Dict[str, Any]) -> List[str]:
    """Model names present in *metrics*, with the aggregate pseudo-model
    ``"all"`` prepended as the first choice."""
    return ["all", *metrics.get("model_cluster_scores", {})]
42
+
43
+
44
def get_all_clusters(metrics: Dict[str, Any]) -> List[str]:
    """Cluster names aggregated across all models (insertion order kept)."""
    return [*metrics.get("cluster_scores", {})]
lmmvibes/vis_gradio/overview_tab.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Logic helpers for the **Overview** tab."""
2
+ from typing import List
3
+
4
+ from .state import app_state
5
+ from .utils import compute_model_rankings_new, create_model_summary_card_new
6
+
7
+ __all__ = ["create_overview"]
8
+
9
+
10
def create_overview(
    selected_models: List[str],
    top_n: int,
    score_significant_only: bool = False,
    quality_significant_only: bool = False,
    sort_by: str = "quality_asc",
    min_cluster_size: int = 1,
) -> str:
    """Return the HTML snippet that summarises model performance.

    Args:
        selected_models: Models whose summary cards should be rendered.
        top_n: Number of top clusters forwarded to each model card.
        score_significant_only / quality_significant_only: Filters
            forwarded to the card builder.
        sort_by: Cluster ordering key forwarded to the card builder.
        min_cluster_size: Minimum cluster size forwarded to the card builder.
    """
    if not app_state["metrics"]:
        return "Please load data first using the 'Load Data' tab."

    if not selected_models:
        return "Please select at least one model to display."

    # 1. Compute global rankings and filter to selection
    model_rankings = compute_model_rankings_new(app_state["metrics"])
    filtered_rankings = [
        (name, stats) for name, stats in model_rankings if name in selected_models
    ]

    # Sort so "all" appears first, then the rest by their rankings
    all_models = [(name, stats) for name, stats in filtered_rankings if name == "all"]
    other_models = [(name, stats) for name, stats in filtered_rankings if name != "all"]
    filtered_rankings = all_models + other_models

    if not filtered_rankings:
        return "No data available for selected models."

    # 2. Assemble HTML (static header + explanatory accordion, then one
    #    card per model appended below).
    overview_html = """
<div style="max-width: 1600px; margin: 0 auto;">
<p style="color: #666; margin-bottom: 10px;">
Top distinctive clusters where each model shows unique behavioural patterns.
Frequency shows what percentage of a model's battles resulted in that behavioural pattern.
</p>

<details style="margin-bottom:25px;">
<summary style="cursor:pointer; color:#4c6ef5; font-weight:600;">ℹ️ What do "proportion delta", "Quality Ξ”", and significance tags mean?</summary>
<div style="margin-top:12px; font-size:14px; line-height:1.5; color:#333;">
<strong>Proportion Delta</strong><br>
For each cluster we compute how often <em>this model</em> appears in that cluster compared with the average across all models.<br>
β€’ A positive value (e.g. <code>+0.15</code>) means the model hits the behaviour more often than average.<br>
β€’ A negative value (e.g. <code>-0.08</code>) means it appears less often.<br>
It is derived from the&nbsp;<code>proportion_delta</code>&nbsp;field in <code>model_cluster_scores.json</code>.<br><br>
<strong>Quality Ξ”</strong><br>
The difference between the cluster's quality score(s) for this model and the model's <em>overall</em> quality baseline, shown for each individual metric (e.g., helpfulness, accuracy).<br>
Positive values (green) indicate the model performs better than its average in that behaviour; negative values (red) indicate worse.<br>
This is derived from the <code>quality_delta</code> metric dictionary in <code>model_cluster_scores.json</code>.<br><br>
<strong>Significance Tags (FREQ/QUAL)</strong><br>
The <span style="background: #28a745; color: white; padding: 2px 6px; border-radius: 4px; font-size: 10px; font-weight: bold;">FREQ</span> and <span style="background: #007bff; color: white; padding: 2px 6px; border-radius: 4px; font-size: 10px; font-weight: bold;">QUAL</span> tags indicate <em>statistical significance</em> based on confidence intervals:<br>
β€’ <strong>FREQ</strong> (green): The proportion delta is statistically significant (confidence interval doesn't include zero)<br>
β€’ <strong>QUAL</strong> (blue): At least one quality metric delta is statistically significant<br>
These tags help identify which behavioral patterns are reliably different from the model's baseline performance.
</div>
</details>
"""

    for model_name, _ in filtered_rankings:
        card_html = create_model_summary_card_new(
            model_name,
            app_state["metrics"],
            # top_n etc.
            top_n,
            score_significant_only=score_significant_only,
            quality_significant_only=quality_significant_only,
            sort_by=sort_by,
            min_cluster_size=min_cluster_size,
        )
        overview_html += card_html

    overview_html += "</div>"
    return overview_html
+ return overview_html
lmmvibes/vis_gradio/plots_tab.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Plots tab for the LMM-Vibes Gradio app.
3
+
4
+ This module provides functionality to display the model cluster proportion and quality plots.
5
+ """
6
+
7
+ import gradio as gr
8
+ import pandas as pd
9
+ import plotly.express as px
10
+ import plotly.graph_objects as go
11
+ from typing import Tuple, List
12
+
13
+ from .state import app_state
14
+
15
+
16
def create_proportion_plot(show_ci: bool = False) -> Tuple[go.Figure, str]:
    """Create a grouped bar plot of cluster proportion by property and model.

    Args:
        show_ci: If True, draw 95% confidence-interval error bars when the
            ``proportion_ci_lower``/``proportion_ci_upper`` columns exist.

    Returns:
        Tuple of (plotly figure or None, markdown text with the P1..Pn
        property-name mapping, or an error message when no data is loaded).
    """
    if app_state.get("model_cluster_df") is None:
        return None, "No model cluster data loaded. Please load data first."

    model_cluster_df = app_state["model_cluster_df"]

    if model_cluster_df.empty:
        return None, "No model cluster data available."

    # Work on a copy so the shared state DataFrame is never mutated.
    model_cluster_df = model_cluster_df.copy()
    model_cluster_df['proportion'] = pd.to_numeric(model_cluster_df['proportion'], errors='coerce')

    # Filter out "No properties" clusters *before* building the property
    # mapping (bug fix: the mapping and axis order previously included the
    # filtered-out cluster, leaving a phantom P-label in the mapping text).
    model_cluster_df = model_cluster_df[model_cluster_df['cluster'] != "No properties"]

    # Map long cluster names to stable short labels P1..Pn for the x-axis.
    unique_properties = sorted(model_cluster_df['cluster'].unique())
    property_mapping = {prop: f"P{i+1}" for i, prop in enumerate(unique_properties)}
    model_cluster_df['property_abbr'] = model_cluster_df['cluster'].map(property_mapping)

    # Prepare asymmetric error-bar columns when CIs are requested and present.
    use_error_bars = (
        show_ci
        and 'proportion_ci_lower' in model_cluster_df.columns
        and 'proportion_ci_upper' in model_cluster_df.columns
    )
    if use_error_bars:
        model_cluster_df['y_error'] = (
            model_cluster_df['proportion_ci_upper'] - model_cluster_df['proportion']
        ).fillna(0)
        model_cluster_df['y_error_minus'] = (
            model_cluster_df['proportion'] - model_cluster_df['proportion_ci_lower']
        ).fillna(0)

    # Grouped bar plot of 'proportion' by property (x) and model (hue).
    fig = px.bar(
        model_cluster_df,
        x="property_abbr",
        y="proportion",
        color="model",
        barmode="group",
        title="Proportion by Property and Model",
        labels={"proportion": "Proportion", "property_abbr": "Property", "model": "Model"},
        error_y="y_error" if use_error_bars else None,
        error_y_minus="y_error_minus" if use_error_bars else None
    )

    # Force categorical x-order P1, P2, ... (lexicographic would put P10 before P2).
    property_order = [f"P{i+1}" for i in range(len(unique_properties))]
    fig.update_xaxes(categoryorder='array', categoryarray=property_order)
    fig.update_layout(xaxis_tickangle=45)

    # Best-effort export; the working directory may be read-only (e.g. HF Spaces).
    try:
        fig.write_html("model_cluster_proportion_plot.html")
    except OSError:
        pass

    # Human-readable legend for the abbreviated property names.
    mapping_text = "**Property Mapping:**\n\n"
    for prop, abbr in property_mapping.items():
        mapping_text += f"**{abbr}:** {prop}\n\n"

    # Add confidence interval info if enabled.
    if show_ci:
        if use_error_bars:
            mapping_text += "---\n\n**Confidence Intervals:**\n"
            mapping_text += "Error bars show 95% confidence intervals for proportion values.\n"
        else:
            mapping_text += "---\n\n**Note:** Confidence interval data not available in the loaded dataset.\n"

    return fig, mapping_text
97
+
98
+
99
def create_quality_plot(quality_metric: str = "helpfulness", show_ci: bool = False) -> Tuple[go.Figure, str]:
    """Create a grouped bar plot of one quality metric by property and model.

    Args:
        quality_metric: Metric name; read from the ``quality_<name>`` column.
        show_ci: If True, draw 95% CI error bars when the matching
            ``_ci_lower``/``_ci_upper`` columns are present.

    Returns:
        Tuple of (plotly figure or None, markdown mapping/error text).
    """
    if app_state.get("model_cluster_df") is None:
        return None, "No model cluster data loaded. Please load data first."

    model_cluster_df = app_state["model_cluster_df"]

    if model_cluster_df.empty:
        return None, "No model cluster data available."

    # Check if the requested quality metric exists in the data.
    quality_col = f"quality_{quality_metric}"
    if quality_col not in model_cluster_df.columns:
        # Get available quality metrics for a better error message.
        available_metrics = [col.replace("quality_", "") for col in model_cluster_df.columns
                             if col.startswith("quality_")
                             and not col.endswith(("_ci_lower", "_ci_upper", "_ci_mean", "_significant", "_delta"))]
        if not available_metrics:
            return None, f"No quality metrics found in the data. Available columns: {list(model_cluster_df.columns)}"
        return None, f"Quality metric '{quality_metric}' not found. Available metrics: {available_metrics}"

    # Work on a copy so the shared state DataFrame is never mutated.
    plot_df = model_cluster_df.copy()
    plot_df[quality_col] = pd.to_numeric(plot_df[quality_col], errors='coerce')

    if plot_df[quality_col].isna().all():
        return None, f"No valid quality data found for metric '{quality_metric}'. All values are missing or invalid."

    # Filter out "No properties" clusters *before* building the property
    # mapping (bug fix: the mapping and axis order previously included the
    # filtered-out cluster, leaving a phantom P-label in the mapping text).
    plot_df = plot_df[plot_df['cluster'] != "No properties"]

    # Map long cluster names to stable short labels P1..Pn (same scheme as
    # the proportion plot so the two plots stay comparable).
    unique_properties = sorted(plot_df['cluster'].unique())
    property_mapping = {prop: f"P{i+1}" for i, prop in enumerate(unique_properties)}
    plot_df['property_abbr'] = plot_df['cluster'].map(property_mapping)

    # Prepare asymmetric error-bar columns when CIs are requested and present.
    ci_lower_col = f"{quality_col}_ci_lower"
    ci_upper_col = f"{quality_col}_ci_upper"
    use_error_bars = show_ci and ci_lower_col in plot_df.columns and ci_upper_col in plot_df.columns
    if use_error_bars:
        plot_df['y_error'] = (plot_df[ci_upper_col] - plot_df[quality_col]).fillna(0)
        plot_df['y_error_minus'] = (plot_df[quality_col] - plot_df[ci_lower_col]).fillna(0)

    # Grouped bar plot of quality by property (x) and model (hue).
    fig = px.bar(
        plot_df,
        x="property_abbr",
        y=quality_col,
        color="model",
        barmode="group",
        title=f"Quality ({quality_metric.title()}) by Property and Model",
        labels={quality_col: f"Quality ({quality_metric.title()})", "property_abbr": "Property", "model": "Model"},
        error_y="y_error" if use_error_bars else None,
        error_y_minus="y_error_minus" if use_error_bars else None
    )

    # Force categorical x-order P1, P2, ... (lexicographic would put P10 before P2).
    property_order = [f"P{i+1}" for i in range(len(unique_properties))]
    fig.update_xaxes(categoryorder='array', categoryarray=property_order)
    fig.update_layout(xaxis_tickangle=45)

    # Best-effort export; the working directory may be read-only (e.g. HF Spaces).
    try:
        fig.write_html(f"model_cluster_quality_{quality_metric}_plot.html")
    except OSError:
        pass

    # Human-readable legend for the abbreviated property names.
    mapping_text = "**Property Mapping:**\n\n"
    for prop, abbr in property_mapping.items():
        mapping_text += f"**{abbr}:** {prop}\n\n"

    # Add confidence interval info if enabled.
    if show_ci:
        if use_error_bars:
            mapping_text += "---\n\n**Confidence Intervals:**\n"
            mapping_text += f"Error bars show 95% confidence intervals for {quality_metric} values.\n"
        else:
            mapping_text += "---\n\n**Note:** Confidence interval data not available for this quality metric.\n"

    return fig, mapping_text
192
+
193
+
194
def get_available_quality_metrics() -> List[str]:
    """Return the quality-metric names present in the loaded model-cluster DataFrame.

    Metric names come from ``quality_<name>`` columns, skipping the derived
    CI / significance / delta companion columns. Falls back to a default set
    when no data is loaded or no metric columns are found.
    """
    fallback = ["helpfulness", "accuracy", "harmlessness", "honesty"]

    df = app_state.get("model_cluster_df")
    if df is None:
        return fallback

    derived_suffixes = ("_ci_lower", "_ci_upper", "_ci_mean", "_significant", "_delta")
    metrics = [
        col.replace("quality_", "")
        for col in df.columns
        if col.startswith("quality_") and not col.endswith(derived_suffixes)
    ]

    return metrics if metrics else fallback
212
+
213
+
214
def update_quality_metric_dropdown() -> gr.Dropdown:
    """Rebuild the quality-metric dropdown from the metrics currently available."""
    metrics = get_available_quality_metrics()
    default_metric = metrics[0] if metrics else "helpfulness"
    return gr.Dropdown(
        label="Quality Metric",
        choices=metrics,
        value=default_metric,
        info="Select which quality metric to display"
    )
223
+
224
+
225
def update_quality_metric_visibility(plot_type: str) -> gr.Dropdown:
    """Rebuild the quality-metric dropdown, visible only for quality plots."""
    metrics = get_available_quality_metrics()
    default_metric = metrics[0] if metrics else "helpfulness"
    return gr.Dropdown(
        label="Quality Metric",
        choices=metrics,
        value=default_metric,
        info="Select which quality metric to display",
        visible=(plot_type == "quality")
    )
235
+
236
+
237
def create_plot_with_toggle(plot_type: str, quality_metric: str = "helpfulness", show_ci: bool = False) -> Tuple[go.Figure, str]:
    """Dispatch to the frequency or quality plot builder based on *plot_type*.

    Returns (figure, info-markdown); unknown plot types yield (None, message).
    """
    if plot_type == "frequency":
        return create_proportion_plot(show_ci)
    if plot_type == "quality":
        return create_quality_plot(quality_metric, show_ci)
    return None, f"Unknown plot type: {plot_type}"
245
+
246
+
247
def create_plots_tab() -> Tuple[gr.Plot, gr.Markdown, gr.Checkbox, gr.Dropdown, gr.Dropdown]:
    """Create the plots tab interface with a toggle between frequency and quality plots.

    Returns:
        The (plot display, info markdown, CI checkbox, plot-type dropdown,
        quality-metric dropdown) components so the caller can wire events.
    """
    # Fixed typo ("signifigance" -> "significance") in the user-facing hint.
    gr.Markdown("Interactive grouped bar plot showing either frequency (proportion) or quality metrics by property and model. **If the plot looks wonky, just unclick and re-click the significance checkbox to have it resize**")

    # Compute the metric list once instead of three separate calls.
    available_metrics = get_available_quality_metrics()
    default_metric = available_metrics[0] if available_metrics else "helpfulness"

    # Plot controls in a row
    with gr.Row():
        # Plot type toggle
        plot_type_dropdown = gr.Dropdown(
            label="Plot Type",
            choices=["frequency", "quality"],
            value="frequency",
            info="Choose between frequency (proportion) or quality metrics"
        )

        # Quality metric dropdown (only shown when "quality" is selected)
        quality_metric_dropdown = gr.Dropdown(
            label="Quality Metric",
            choices=available_metrics,
            value=default_metric,
            info="Select which quality metric to display",
            visible=False  # Initially hidden, shown when quality is selected
        )

        # Checkbox toggling 95% CI error bars
        show_ci_checkbox = gr.Checkbox(
            label="Show Confidence Intervals",
            value=True,
            info="Display 95% confidence intervals as error bars (if available in data)"
        )

    plot_display = gr.Plot(
        label="Model-Cluster Analysis Plot",
        value=None
    )

    plot_info = gr.Markdown("")

    return plot_display, plot_info, show_ci_checkbox, plot_type_dropdown, quality_metric_dropdown
lmmvibes/vis_gradio/side_by_side_display.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Side-by-side display component for comparing model responses.
3
+
4
+ This module provides functionality to display two model responses side by side
5
+ for comparison, specifically designed for datasets with model_a_response and
6
+ model_b_response fields.
7
+ """
8
+
9
+ from typing import Dict, Any, Optional
10
+ from .conversation_display import convert_to_openai_format, display_openai_conversation_html
11
+ import html
12
+
13
def display_side_by_side_responses(
    model_a: str,
    model_b: str,
    model_a_response: Any,
    model_b_response: Any,
    use_accordion: bool = True,
    pretty_print_dicts: bool = True,
    score: Optional[float] = None,
    winner: Optional[str] = None
) -> str:
    """
    Display two model responses side by side for comparison.

    Args:
        model_a: Name of model A (HTML-escaped before rendering)
        model_b: Name of model B (HTML-escaped before rendering)
        model_a_response: Response data from model A; the sentinel string
            'N/A' marks a missing response
        model_b_response: Response data from model B (same convention)
        use_accordion: If True, group system and info messages in collapsible accordions
        pretty_print_dicts: If True, pretty-print embedded dictionaries
        score: Optional score for the comparison; non-numeric values are
            silently ignored rather than raising
        winner: Optional winner indication ('model_a', 'model_b', or 'tie');
            any other value renders no badge

    Returns:
        HTML string for side-by-side display
    """

    # Convert responses to OpenAI chat format; 'N/A' means no data.
    conversation_a = convert_to_openai_format(model_a_response) if model_a_response != 'N/A' else None
    conversation_b = convert_to_openai_format(model_b_response) if model_b_response != 'N/A' else None

    # Render each conversation to HTML, or a red italic placeholder when absent.
    if conversation_a:
        html_a = display_openai_conversation_html(
            conversation_a,
            use_accordion=use_accordion,
            pretty_print_dicts=pretty_print_dicts
        )
    else:
        html_a = "<p style='color: #dc3545; font-style: italic;'>No response data available</p>"

    if conversation_b:
        html_b = display_openai_conversation_html(
            conversation_b,
            use_accordion=use_accordion,
            pretty_print_dicts=pretty_print_dicts
        )
    else:
        html_b = "<p style='color: #dc3545; font-style: italic;'>No response data available</p>"

    # Winner badges: green trophy on the winning column; a tie puts the same
    # grey badge on both columns.
    winner_badge_a = ""
    winner_badge_b = ""
    if winner:
        if winner == 'model_a':
            winner_badge_a = """
            <span style="
                background: #28a745;
                color: white;
                padding: 4px 8px;
                border-radius: 12px;
                font-size: 12px;
                font-weight: bold;
                margin-left: 10px;
            ">
                πŸ† Winner
            </span>
            """
        elif winner == 'model_b':
            winner_badge_b = """
            <span style="
                background: #28a745;
                color: white;
                padding: 4px 8px;
                border-radius: 12px;
                font-size: 12px;
                font-weight: bold;
                margin-left: 10px;
            ">
                πŸ† Winner
            </span>
            """
        elif winner == 'tie':
            tie_badge = """
            <span style="
                background: #6c757d;
                color: white;
                padding: 4px 8px;
                border-radius: 12px;
                font-size: 12px;
                font-weight: bold;
                margin-left: 10px;
            ">
                🀝 Tie
            </span>
            """
            winner_badge_a = tie_badge
            winner_badge_b = tie_badge

    # Score badge: green for non-negative scores, red for negative ones.
    # Unparseable scores are skipped without error.
    score_info = ""
    if score is not None and score != 'N/A':
        try:
            score_val = float(score)
            score_color = '#28a745' if score_val >= 0 else '#dc3545'
            score_info = f"""
            <div style="text-align: center; margin-bottom: 15px;">
                <span style="
                    background: {score_color};
                    color: white;
                    padding: 6px 12px;
                    border-radius: 15px;
                    font-size: 14px;
                    font-weight: bold;
                ">
                    Comparison Score: {score_val:.3f}
                </span>
            </div>
            """
        except (ValueError, TypeError):
            pass

    # Assemble the two-column flexbox layout; model names are escaped to
    # keep arbitrary model strings from breaking the markup.
    side_by_side_html = f"""
    <div style="margin-bottom: 20px;">
        {score_info}
        <div style="display: flex; gap: 20px; margin-top: 10px;">
            <!-- Model A Column -->
            <div style="flex: 1; border: 2px solid #e9ecef; border-radius: 8px; padding: 15px; background-color: #f8f9fa;">
                <h4 style="margin: 0 0 15px 0; padding-bottom: 10px; border-bottom: 2px solid #dee2e6; color: #495057; display: flex; align-items: center;">
                    <span style="background: #007bff; color: white; padding: 4px 8px; border-radius: 4px; font-size: 12px; margin-right: 10px;">A</span>
                    {html.escape(model_a)}
                    {winner_badge_a}
                </h4>
                <div style="font-size: 13px; line-height: 1.5;">
                    {html_a}
                </div>
            </div>

            <!-- Model B Column -->
            <div style="flex: 1; border: 2px solid #e9ecef; border-radius: 8px; padding: 15px; background-color: #f8f9fa;">
                <h4 style="margin: 0 0 15px 0; padding-bottom: 10px; border-bottom: 2px solid #dee2e6; color: #495057; display: flex; align-items: center;">
                    <span style="background: #fd7e14; color: white; padding: 4px 8px; border-radius: 4px; font-size: 12px; margin-right: 10px;">B</span>
                    {html.escape(model_b)}
                    {winner_badge_b}
                </h4>
                <div style="font-size: 13px; line-height: 1.5;">
                    {html_b}
                </div>
            </div>
        </div>
    </div>
    """

    return side_by_side_html
168
+
169
+
170
def is_side_by_side_dataset(example: Dict[str, Any]) -> bool:
    """
    Check if an example contains side-by-side comparison data.

    Args:
        example: Example dictionary from the dataset

    Returns:
        True if the example has both model_a_response and model_b_response
        present and non-None
    """
    required_keys = ('model_a_response', 'model_b_response')
    return all(key in example and example.get(key) is not None for key in required_keys)
183
+
184
+
185
def extract_side_by_side_data(row: Dict[str, Any]) -> Dict[str, Any]:
    """
    Extract side-by-side comparison data from a row.

    Args:
        row: Row from the dataset

    Returns:
        Dictionary with extracted side-by-side data, substituting display
        defaults for any missing keys
    """
    defaults = {
        'model_a': 'Model A',
        'model_b': 'Model B',
        'model_a_response': 'N/A',
        'model_b_response': 'N/A',
        'winner': None,
        'score': None,
    }
    return {key: row.get(key, default) for key, default in defaults.items()}
lmmvibes/vis_gradio/state.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Shared application state for the LMM-Vibes Gradio viewer.
3
+
4
+ This module centralises mutable globals so they can be imported from any other
5
+ sub-module without circular-import problems.
6
+ """
7
+ from typing import Any, Dict, Optional
8
+ import os
9
+ from pathlib import Path
10
+
11
# Global runtime state – mutable and shared across all tabs.
# Keys are populated by the data-loading tab and read by the display tabs.
app_state: Dict[str, Any] = {
    # Full clustered results DataFrame (one row per property/example).
    "clustered_df": None,
    # NEW canonical key for the FunctionalMetrics dict
    "metrics": None,
    # DEPRECATED alias kept temporarily so that untouched modules continue to work
    "model_stats": None,
    # Path of the currently loaded results file/directory.
    "results_path": None,
    # Model names discovered in the loaded data.
    "available_models": [],
    # Directory the current experiment results were loaded from.
    "current_results_dir": None,
}

# Base directory that contains experiment result folders. Can be changed at
# runtime via launch_app(results_dir=…). A value of None means "not set".
# Prefer persistent storage in Spaces at /data/data when available; otherwise
# fall back to the repo-local "data" directory. BASE_RESULTS_DIR env var wins.
_default_base = "/data/data" if Path("/data/data").exists() else "data"
BASE_RESULTS_DIR: Optional[str] = os.getenv("BASE_RESULTS_DIR", _default_base)
lmmvibes/vis_gradio/utils.py ADDED
@@ -0,0 +1,1673 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility functions for Gradio pipeline results app.
3
+
4
+ This module contains common utility functions used across different components.
5
+ """
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ import json
10
+ import markdown
11
+ import plotly.graph_objects as go
12
+ import plotly.express as px
13
+ from typing import Dict, List, Any, Optional, Tuple
14
+ import html
15
+ import ast
16
+
17
+ # Conversation rendering helpers are now in a dedicated module for clarity
18
+ from . import conversation_display as _convdisp
19
+ from .conversation_display import (
20
+ convert_to_openai_format,
21
+ display_openai_conversation_html,
22
+ pretty_print_embedded_dicts,
23
+ )
24
+
25
+ # NEW IMPLEMENTATION ---------------------------------------------------
26
+ from .metrics_adapter import get_model_clusters, get_all_models
27
+
28
+ # ---------------------------------------------------------------------------
29
+ # NEW helper utilities for FunctionalMetrics format
30
+ # ---------------------------------------------------------------------------
31
+
32
+
33
+ def format_confidence_interval(ci: dict | None, decimals: int = 3) -> str:
34
+ """Return a pretty string for a CI dict of the form {"lower": x, "upper": y}."""
35
+ if not ci or not isinstance(ci, dict):
36
+ return "N/A"
37
+ lower, upper = ci.get("lower"), ci.get("upper")
38
+ if lower is None or upper is None:
39
+ return "N/A"
40
+ return f"[{lower:.{decimals}f}, {upper:.{decimals}f}]"
41
+
42
+
43
+ def get_confidence_interval_width(ci: dict | None) -> float | None:
44
+ """Return CI width (upper-lower) if possible."""
45
+ if not ci or not isinstance(ci, dict):
46
+ return None
47
+ lower, upper = ci.get("lower"), ci.get("upper")
48
+ if lower is None or upper is None:
49
+ return None
50
+ return upper - lower
51
+
52
+
53
+ def has_confidence_intervals(record: dict | None) -> bool:
54
+ """Simple check whether any *_ci key with lower/upper exists in a metrics record."""
55
+ if not record or not isinstance(record, dict):
56
+ return False
57
+ for k, v in record.items():
58
+ if k.endswith("_ci") and isinstance(v, dict) and {"lower", "upper"}.issubset(v.keys()):
59
+ return True
60
+ return False
61
+
62
+
63
+ def extract_quality_score(quality_field: Any) -> float | None:
64
+ """Given a quality field that may be a dict of metric values or a scalar, return its mean."""
65
+ if quality_field is None:
66
+ return None
67
+ if isinstance(quality_field, (int, float)):
68
+ return float(quality_field)
69
+ if isinstance(quality_field, dict) and quality_field:
70
+ return float(np.mean(list(quality_field.values())))
71
+ return None
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # UPDATED: get_top_clusters_for_model for FunctionalMetrics format
75
+ # ---------------------------------------------------------------------------
76
+
77
+
78
def get_top_clusters_for_model(metrics: Dict[str, Any], model_name: str, top_n: int = 10) -> List[Tuple[str, Dict[str, Any]]]:
    """Return the top N clusters (by salience) for a given model.

    Args:
        metrics: The FunctionalMetrics dictionary (3-file format) loaded via data_loader.
        model_name: Name of the model to inspect.
        top_n: Number of clusters to return.

    Returns:
        List of (cluster_name, cluster_dict) tuples sorted by descending
        proportion_delta, excluding the "No properties" placeholder cluster.
    """
    clusters_dict = get_model_clusters(metrics, model_name)
    if not clusters_dict:
        return []

    # Drop the placeholder cluster and rank the rest by salience.
    ranked = sorted(
        ((name, data) for name, data in clusters_dict.items() if name != "No properties"),
        key=lambda item: item[1].get("proportion_delta", 0),
        reverse=True,
    )
    return ranked[:top_n]
100
+
101
+
102
def compute_model_rankings_new(metrics: Dict[str, Any]) -> List[tuple]:
    """Compute rankings of models based on mean salience (proportion_delta).

    Args:
        metrics: The FunctionalMetrics dict loaded by data_loader.

    Returns:
        List[Tuple[str, Dict[str, float]]]: (model_name, summary_dict) pairs
        sorted by descending average salience. Models with no real clusters
        (after dropping "No properties") are omitted.
    """
    rankings: Dict[str, Dict[str, float]] = {}
    for model in get_all_models(metrics):
        clusters = get_model_clusters(metrics, model)
        # Salience values, excluding the "No properties" placeholder cluster.
        saliences = [
            data.get("proportion_delta", 0.0)
            for name, data in clusters.items()
            if name != "No properties"
        ]
        if not saliences:
            continue
        rankings[model] = {
            "avg_salience": float(np.mean(saliences)),
            "median_salience": float(np.median(saliences)),
            "num_clusters": len(saliences),
            "top_salience": float(max(saliences)),
            "std_salience": float(np.std(saliences)),
        }
    return sorted(rankings.items(), key=lambda item: item[1]["avg_salience"], reverse=True)
127
+
128
+
129
def create_model_summary_card_new(
    model_name: str,
    metrics: Dict[str, Any],
    top_n: int = 3,
    score_significant_only: bool = False,
    quality_significant_only: bool = False,
    sort_by: str = "quality_asc",
    min_cluster_size: int = 1,
) -> str:
    """Generate a **styled** HTML summary card for a single model.

    The new implementation recreates the legacy card design the user prefers:
      - Card header with battle count
      - Each cluster displayed as a vertically-spaced block (NOT a table)
      - Frequency, distinctiveness factor and CI inline; quality score right-aligned

    Args:
        model_name: Model to summarise, or the special value ``"all"``, which
            aggregates across models.  For ``"all"`` the per-model fields
            ``proportion_delta`` / ``proportion_delta_significant`` do not
            exist, so the related filters/labels are skipped or substituted.
        metrics: FunctionalMetrics-style dict; ``model_scores`` and per-model
            cluster entries are read from it via ``get_model_clusters``.
        top_n: Maximum number of cluster blocks rendered on the card.
        score_significant_only: Keep only clusters whose frequency delta is
            statistically significant (no-op for ``model_name == "all"``).
        quality_significant_only: Keep only clusters with at least one
            significant quality-delta metric.
        sort_by: One of ``quality_asc``/``quality_desc``/``frequency_asc``/
            ``frequency_desc``/``salience_asc``/``salience_desc``; unknown
            values fall back to salience (or frequency for ``"all"``), desc.
        min_cluster_size: Clusters with fewer examples are dropped.

    Returns:
        A self-contained HTML fragment (inline styles only), or a short
        placeholder ``<div>`` when no data survives the filters.
    """

    clusters_dict = get_model_clusters(metrics, model_name)
    if not clusters_dict:
        return f"<div style='padding:20px'>No cluster data for {model_name}</div>"

    # Filter out "No properties" clusters (catch-all bucket, not informative)
    clusters_dict = {k: v for k, v in clusters_dict.items() if k != "No properties"}

    # Filter clusters ----------------------------------------------------
    all_clusters = [c for c in clusters_dict.values() if c.get("size", 0) >= min_cluster_size]

    if score_significant_only:
        if model_name == "all":
            # For "all" model, we don't have proportion_delta_significant, so skip this filter
            pass
        else:
            all_clusters = [c for c in all_clusters if c.get("proportion_delta_significant", False)]
    if quality_significant_only:
        # Any single significant quality metric qualifies the cluster.
        all_clusters = [c for c in all_clusters if any(c.get("quality_delta_significant", {}).values())]

    if not all_clusters:
        return f"<div style='padding:20px'>No clusters pass filters for {model_name}</div>"

    # Count significant properties ---------------------------------------
    # NOTE: counted over *all* size-qualified clusters, not just the ones
    # that survived the significance filters above.
    significant_frequency_count = 0
    significant_quality_count = 0

    for cluster in clusters_dict.values():
        if cluster.get("size", 0) >= min_cluster_size:
            # Count frequency significance (per-model only; "all" lacks the field)
            if model_name != "all" and cluster.get("proportion_delta_significant", False):
                significant_frequency_count += 1

            # Count quality significance (sum across all metrics)
            quality_delta_significant = cluster.get("quality_delta_significant", {})
            significant_quality_count += sum(quality_delta_significant.values())

    # Sort ---------------------------------------------------------------
    def _mean_quality(c: dict[str, Any]) -> float:
        # Average over whatever quality metrics exist; 0.0 when none.
        vals = list(c.get("quality", {}).values())
        return float(np.mean(vals)) if vals else 0.0

    # Maps sort mode -> (key function, reverse flag) for sorted() below.
    # "salience" uses proportion_delta, except for "all" where plain
    # proportion stands in (delta is undefined there).
    sort_key_map = {
        "quality_asc": (_mean_quality, False),
        "quality_desc": (_mean_quality, True),
        "frequency_desc": (lambda c: c.get("proportion", 0), True),
        "frequency_asc": (lambda c: c.get("proportion", 0), False),
        "salience_desc": (lambda c: c.get("proportion_delta", 0) if model_name != "all" else c.get("proportion", 0), True),
        "salience_asc": (lambda c: c.get("proportion_delta", 0) if model_name != "all" else c.get("proportion", 0), False),
    }

    key_fn, reverse = sort_key_map.get(sort_by, (lambda c: c.get("proportion_delta", 0) if model_name != "all" else c.get("proportion", 0), True))
    sorted_clusters = sorted(all_clusters, key=key_fn, reverse=reverse)[:top_n]

    # Determine total conversations for this model ----------------
    if model_name == "all":
        # For "all" model, sum the individual model totals to avoid double-counting
        model_scores = metrics.get("model_scores", {})
        total_battles = sum(model_data.get("size", 0) for model_data in model_scores.values())
    else:
        model_scores_entry = metrics.get("model_scores", {}).get(model_name, {})
        total_battles = model_scores_entry.get("size")
        if total_battles is None:
            # Fallback: sum cluster sizes (NOTE(review): comment in the
            # original claimed deduplication of example IDs, but this sum can
            # double-count an example appearing in several clusters).
            total_battles = sum(c.get("size", 0) for c in clusters_dict.values())

    # Card header --------------------------------------------------------
    # NOTE(review): the header always says "Top clusters by frequency" even
    # when sort_by selects quality/salience ordering -- confirm intended.
    html_parts: list[str] = [f"""
    <div style="padding: 20px; border:1px solid #e0e0e0; border-radius:8px; margin-bottom:25px;">
        <h3 style="margin-top:0; font-size: 20px;">{html.escape(model_name)}</h3>
        <p style="margin: 4px 0 8px 0; color:#555; font-size:13px;">
            {total_battles} battles Β· Top clusters by frequency
        </p>
        <p style="margin: 0 0 18px 0; color:#666; font-size:12px;">
            πŸ“Š {significant_frequency_count} significant frequency properties Β· {significant_quality_count} significant quality properties
        </p>
    """]

    # Cluster blocks -----------------------------------------------------
    for i, cluster in enumerate(sorted_clusters):
        # Recover the cluster's display name via identity lookup (the value
        # dicts are the same objects stored in clusters_dict).
        name = html.escape(next(k for k, v in clusters_dict.items() if v is cluster))
        prop = cluster.get("proportion", 0)
        freq_pct = prop * 100
        size = cluster.get("size", 0)

        # Check significance flags
        is_proportion_significant = False
        if model_name != "all":
            is_proportion_significant = cluster.get("proportion_delta_significant", False)

        quality_delta_significant = cluster.get("quality_delta_significant", {})
        is_quality_significant = any(quality_delta_significant.values())

        # Create significance indicators (small colored badges)
        significance_indicators = []
        if is_proportion_significant:
            significance_indicators.append('<span style="background: #28a745; color: white; padding: 2px 6px; border-radius: 4px; font-size: 10px; font-weight: bold;">FREQ</span>')
        if is_quality_significant:
            significance_indicators.append('<span style="background: #007bff; color: white; padding: 2px 6px; border-radius: 4px; font-size: 10px; font-weight: bold;">QUAL</span>')

        significance_html = " ".join(significance_indicators) if significance_indicators else ""

        # Distinctiveness factor heuristic
        if model_name == "all":
            # For "all" model, proportion_delta doesn't make sense, so show proportion instead
            distinct_factor = prop
            distinct_text = f"{freq_pct:.1f}% of all conversations"
        else:
            sal = cluster.get("proportion_delta", 0)
            distinct_factor = 1 + (sal / prop) if prop else 1
            distinct_text = f"proportion delta: {sal:+.3f}"

        # Confidence interval (frequency based)
        # NOTE(review): distinct_factor and ci_str are computed but never
        # rendered below -- dead locals kept for behavioural parity.
        ci = cluster.get("proportion_ci")
        ci_str = format_confidence_interval(ci) if ci else "N/A"

        # Quality delta - show each metric separately
        quality_delta = cluster.get("quality_delta", {})
        quality_delta_html = ""

        if quality_delta:
            quality_delta_parts = []
            for metric_name, delta_value in quality_delta.items():
                # Green for non-negative delta, red for negative.
                color = "#28a745" if delta_value >= 0 else "#dc3545"
                quality_delta_parts.append(f'<div style="color:{color}; font-weight:500;">{metric_name}: {delta_value:+.3f}</div>')
            quality_delta_html = "".join(quality_delta_parts)
        else:
            quality_delta_html = '<span style="color:#666;">No quality data</span>'

        # Get light color for this cluster
        cluster_color = get_light_color_for_cluster(name, i)

        html_parts.append(f"""
        <div style="border-left: 4px solid #4c6ef5; padding: 12px 16px; margin-bottom: 10px; background:{cluster_color}; border-radius: 4px;">
            <div style="display:flex; justify-content:space-between; align-items:flex-start;">
                <div style="max-width:80%;">
                    <div style="margin-bottom:4px;">
                        <strong style="font-size:14px;">{name}</strong>
                    </div>
                    <span style="font-size:12px; color:#555;">{freq_pct:.1f}% frequency ({size} out of {total_battles} total) Β· {distinct_text}</span>
                </div>
                <div style="font-size:12px; font-weight:normal; white-space:nowrap; text-align:right;">
                    {quality_delta_html}
                    {significance_html}
                </div>
            </div>
        </div>
        """)

    # Close card div -----------------------------------------------------
    html_parts.append("</div>")

    return "\n".join(html_parts)
298
+
299
+
300
def format_cluster_dataframe(clustered_df: pd.DataFrame,
                             selected_models: Optional[List[str]] = None,
                             cluster_level: str = 'fine') -> pd.DataFrame:
    """Format cluster DataFrame for display in Gradio.

    Args:
        clustered_df: Raw clustered results from the pipeline.
        selected_models: If given, keep only rows whose ``model`` value is in
            this list.  The filter is skipped (instead of raising ``KeyError``)
            when the frame has no ``model`` column.
        cluster_level: ``'fine'`` or ``'coarse'``; selects which cluster
            id/label columns to retain.

    Returns:
        A copy of the input restricted to the display columns that actually
        exist (cluster columns are included under either the prefixed or the
        bare naming pattern, whichever is present).
    """
    df = clustered_df.copy()

    # Debug information
    print(f"DEBUG: format_cluster_dataframe called")
    print(f" - Input DataFrame shape: {df.shape}")
    print(f" - Selected models: {selected_models}")
    print(f" - Available models in data: {df['model'].unique().tolist() if 'model' in df.columns else 'No model column'}")

    # Filter by models if specified.  Bug fix: guard on the column actually
    # existing -- previously a frame without a 'model' column raised KeyError
    # here even though the debug print above handles that case explicitly.
    if selected_models and 'model' in df.columns:
        print(f" - Filtering by {len(selected_models)} selected models")
        df = df[df['model'].isin(selected_models)]
        print(f" - After filtering shape: {df.shape}")
        print(f" - Models after filtering: {df['model'].unique().tolist()}")
    else:
        print(f" - No model filtering applied")

    # Column names differ between pipeline versions; support both the
    # prefixed ('property_description_*') and the bare naming patterns.
    if cluster_level == 'fine':
        id_col = 'property_description_fine_cluster_id'
        label_col = 'property_description_fine_cluster_label'
        alt_id_col = 'fine_cluster_id'
        alt_label_col = 'fine_cluster_label'
    else:
        id_col = 'property_description_coarse_cluster_id'
        label_col = 'property_description_coarse_cluster_label'
        alt_id_col = 'coarse_cluster_id'
        alt_label_col = 'coarse_cluster_label'

    # Try both naming patterns
    if id_col in df.columns and label_col in df.columns:
        cols = ['question_id', 'model', 'property_description', id_col, label_col, 'score']
    elif alt_id_col in df.columns and alt_label_col in df.columns:
        cols = ['question_id', 'model', 'property_description', alt_id_col, alt_label_col, 'score']
    else:
        # Fall back to basic columns if cluster columns are missing
        cols = ['question_id', 'model', 'property_description', 'score']

    # Keep only columns that exist in this particular frame
    available_cols = [col for col in cols if col in df.columns]
    df = df[available_cols]

    print(f" - Final DataFrame shape: {df.shape}")
    print(f" - Final columns: {df.columns.tolist()}")

    return df
354
+
355
+
356
def truncate_cluster_name(cluster_desc: str, max_length: int = 50) -> str:
    """Shorten *cluster_desc* to at most *max_length* characters.

    Strings that already fit are returned unchanged; longer ones are cut and
    suffixed with "..." so the result is exactly *max_length* characters.
    """
    too_long = len(cluster_desc) > max_length
    return f"{cluster_desc[:max_length - 3]}..." if too_long else cluster_desc
361
+
362
+ def create_frequency_comparison_table(model_stats: Dict[str, Any],
363
+ selected_models: List[str],
364
+ cluster_level: str = "fine", # Ignored – kept for backward-compat
365
+ top_n: int = 50,
366
+ selected_model: str | None = None,
367
+ selected_quality_metric: str | None = None) -> pd.DataFrame:
368
+ """Create a comparison table for the new FunctionalMetrics format.
369
+
370
+ The old signature is kept (cluster_level arg is ignored) so that callers
371
+ can be updated incrementally.
372
+ """
373
+
374
+ if not selected_models:
375
+ return pd.DataFrame()
376
+
377
+ # ------------------------------------------------------------------
378
+ # 1. Collect per-model, per-cluster rows
379
+ # ------------------------------------------------------------------
380
+ all_rows: List[dict] = []
381
+ for model in selected_models:
382
+ model_clusters = get_model_clusters(model_stats, model) # type: ignore[arg-type]
383
+ if not model_clusters:
384
+ continue
385
+
386
+ # Optional filter by a single model after the fact
387
+ if selected_model and model != selected_model:
388
+ continue
389
+
390
+ for cluster_name, cdata in model_clusters.items():
391
+ # Filter out "No properties" clusters
392
+ if cluster_name == "No properties":
393
+ continue
394
+
395
+ # Basic numbers
396
+ freq_pct = cdata.get("proportion", 0.0) * 100.0
397
+ prop_ci = cdata.get("proportion_ci")
398
+
399
+ # Quality per metric dicts ------------------------------------------------
400
+ quality_dict = cdata.get("quality", {}) or {}
401
+ quality_ci_dict = cdata.get("quality_ci", {}) or {}
402
+
403
+ # Significance flags
404
+ sal_sig = bool(cdata.get("proportion_delta_significant", False))
405
+ quality_sig_flags = cdata.get("quality_delta_significant", {}) or {}
406
+
407
+ all_rows.append({
408
+ "cluster": cluster_name,
409
+ "model": model,
410
+ "frequency": freq_pct,
411
+ "proportion_ci": prop_ci,
412
+ "quality": quality_dict,
413
+ "quality_ci": quality_ci_dict,
414
+ "score_significant": sal_sig,
415
+ "quality_significant_any": any(quality_sig_flags.values()),
416
+ "quality_significant_metric": quality_sig_flags.get(selected_quality_metric) if selected_quality_metric else None,
417
+ })
418
+
419
+ if not all_rows:
420
+ return pd.DataFrame()
421
+
422
+ df_all = pd.DataFrame(all_rows)
423
+
424
+ # Aggregate frequency across models ----------------------------------
425
+ freq_sum = df_all.groupby("cluster")["frequency"].sum().sort_values(ascending=False)
426
+ top_clusters = freq_sum.head(top_n).index.tolist()
427
+
428
+ df_top = df_all[df_all["cluster"].isin(top_clusters)].copy()
429
+
430
+ table_rows: List[dict] = []
431
+ for clu in top_clusters:
432
+ subset = df_top[df_top["cluster"] == clu]
433
+ avg_freq = subset["frequency"].mean()
434
+
435
+ # Aggregate CI (mean of bounds)
436
+ ci_lowers = [ci.get("lower") for ci in subset["proportion_ci"] if isinstance(ci, dict)]
437
+ ci_uppers = [ci.get("upper") for ci in subset["proportion_ci"] if isinstance(ci, dict)]
438
+ freq_ci = {
439
+ "lower": float(np.mean(ci_lowers)) if ci_lowers else None,
440
+ "upper": float(np.mean(ci_uppers)) if ci_uppers else None,
441
+ } if ci_lowers and ci_uppers else None
442
+
443
+ # Quality aggregation -----------------------------------------------------
444
+ q_vals: List[float] = []
445
+ q_ci_l: List[float] = []
446
+ q_ci_u: List[float] = []
447
+ quality_sig_any = False
448
+ for _, row in subset.iterrows():
449
+ q_dict = row["quality"]
450
+ if selected_quality_metric:
451
+ if selected_quality_metric in q_dict:
452
+ q_vals.append(q_dict[selected_quality_metric])
453
+ ci_metric = row["quality_ci"].get(selected_quality_metric) if isinstance(row["quality_ci"], dict) else None
454
+ if ci_metric:
455
+ q_ci_l.append(ci_metric.get("lower"))
456
+ q_ci_u.append(ci_metric.get("upper"))
457
+ quality_sig_any = quality_sig_any or bool(row["quality_significant_metric"])
458
+ else:
459
+ q_vals.extend(q_dict.values())
460
+ for ci in row["quality_ci"].values():
461
+ if isinstance(ci, dict):
462
+ q_ci_l.append(ci.get("lower"))
463
+ q_ci_u.append(ci.get("upper"))
464
+ quality_sig_any = quality_sig_any or row["quality_significant_any"]
465
+
466
+ quality_val = float(np.mean(q_vals)) if q_vals else None
467
+ quality_ci = {
468
+ "lower": float(np.mean(q_ci_l)),
469
+ "upper": float(np.mean(q_ci_u)),
470
+ } if q_ci_l and q_ci_u else None
471
+
472
+ score_sig = subset["score_significant"].any()
473
+
474
+ table_rows.append({
475
+ "Cluster": clu,
476
+ "Frequency (%)": f"{avg_freq:.1f}",
477
+ "Freq CI": format_confidence_interval(freq_ci),
478
+ "Quality": f"{quality_val:.3f}" if quality_val is not None else "N/A",
479
+ "Quality CI": format_confidence_interval(quality_ci) if quality_ci else "N/A",
480
+ "Score Significance": "Yes" if score_sig else "No",
481
+ "Quality Significance": "Yes" if quality_sig_any else "No",
482
+ })
483
+
484
+ return pd.DataFrame(table_rows)
485
+
486
+
487
def create_frequency_comparison_plots(model_stats: Dict[str, Any],
                                      selected_models: List[str],
                                      cluster_level: str = 'fine',
                                      top_n: int = 50,
                                      show_confidence_intervals: bool = False) -> Tuple[go.Figure, go.Figure]:
    """Create frequency comparison plots (matching frequencies_tab.py exactly).

    Builds two horizontal bar charts:
      1. per-model frequency (%) for the top clusters, grouped bars;
      2. a single-bar quality-score chart over the same clusters.

    Args:
        model_stats: Legacy per-model stats dict; each model maps
            ``cluster_level`` -> list of cluster dicts.  NOTE(review): this is
            the *old* stats layout (``property_description``/``score_ci``
            keys), unlike the FunctionalMetrics layout used elsewhere in this
            module -- confirm callers pass the right structure.
        selected_models: Models to plot; others are skipped.
        cluster_level: 'fine' or 'coarse' cluster list to read.
        top_n: Cap on number of clusters shown.
        show_confidence_intervals: Toggles visibility of the error bars.

    Returns:
        (frequency figure, quality figure); placeholder annotated figures
        when no data is available.
    """

    print(f"\nDEBUG: Plotting function called with:")
    print(f" - Selected models: {selected_models}")
    print(f" - Cluster level: {cluster_level}")
    print(f" - Top N: {top_n}")
    print(f" - Available models in stats: {list(model_stats.keys())}")

    # Use the same data preparation logic as the table function
    # Collect all clusters across all models for the chart (exact copy from frequencies_tab.py)
    all_clusters_data = []
    for model_name, model_data in model_stats.items():
        if model_name not in selected_models:
            continue

        clusters = model_data.get(cluster_level, [])
        for cluster in clusters:
            # Filter out "No properties" clusters
            if cluster.get('property_description') == "No properties":
                continue

            # Get confidence intervals for quality scores if available
            quality_score_ci = cluster.get('quality_score_ci', {})
            has_quality_ci = bool(quality_score_ci)

            # Get distinctiveness score confidence intervals (correct structure)
            score_ci = cluster.get('score_ci', {})
            ci_lower = score_ci.get('lower') if score_ci else None
            ci_upper = score_ci.get('upper') if score_ci else None

            all_clusters_data.append({
                'property_description': cluster['property_description'],
                'model': model_name,
                'frequency': cluster.get('proportion', 0) * 100,  # Convert to percentage
                'size': cluster.get('size', 0),
                'cluster_size_global': cluster.get('cluster_size_global', 0),
                'has_ci': has_confidence_intervals(cluster),
                'ci_lower': ci_lower,
                'ci_upper': ci_upper,
                'has_quality_ci': has_quality_ci
            })

    if not all_clusters_data:
        # Return empty figures
        empty_fig = go.Figure()
        empty_fig.add_annotation(text="No data available", xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False)
        return empty_fig, empty_fig

    clusters_df = pd.DataFrame(all_clusters_data)

    # Get all unique clusters for the chart
    all_unique_clusters = clusters_df['property_description'].unique()
    total_clusters = len(all_unique_clusters)

    # Show all clusters by default
    top_n_for_chart = min(top_n, total_clusters)

    # Calculate total frequency per cluster and get top clusters
    cluster_totals = clusters_df.groupby('property_description')['frequency'].sum().sort_values(ascending=False)
    top_clusters = cluster_totals.head(top_n_for_chart).index.tolist()

    # Get quality scores for the same clusters to sort by quality
    quality_data_for_sorting = []
    for model_name, model_data in model_stats.items():
        if model_name not in selected_models:
            continue
        clusters = model_data.get(cluster_level, [])
        for cluster in clusters:
            # Filter out "No properties" clusters
            if cluster.get('property_description') == "No properties":
                continue

            if cluster['property_description'] in top_clusters:
                quality_data_for_sorting.append({
                    'property_description': cluster['property_description'],
                    'quality_score': extract_quality_score(cluster.get('quality_score', 0))
                })

    # Calculate average quality score per cluster and sort
    if quality_data_for_sorting:
        quality_df_for_sorting = pd.DataFrame(quality_data_for_sorting)
        avg_quality_per_cluster = quality_df_for_sorting.groupby('property_description')['quality_score'].mean().sort_values(ascending=True)  # Low to high
        top_clusters = avg_quality_per_cluster.index.tolist()
        # Reverse the order so low quality appears at top of chart
        top_clusters = top_clusters[::-1]

    # Filter data to only include top clusters
    chart_data = clusters_df[clusters_df['property_description'].isin(top_clusters)]

    if chart_data.empty:
        # Return empty figures
        empty_fig = go.Figure()
        empty_fig.add_annotation(text="No data available", xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False)
        return empty_fig, empty_fig

    # Get unique models for colors
    models = chart_data['model'].unique()
    # Use a color palette that avoids yellow - using Set1 which has better contrast
    colors = px.colors.qualitative.Set1[:len(models)]

    # Create horizontal bar chart for frequencies
    fig = go.Figure()

    # Add a bar for each model
    for i, model in enumerate(models):
        # Shadows the earlier `model_data` loop variable; intentional reuse.
        model_data = chart_data[chart_data['model'] == model]

        # Sort by cluster order (same as top_clusters)
        model_data = model_data.set_index('property_description').reindex(top_clusters).reset_index()

        # Fill NaN values with 0 for missing clusters
        model_data['frequency'] = model_data['frequency'].fillna(0)
        model_data['has_ci'] = model_data['has_ci'].fillna(False)
        # For CI columns, replace NaN with None using where() instead of fillna(None)
        model_data['ci_lower'] = model_data['ci_lower'].where(pd.notna(model_data['ci_lower']), None)
        model_data['ci_upper'] = model_data['ci_upper'].where(pd.notna(model_data['ci_upper']), None)

        # Ensure frequency is numeric and non-negative
        model_data['frequency'] = pd.to_numeric(model_data['frequency'], errors='coerce').fillna(0)
        model_data['frequency'] = model_data['frequency'].clip(lower=0)

        # Debug: print model data for first model
        if i == 0:  # Only print for first model to avoid spam
            print(f"DEBUG: Model {model} data sample:")
            print(f" - Clusters: {len(model_data)}")
            print(f" - Frequency range: {model_data['frequency'].min():.2f} - {model_data['frequency'].max():.2f}")
            print(f" - Non-zero frequencies: {(model_data['frequency'] > 0).sum()}")
            if len(model_data) > 0:
                print(f" - Sample row: {model_data.iloc[0][['property_description', 'frequency']].to_dict()}")

        # Remove any rows where property_description is NaN (these are clusters this model doesn't appear in)
        model_data = model_data.dropna(subset=['property_description'])

        # Get confidence intervals for error bars
        ci_lower = []
        ci_upper = []
        for _, row in model_data.iterrows():
            freq_value = row.get('frequency', 0)
            if (row.get('has_ci', False) and
                pd.notna(row.get('ci_lower')) and
                pd.notna(row.get('ci_upper')) and
                freq_value > 0):  # Only calculate CIs for non-zero frequencies

                # IMPORTANT: These are distinctiveness score CIs, not frequency CIs
                # The distinctiveness score measures how much more/less frequently
                # a model exhibits this behavior compared to the median model
                # We can use this to estimate uncertainty in the frequency measurement
                distinctiveness_ci_width = row['ci_upper'] - row['ci_lower']

                # Convert to frequency uncertainty (approximate)
                # A wider distinctiveness CI suggests more uncertainty in the frequency
                # NOTE(review): 0.1 is an ad-hoc scaling constant, not a
                # statistically derived conversion -- treat the bars as indicative.
                freq_uncertainty = distinctiveness_ci_width * freq_value * 0.1
                ci_lower.append(max(0, freq_value - freq_uncertainty))
                ci_upper.append(freq_value + freq_uncertainty)
            else:
                ci_lower.append(None)
                ci_upper.append(None)

        # Debug: Check the data going into the plot
        print(f"DEBUG: Adding trace for model {model}:")
        print(f" - Y values (clusters): {model_data['property_description'].tolist()[:3]}...")  # First 3 clusters
        print(f" - X values (frequencies): {model_data['frequency'].tolist()[:3]}...")  # First 3 frequencies
        print(f" - Total data points: {len(model_data)}")

        fig.add_trace(go.Bar(
            y=model_data['property_description'],
            x=model_data['frequency'],
            name=model,
            orientation='h',
            marker_color=colors[i],
            error_x=dict(
                type='data',
                array=[u - l if u is not None and l is not None else None for l, u in zip(ci_lower, ci_upper)],
                arrayminus=[f - l if f is not None and l is not None else None for f, l in zip(model_data['frequency'], ci_lower)],
                visible=show_confidence_intervals,
                thickness=1,
                width=3,
                color='rgba(0,0,0,0.3)'
            ),
            hovertemplate='<b>%{y}</b><br>' +
                          f'Model: {model}<br>' +
                          'Frequency: %{x:.1f}%<br>' +
                          'CI: %{customdata[0]}<extra></extra>',
            # NOTE(review): the double nesting makes customdata a single-row
            # list (one entry for the whole trace) rather than one entry per
            # point -- confirm the hover CI renders correctly per bar.
            customdata=[[
                format_confidence_interval({
                    'lower': l,
                    'upper': u
                }) if l is not None and u is not None else "N/A"
                for l, u in zip(ci_lower, ci_upper)
            ]]
        ))

    # Update layout
    fig.update_layout(
        title=f"Model Frequencies in Top {len(top_clusters)} Clusters",
        xaxis_title="Frequency (%)",
        yaxis_title="Cluster Description",
        barmode='group',  # Group bars side by side
        height=max(600, len(top_clusters) * 25),  # Adjust height based on number of clusters
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        )
    )

    # Update y-axis to show truncated cluster names
    fig.update_yaxes(
        tickmode='array',
        ticktext=[truncate_cluster_name(desc, 60) for desc in top_clusters],
        tickvals=top_clusters
    )

    # Create quality score chart
    # Get quality scores for the same clusters (single score per cluster)
    quality_data = []
    quality_cis = []  # Add confidence intervals for quality scores

    for cluster_desc in top_clusters:
        # Get the first available quality score for this cluster
        # (first model in model_stats order that contains the cluster wins)
        for model_name, model_data in model_stats.items():
            clusters = model_data.get(cluster_level, [])
            for cluster in clusters:
                if cluster['property_description'] == cluster_desc:
                    quality_score = extract_quality_score(cluster.get('quality_score', 0))
                    quality_data.append({
                        'property_description': cluster_desc,
                        'quality_score': quality_score
                    })

                    # Get quality score confidence intervals
                    quality_ci = cluster.get('quality_score_ci', {})
                    if isinstance(quality_ci, dict) and quality_ci:
                        # Get the first available quality CI; the for/else
                        # records a None CI when no metric had usable bounds.
                        for score_key, ci_data in quality_ci.items():
                            if isinstance(ci_data, dict):
                                ci_lower = ci_data.get('lower')
                                ci_upper = ci_data.get('upper')
                                if ci_lower is not None and ci_upper is not None:
                                    quality_cis.append({
                                        'property_description': cluster_desc,
                                        'ci_lower': ci_lower,
                                        'ci_upper': ci_upper
                                    })
                                    break
                        else:
                            quality_cis.append({
                                'property_description': cluster_desc,
                                'ci_lower': None,
                                'ci_upper': None
                            })
                    else:
                        quality_cis.append({
                            'property_description': cluster_desc,
                            'ci_lower': None,
                            'ci_upper': None
                        })
                    break
            # Stop scanning models once this cluster has a recorded score.
            if any(q['property_description'] == cluster_desc for q in quality_data):
                break

    if quality_data:
        quality_df = pd.DataFrame(quality_data)
        quality_cis_df = pd.DataFrame(quality_cis) if quality_cis else None

        # Create quality score chart with single bars
        fig_quality = go.Figure()

        # Prepare confidence intervals for error bars (aligned with quality_df rows)
        ci_lower = []
        ci_upper = []
        for _, row in quality_df.iterrows():
            cluster_desc = row['property_description']
            if quality_cis_df is not None:
                ci_row = quality_cis_df[quality_cis_df['property_description'] == cluster_desc]
                if not ci_row.empty:
                    ci_lower.append(ci_row.iloc[0]['ci_lower'])
                    ci_upper.append(ci_row.iloc[0]['ci_upper'])
                else:
                    ci_lower.append(None)
                    ci_upper.append(None)
            else:
                ci_lower.append(None)
                ci_upper.append(None)

        # Add a single bar for each cluster
        fig_quality.add_trace(go.Bar(
            y=[truncate_cluster_name(desc, 60) for desc in quality_df['property_description']],
            x=quality_df['quality_score'],
            orientation='h',
            marker_color='lightblue',  # Single color for all bars
            name='Quality Score',
            showlegend=False,
            error_x=dict(
                type='data',
                array=[u - l if u is not None and l is not None else None for l, u in zip(ci_lower, ci_upper)],
                arrayminus=[q - l if q is not None and l is not None else None for q, l in zip(quality_df['quality_score'], ci_lower)],
                visible=show_confidence_intervals,
                thickness=1,
                width=3,
                color='rgba(0,0,0,0.3)'
            ),
            hovertemplate='<b>%{y}</b><br>' +
                          'Quality Score: %{x:.3f}<br>' +
                          'CI: %{customdata[0]}<extra></extra>',
            # NOTE(review): same single-row customdata nesting as above.
            customdata=[[
                format_confidence_interval({
                    'lower': l,
                    'upper': u
                }) if l is not None and u is not None else "N/A"
                for l, u in zip(ci_lower, ci_upper)
            ]]
        ))

        # Update layout
        fig_quality.update_layout(
            title=f"Quality Scores",
            xaxis_title="Quality Score",
            yaxis_title="",  # No y-axis title to save space
            height=max(600, len(top_clusters) * 25),  # Same height as main chart
            showlegend=False,
            yaxis=dict(showticklabels=False)  # Hide y-axis labels to save space
        )
    else:
        # Create empty quality figure
        fig_quality = go.Figure()
        fig_quality.add_annotation(text="No quality score data available",
                                   xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False)

    return fig, fig_quality
825
+
826
+
827
def search_clusters_by_text(clustered_df: pd.DataFrame,
                            search_term: str,
                            search_in: str = 'description') -> pd.DataFrame:
    """Search clusters by text in descriptions or other fields.

    Args:
        clustered_df: Clustered results table.
        search_term: Case-insensitive substring to match; empty returns the
            first 100 rows unfiltered.
        search_in: 'description', 'model', 'cluster_label', or any other
            value to search across all known text columns.

    Returns:
        At most 100 matching rows of ``clustered_df``.
    """
    if not search_term:
        return clustered_df.head(100)  # Return first 100 if no search

    search_term = search_term.lower()

    if search_in == 'description':
        mask = clustered_df['property_description'].str.lower().str.contains(search_term, na=False)
    elif search_in == 'model':
        mask = clustered_df['model'].str.lower().str.contains(search_term, na=False)
    elif search_in == 'cluster_label':
        # Use correct column names from pipeline
        fine_label_col = 'property_description_fine_cluster_label'
        coarse_label_col = 'property_description_coarse_cluster_label'
        # Bug fix: seed the mask on the frame's own index.  The previous
        # pd.Series([False] * len(df)) used a default 0..n-1 index, so the
        # |= below misaligned (and boolean indexing failed) whenever the
        # frame had a non-default index, e.g. after prior filtering.
        mask = pd.Series(False, index=clustered_df.index)

        if fine_label_col in clustered_df.columns:
            mask |= clustered_df[fine_label_col].str.lower().str.contains(search_term, na=False)
        if coarse_label_col in clustered_df.columns:
            mask |= clustered_df[coarse_label_col].str.lower().str.contains(search_term, na=False)
    else:
        # Search in all text columns using correct column names
        text_cols = ['property_description', 'model',
                     'property_description_fine_cluster_label',
                     'property_description_coarse_cluster_label']
        mask = pd.Series(False, index=clustered_df.index)  # Same index fix as above
        for col in text_cols:
            if col in clustered_df.columns:
                mask |= clustered_df[col].str.lower().str.contains(search_term, na=False)

    return clustered_df[mask].head(100)
861
+
862
+
863
def search_clusters_only(clustered_df: pd.DataFrame,
                         search_term: str,
                         cluster_level: str = 'fine') -> pd.DataFrame:
    """Search only over cluster labels, not individual property descriptions.

    An empty *search_term* returns the frame untouched.  Both the prefixed
    (``property_description_*_cluster_label``) and bare (``*_cluster_label``)
    column names are tried, in that order; if neither exists an empty
    DataFrame is returned.
    """
    if not search_term:
        return clustered_df

    needle = search_term.lower()

    # Candidate label columns for the requested level, preferred name first.
    if cluster_level == 'fine':
        candidates = ('property_description_fine_cluster_label', 'fine_cluster_label')
    else:
        candidates = ('property_description_coarse_cluster_label', 'coarse_cluster_label')

    for column in candidates:
        if column in clustered_df.columns:
            hits = clustered_df[column].str.lower().str.contains(needle, na=False)
            return clustered_df[hits]

    # Neither naming pattern is present in this frame.
    return pd.DataFrame()
890
+
891
+
892
def create_interactive_cluster_viewer(clustered_df: pd.DataFrame,
                                      selected_models: Optional[List[str]] = None,
                                      cluster_level: str = 'fine') -> str:
    """Create interactive cluster viewer HTML similar to Streamlit version.

    Builds a self-contained HTML fragment of expandable ``<details>`` cards,
    one per cluster, sorted by cluster size (largest first). Quality and
    frequency figures are pulled from ``app_state["metrics"]["cluster_scores"]``
    keyed by cluster label.

    Args:
        clustered_df: Pipeline output with one row per (model, property) and
            cluster id/label columns in either the prefixed
            (``property_description_*``) or unprefixed naming scheme.
        selected_models: If given, restrict rows to these values of the
            ``model`` column before grouping.
        cluster_level: ``'fine'`` or ``'coarse'``. When coarse columns are
            missing, silently falls back to fine clusters (with a banner).

    Returns:
        HTML string; on any failure a styled error/notice ``<div>`` is
        returned instead of raising.
    """
    if clustered_df.empty:
        return "<p>No cluster data available</p>"

    df = clustered_df.copy()

    # Debug information (intentionally printed to stdout, not logged)
    print(f"DEBUG: create_interactive_cluster_viewer called")
    print(f" - Input DataFrame shape: {df.shape}")
    print(f" - Selected models: {selected_models}")
    print(f" - Available models in data: {df['model'].unique().tolist() if 'model' in df.columns else 'No model column'}")

    # Filter by models if specified
    if selected_models:
        print(f" - Filtering by {len(selected_models)} selected models")
        df = df[df['model'].isin(selected_models)]
        print(f" - After filtering shape: {df.shape}")
        print(f" - Models after filtering: {df['model'].unique().tolist()}")
    else:
        print(f" - No model filtering applied")

    if df.empty:
        return f"<p>No data found for selected models: {', '.join(selected_models or [])}</p>"

    # Get cluster scores data for quality and frequency information.
    # Imported lazily to avoid a circular import at module load time.
    from .state import app_state
    cluster_scores = app_state.get("metrics", {}).get("cluster_scores", {})

    # Use the actual column names from the pipeline output (matching Streamlit version)
    if cluster_level == 'fine':
        id_col = 'property_description_fine_cluster_id'
        label_col = 'property_description_fine_cluster_label'
        # Also check for alternative naming without prefix
        alt_id_col = 'fine_cluster_id'
        alt_label_col = 'fine_cluster_label'
    else:
        id_col = 'property_description_coarse_cluster_id'
        label_col = 'property_description_coarse_cluster_label'
        # Also check for alternative naming without prefix
        alt_id_col = 'coarse_cluster_id'
        alt_label_col = 'coarse_cluster_label'

    # Track if we fall back from coarse to fine (drives the banner below)
    fell_back_to_fine = False

    # Check if required columns exist and provide helpful debug info
    # Try both naming patterns
    if id_col in df.columns and label_col in df.columns:
        # Use the expected naming pattern
        pass
    elif alt_id_col in df.columns and alt_label_col in df.columns:
        # Use the alternative naming pattern
        id_col = alt_id_col
        label_col = alt_label_col
    else:
        # If coarse clusters are not available, try to fall back to fine clusters
        if cluster_level == 'coarse':
            # Check if fine clusters are available
            fine_id_col = 'property_description_fine_cluster_id'
            fine_label_col = 'property_description_fine_cluster_label'
            fine_alt_id_col = 'fine_cluster_id'
            fine_alt_label_col = 'fine_cluster_label'

            if (fine_id_col in df.columns and fine_label_col in df.columns) or (fine_alt_id_col in df.columns and fine_alt_label_col in df.columns):
                # Fall back to fine clusters (prefer the prefixed naming)
                if fine_id_col in df.columns and fine_label_col in df.columns:
                    id_col = fine_id_col
                    label_col = fine_label_col
                else:
                    id_col = fine_alt_id_col
                    label_col = fine_alt_label_col
                cluster_level = 'fine'  # Update the cluster level for display
                fell_back_to_fine = True
            else:
                # No cluster columns available at all
                available_cols = list(df.columns)
                return f"""
                <div style="padding: 20px; background: #fff3cd; border: 1px solid #ffeaa7; border-radius: 8px;">
                    <h4>❌ Missing cluster columns in data</h4>
                    <p><strong>Expected:</strong> {id_col}, {label_col} OR {alt_id_col}, {alt_label_col}</p>
                    <p><strong>Available columns:</strong> {', '.join(available_cols)}</p>
                    <p>Please ensure your data contains clustering results from the LMM-Vibes pipeline.</p>
                </div>
                """
        else:
            # For fine clusters, show the original error
            available_cols = list(df.columns)
            return f"""
            <div style="padding: 20px; background: #fff3cd; border: 1px solid #ffeaa7; border-radius: 8px;">
                <h4>❌ Missing {cluster_level} cluster columns in data</h4>
                <p><strong>Expected:</strong> {id_col}, {label_col} OR {alt_id_col}, {alt_label_col}</p>
                <p><strong>Available columns:</strong> {', '.join(available_cols)}</p>
                <p>Please ensure your data contains clustering results from the LMM-Vibes pipeline.</p>
            </div>
            """

    # Group by cluster to get cluster information
    try:
        print(f" - Grouping by cluster columns: {id_col}, {label_col}")
        # One row per cluster: property count, the unique descriptions, and
        # the unique models that exhibit the property.
        cluster_groups = df.groupby([id_col, label_col]).agg({
            'property_description': ['count', lambda x: x.unique().tolist()],
            'model': lambda x: x.unique().tolist()
        }).reset_index()

        # Flatten column names (agg above yields a MultiIndex)
        cluster_groups.columns = [
            id_col, label_col, 'size', 'property_descriptions', 'models'
        ]

        # Sort by size (largest first)
        cluster_groups = cluster_groups.sort_values('size', ascending=False)

        # Filter out "No properties" clusters
        cluster_groups = cluster_groups[cluster_groups[label_col] != "No properties"]

        print(f" - Found {len(cluster_groups)} clusters")
        print(f" - Cluster sizes: {cluster_groups['size'].tolist()}")
        print(f" - Models per cluster: {[len(models) for models in cluster_groups['models']]}")

    except Exception as e:
        # Any grouping failure is surfaced to the user as an HTML error box.
        return f"""
        <div style="padding: 20px; background: #f8d7da; border: 1px solid #f5c6cb; border-radius: 8px;">
            <h4>❌ Error processing cluster data</h4>
            <p><strong>Error:</strong> {str(e)}</p>
            <p>Please check your data format and try again.</p>
        </div>
        """

    if len(cluster_groups) == 0:
        return """
        <div style="padding: 20px; background: #d1ecf1; border: 1px solid #bee5eb; border-radius: 8px;">
            <h4>ℹ️ No clusters found</h4>
            <p>No clusters match your current filters. Try selecting different models or adjusting your search.</p>
        </div>
        """

    # Create HTML
    html = f"""
    <div style="max-width: 1600px; margin: 0 auto;">
        <h3>πŸ” Interactive Cluster Viewer ({cluster_level.title()} Level)</h3>
        <p style="color: #666; margin-bottom: 20px;">
            Click on clusters below to explore their property descriptions.
            Showing {len(cluster_groups)} clusters sorted by size.
        </p>
    """

    # Add a note if we fell back from coarse to fine clusters
    if cluster_level == 'fine' and fell_back_to_fine:
        html += """
        <div style="padding: 15px; background: #fff3cd; border: 1px solid #ffeaa7; border-radius: 8px; margin-bottom: 20px;">
            <strong>Note:</strong> Coarse clusters not available in this dataset. Showing fine clusters instead.
        </div>
        """

    for i, row in cluster_groups.iterrows():
        cluster_id = row[id_col]
        cluster_label = row[label_col]
        cluster_size = row['size']
        property_descriptions = row['property_descriptions']
        models_in_cluster = row['models']

        # Get quality and frequency information from cluster_scores
        # (keyed by cluster label; missing entries render as 0 / "N/A").
        cluster_metrics = cluster_scores.get(cluster_label, {})
        frequency_pct = cluster_metrics.get("proportion", 0) * 100 if cluster_metrics else 0
        quality_scores = cluster_metrics.get("quality", {})
        quality_delta = cluster_metrics.get("quality_delta", {})

        # Build per-metric header display: "metric: score (delta)"
        header_quality_display = "N/A"
        if quality_scores or quality_delta:
            metric_names = sorted(set(quality_scores.keys()) | set(quality_delta.keys()))
            parts: list[str] = []
            for metric_name in metric_names:
                score_val = quality_scores.get(metric_name)
                delta_val = quality_delta.get(metric_name)
                score_str = f"{score_val:.3f}" if isinstance(score_val, (int, float)) else "N/A"
                if isinstance(delta_val, (int, float)):
                    # Green for non-negative deltas, red for negative.
                    color = "#28a745" if delta_val >= 0 else "#dc3545"
                    parts.append(f"{metric_name}: {score_str} <span style=\"color: {color};\">({delta_val:+.3f})</span>")
                else:
                    parts.append(f"{metric_name}: {score_str}")
            header_quality_display = "\n".join(parts)

        # Format quality scores for detailed view
        quality_html = ""
        if quality_scores:
            quality_parts = []
            for metric_name, score in quality_scores.items():
                color = "#28a745" if score >= 0 else "#dc3545"
                quality_parts.append(f'<span style="color:{color}; font-weight:500;">{metric_name}: {score:.3f}</span>')
            quality_html = " | ".join(quality_parts)
        else:
            quality_html = '<span style="color:#666;">No quality data</span>'

        # Format quality delta (relative to average)
        quality_delta_html = ""
        if quality_delta:
            delta_parts = []
            for metric_name, delta in quality_delta.items():
                color = "#28a745" if delta >= 0 else "#dc3545"
                sign = "+" if delta >= 0 else ""
                delta_parts.append(f'<span style="color:{color}; font-weight:500;">{metric_name}: {sign}{delta:.3f}</span>')
            quality_delta_html = " | ".join(delta_parts)
        else:
            quality_delta_html = '<span style="color:#666;">No delta data</span>'

        # Format header quality score with visual indicators
        header_quality_text = header_quality_display

        # Get light color for this cluster (matching overview style)
        cluster_color = get_light_color_for_cluster(cluster_label, i)

        # Create expandable cluster card with overview-style design
        html += f"""
        <details style="margin: 15px 0; border: 1px solid #e0e0e0; border-radius: 8px; overflow: hidden; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
            <summary style="
                padding: 15px;
                background: {cluster_color};
                color: #333;
                cursor: pointer;
                font-weight: 600;
                font-size: 16px;
                user-select: none;
                list-style: none;
                display: flex;
                justify-content: space-between;
                align-items: center;
                border-bottom: 1px solid #dee2e6;
            ">
                <div style="max-width: 80%;">
                    <div style="margin-bottom: 4px;">
                        <strong style="font-size: 14px;">{cluster_label}</strong>
                    </div>
                    <span style="font-size: 12px; color: #555;">
                        {frequency_pct:.1f}% frequency ({cluster_size} properties) Β· {len(models_in_cluster)} models
                    </span>
                </div>
                <div style="font-size: 12px; font-weight: normal; white-space: nowrap; text-align: right;">
                    <div style="margin-bottom: 4px;">
                        <span style="font-weight: 500;">{header_quality_text}</span>
                    </div>
                    <div style="color: #6c757d;">
                        {frequency_pct:.1f}% frequency
                    </div>
                </div>
            </summary>

            <div style="padding: 20px; background: #f8f9fa;">
                <div style="margin-bottom: 15px;">
                    <strong>Cluster ID:</strong> {cluster_id}<br>
                    <strong>Size:</strong> {cluster_size} properties<br>
                    <strong>Models:</strong> {', '.join(models_in_cluster)}<br>
                    <strong>Frequency:</strong> {frequency_pct:.1f}% of all conversations<br>
                    <strong>Quality Scores:</strong> {quality_html}<br>
                    <strong>Quality vs Average:</strong> {quality_delta_html}
                </div>

                <h4 style="color: #333; margin: 15px 0 10px 0;">
                    Property Descriptions ({len(property_descriptions)})
                </h4>

                <div style="max-height: 300px; overflow-y: auto; background: white; border: 1px solid #ddd; border-radius: 4px; padding: 10px;">
        """

        # Display property descriptions
        # NOTE(review): this inner loop reuses the outer loop variable `i`.
        # Harmless today because `i` is reassigned by iterrows() each outer
        # iteration before any further use, but fragile — consider renaming.
        for i, desc in enumerate(property_descriptions, 1):
            html += f"""
                <div style="
                    padding: 8px;
                    margin: 4px 0;
                    background: #f8f9fa;
                    border-left: 3px solid #667eea;
                    border-radius: 2px;
                ">
                    <strong>{i}.</strong> {desc}
                </div>
            """

        html += """
                </div>
            </div>
        </details>
        """

    html += "</div>"
    return html
1181
+
1182
+
1183
def get_cluster_statistics(clustered_df: pd.DataFrame,
                           selected_models: Optional[List[str]] = None) -> Dict[str, Any]:
    """Get cluster statistics for display.

    Computes per-level (fine/coarse) cluster counts and min/max/avg cluster
    sizes, accepting either the pipeline's prefixed column names
    (``property_description_<level>_cluster_id``) or the unprefixed fallback
    (``<level>_cluster_id``). Levels whose columns are absent are simply
    omitted from the result.

    Args:
        clustered_df: Clustered results DataFrame.
        selected_models: If given, restrict rows to these ``model`` values
            before computing statistics.

    Returns:
        Dict with ``total_properties``, ``total_models`` and, per available
        level, ``<level>_clusters`` plus ``{min,max,avg}_properties_per_
        <level>_cluster``. Empty dict for an empty input DataFrame.
    """
    if clustered_df.empty:
        return {}

    df = clustered_df.copy()

    # Filter by models if specified
    if selected_models:
        df = df[df['model'].isin(selected_models)]

    stats: Dict[str, Any] = {
        'total_properties': len(df),
        'total_models': df['model'].nunique() if 'model' in df.columns else 0,
    }

    def _add_level_stats(level: str, candidate_cols: List[str]) -> None:
        # Add stats for one cluster level using the first id column that
        # exists in the DataFrame (prefixed naming takes precedence).
        id_col = next((c for c in candidate_cols if c in df.columns), None)
        if id_col is None:
            return
        stats[f'{level}_clusters'] = df[id_col].nunique()
        cluster_sizes = df.groupby(id_col).size()
        empty = cluster_sizes.empty  # e.g. all rows filtered out above
        stats[f'min_properties_per_{level}_cluster'] = cluster_sizes.min() if not empty else 0
        stats[f'max_properties_per_{level}_cluster'] = cluster_sizes.max() if not empty else 0
        stats[f'avg_properties_per_{level}_cluster'] = cluster_sizes.mean() if not empty else 0

    _add_level_stats('fine', ['property_description_fine_cluster_id', 'fine_cluster_id'])
    _add_level_stats('coarse', ['property_description_coarse_cluster_id', 'coarse_cluster_id'])

    return stats
1235
+
1236
+
1237
def get_unique_values_for_dropdowns(clustered_df: pd.DataFrame) -> Dict[str, List[str]]:
    """Get unique values for dropdown menus.

    Collects the distinct prompts, models and cluster labels/properties from
    the clustered results, using the first matching column for each category.
    Long prompt/property entries are truncated to 100 characters with an
    appended ``...`` for display.

    Args:
        clustered_df: Clustered results DataFrame. Supports both single-model
            datasets (``model`` column) and side-by-side datasets
            (``model_a``/``model_b`` columns).

    Returns:
        Dict with keys ``prompts``, ``models`` and ``properties``, each a
        sorted list of display strings (empty lists when the source columns
        are missing or the DataFrame is empty).
    """
    if clustered_df.empty:
        return {'prompts': [], 'models': [], 'properties': []}

    def _first_existing_column(candidates: List[str]) -> Optional[str]:
        # First candidate column present in the DataFrame, or None.
        return next((c for c in candidates if c in clustered_df.columns), None)

    def _sorted_truncated(values: List[str]) -> List[str]:
        # Sort first, then truncate long entries for display (matches the
        # original per-branch behavior).
        return [v[:100] + "..." if len(v) > 100 else v for v in sorted(values)]

    # Prompts: first matching prompt-like column wins.
    prompts: List[str] = []
    prompt_col = _first_existing_column(['prompt', 'question', 'input', 'user_prompt'])
    if prompt_col is not None:
        prompts = _sorted_truncated(clustered_df[prompt_col].dropna().unique().tolist())

    # Models: handle both single model and side-by-side datasets.
    models: List[str] = []
    if 'model' in clustered_df.columns:
        # Single model datasets
        models = sorted(clustered_df['model'].dropna().unique().tolist())
    elif 'model_a' in clustered_df.columns and 'model_b' in clustered_df.columns:
        # Side-by-side datasets - combine models from both columns
        models_a = clustered_df['model_a'].dropna().unique().tolist()
        models_b = clustered_df['model_b'].dropna().unique().tolist()
        models = sorted(set(models_a + models_b))

    # Properties: prefer fine cluster labels (either naming scheme), falling
    # back to raw property descriptions; "No properties" is filtered out.
    properties: List[str] = []
    prop_col = _first_existing_column([
        'property_description_fine_cluster_label',
        'fine_cluster_label',
        'property_description',
    ])
    if prop_col is not None:
        unique_properties = clustered_df[prop_col].dropna().unique().tolist()
        unique_properties = [prop for prop in unique_properties if prop != "No properties"]
        properties = _sorted_truncated(unique_properties)

    return {
        'prompts': prompts,
        'models': models,
        'properties': properties
    }
1296
+
1297
+ # ---------------------------------------------------------------------------
1298
+ # Example data extraction (restored)
1299
+ # ---------------------------------------------------------------------------
1300
+
1301
def get_example_data(
    clustered_df: pd.DataFrame,
    selected_prompt: str | None = None,
    selected_model: str | None = None,
    selected_property: str | None = None,
    max_examples: int = 5,
    show_unexpected_behavior: bool = False,
    randomize: bool = False,
) -> List[Dict[str, Any]]:
    """Return a list of example rows filtered by prompt / model / property.

    This function was accidentally removed during a refactor; it is required by
    *examples_tab.py* and other parts of the UI.

    Filters are applied in order: unexpected-behavior, prompt, model,
    property; then the result is limited to ``max_examples`` rows.

    Args:
        clustered_df: DataFrame containing the clustered results data
        selected_prompt: Prompt to filter by (None for all)
        selected_model: Model to filter by (None for all)
        selected_property: Property description to filter by (None for all)
        max_examples: Maximum number of examples to return
        show_unexpected_behavior: If True, filter to only show unexpected behavior
        randomize: If True, sample randomly from the filtered set instead of taking the first rows
            (no random seed is set, so results vary between calls)

    Returns:
        List of example dictionaries with extracted data
    """

    if clustered_df.empty:
        return []

    df = clustered_df.copy()

    # Filter by unexpected behavior if requested
    if show_unexpected_behavior:
        if "unexpected_behavior" in df.columns:
            # Assuming True/1 means unexpected behavior
            df = df[df["unexpected_behavior"].isin([True, 1, "True", "true"])]
        else:
            # If no unexpected_behavior column, return empty (or could return all)
            return []

    # Filter by prompt — only the FIRST matching prompt-like column is used.
    # NOTE(review): str.contains treats the filter as a regex; dropdown values
    # truncated with "..." rely on the dots matching arbitrary characters, so
    # do not switch to regex=False without revisiting the dropdown values.
    if selected_prompt:
        prompt_cols = ["prompt", "question", "input", "user_prompt"]
        for col in prompt_cols:
            if col in df.columns:
                df = df[df[col].str.contains(selected_prompt, case=False, na=False)]
                break

    # Filter by model - handle both single model and side-by-side datasets
    if selected_model:
        if "model" in df.columns:
            # Single model datasets
            df = df[df["model"] == selected_model]
        elif "model_a" in df.columns and "model_b" in df.columns:
            # Side-by-side datasets - filter where either model_a or model_b matches
            df = df[(df["model_a"] == selected_model) | (df["model_b"] == selected_model)]

    # Filter by property — same regex caveat as the prompt filter above.
    if selected_property:
        property_cols = ["property_description", "cluster", "fine_cluster_label", "property_description_fine_cluster_label"]
        for col in property_cols:
            if col in df.columns:
                df = df[df[col].str.contains(selected_property, case=False, na=False)]
                break

    # Limit to max_examples (randomized if requested)
    if randomize:
        if len(df) > max_examples:
            df = df.sample(n=max_examples)
        else:
            # Fewer rows than requested: shuffle them all.
            df = df.sample(frac=1)
    else:
        df = df.head(max_examples)

    examples: List[Dict[str, Any]] = []
    for _, row in df.iterrows():
        # First non-null prompt-like value, else "N/A".
        prompt_val = next(
            (row.get(col) for col in ["prompt", "question", "input", "user_prompt"] if row.get(col) is not None),
            "N/A",
        )

        # Check if this is a side-by-side dataset (both responses present
        # and non-null on this row).
        is_side_by_side = ('model_a_response' in row and 'model_b_response' in row and
                          row.get('model_a_response') is not None and row.get('model_b_response') is not None)

        if is_side_by_side:
            # For side-by-side datasets, store both responses separately;
            # the "response" field carries a sentinel understood downstream.
            response_val = "SIDE_BY_SIDE"  # Special marker
            model_val = f"{row.get('model_a', 'Model A')} vs {row.get('model_b', 'Model B')}"
        else:
            # For single response datasets, use the existing logic:
            # first non-null among the known response column names.
            response_val = next(
                (
                    row.get(col)
                    for col in [
                        "model_response",
                        "model_a_response",
                        "model_b_response",
                        "responses",
                        "response",
                        "output",
                    ]
                    if row.get(col) is not None
                ),
                "N/A",
            )
            model_val = row.get("model", "N/A")

        # Try both naming patterns for cluster data (prefixed preferred)
        fine_cluster_id = row.get("property_description_fine_cluster_id", row.get("fine_cluster_id", "N/A"))
        fine_cluster_label = row.get("property_description_fine_cluster_label", row.get("fine_cluster_label", "N/A"))
        coarse_cluster_id = row.get("property_description_coarse_cluster_id", row.get("coarse_cluster_id", "N/A"))
        coarse_cluster_label = row.get("property_description_coarse_cluster_label", row.get("coarse_cluster_label", "N/A"))

        example_dict = {
            "id": row.get("id", "N/A"),
            "model": model_val,
            "prompt": prompt_val,
            "response": response_val,
            "property_description": row.get("property_description", "N/A"),
            "score": row.get("score", "N/A"),
            "fine_cluster_id": fine_cluster_id,
            "fine_cluster_label": fine_cluster_label,
            "coarse_cluster_id": coarse_cluster_id,
            "coarse_cluster_label": coarse_cluster_label,
            "category": row.get("category", "N/A"),
            "type": row.get("type", "N/A"),
            "impact": row.get("impact", "N/A"),
            "reason": row.get("reason", "N/A"),
            "evidence": row.get("evidence", "N/A"),
            "user_preference_direction": row.get("user_preference_direction", "N/A"),
            "raw_response": row.get("raw_response", "N/A"),
            "contains_errors": row.get("contains_errors", "N/A"),
            "unexpected_behavior": row.get("unexpected_behavior", "N/A"),
        }

        # Add side-by-side specific fields if applicable
        if is_side_by_side:
            example_dict.update({
                "is_side_by_side": True,
                "model_a": row.get("model_a", "Model A"),
                "model_b": row.get("model_b", "Model B"),
                "model_a_response": row.get("model_a_response", "N/A"),
                "model_b_response": row.get("model_b_response", "N/A"),
                "winner": row.get("winner", None),
            })
        else:
            example_dict["is_side_by_side"] = False

        examples.append(example_dict)

    return examples
1454
+
1455
+
1456
def format_examples_display(examples: List[Dict[str, Any]],
                            selected_prompt: str = None,
                            selected_model: str = None,
                            selected_property: str = None,
                            use_accordion: bool = True,
                            pretty_print_dicts: bool = True) -> str:
    """Format examples for HTML display with proper conversation rendering.

    Renders each example (as produced by :func:`get_example_data`) as an
    expandable ``<details>`` card; the first card is open by default.
    Side-by-side examples are rendered via the comparison display, single
    responses via the OpenAI-format conversation renderer.

    Args:
        examples: List of example dictionaries
        selected_prompt: Currently selected prompt filter
        selected_model: Currently selected model filter
        selected_property: Currently selected property filter
        use_accordion: If True, group system and info messages in collapsible accordions
        pretty_print_dicts: If True, pretty-print embedded dictionaries

    Returns:
        HTML string for display
    """
    # Imported lazily to avoid circular imports at module load time.
    from .conversation_display import convert_to_openai_format, display_openai_conversation_html
    from .side_by_side_display import display_side_by_side_responses

    if not examples:
        return "<p style='color: #e74c3c; padding: 20px;'>No examples found matching the current filters.</p>"

    # Create filter summary (only filters that differ from the "All ..."
    # dropdown defaults are shown).
    filter_parts = []
    if selected_prompt and selected_prompt != "All Prompts":
        filter_parts.append(f"Prompt: {selected_prompt}")
    if selected_model and selected_model != "All Models":
        filter_parts.append(f"Model: {selected_model}")
    if selected_property and selected_property != "All Clusters":
        filter_parts.append(f"Cluster: {selected_property}")

    filter_summary = ""
    if filter_parts:
        filter_summary = f"""
        <div style="background: #e3f2fd; padding: 15px; border-radius: 8px; margin-bottom: 20px; border-left: 4px solid #2196f3;">
            <strong>πŸ” Active Filters:</strong> {" β€’ ".join(filter_parts)}
        </div>
        """

    html = f"""
    <div style="font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;">
        <h3 style="color: #333; margin-bottom: 15px;">πŸ“‹ Examples ({len(examples)} found)</h3>
        {filter_summary}
    """

    for i, example in enumerate(examples, 1):
        # Check if this is a side-by-side example
        if example.get('is_side_by_side', False):
            # Use side-by-side display for comparison datasets
            conversation_html = display_side_by_side_responses(
                model_a=example['model_a'],
                model_b=example['model_b'],
                model_a_response=example['model_a_response'],
                model_b_response=example['model_b_response'],
                use_accordion=use_accordion,
                pretty_print_dicts=pretty_print_dicts,
                score=example['score'],
                winner=example.get('winner')
            )
        else:
            # Convert response to OpenAI format for proper display (single model)
            response_data = example['response']
            if response_data != 'N/A':
                openai_conversation = convert_to_openai_format(response_data)
                conversation_html = display_openai_conversation_html(openai_conversation, use_accordion=use_accordion, pretty_print_dicts=pretty_print_dicts)
            else:
                conversation_html = "<p style='color: #dc3545; font-style: italic;'>No response data available</p>"

        # Determine cluster info
        # NOTE(review): cluster_info is built here but never interpolated into
        # the card HTML below — appears to be dead code left from a refactor.
        cluster_info = ""
        if example['fine_cluster_label'] != 'N/A':
            cluster_info = f"""
            <div style="margin-top: 10px; font-size: 13px; color: #666;">
                <strong>🏷️ Cluster:</strong> {example['fine_cluster_label']} (ID: {example['fine_cluster_id']})
            </div>
            """

        # Score display for summary (only for non-side-by-side or when not shown in side-by-side)
        score_badge = ""
        if not example.get('is_side_by_side', False) and example['score'] != 'N/A':
            try:
                score_val = float(example['score'])
                score_color = '#28a745' if score_val >= 0 else '#dc3545'
                score_badge = f"""
                <span style="
                    background: {score_color};
                    color: white;
                    padding: 4px 8px;
                    border-radius: 12px;
                    font-size: 12px;
                    font-weight: bold;
                    margin-left: 10px;
                ">
                    Score: {score_val:.3f}
                </span>
                """
            except:
                # Non-numeric score: silently skip the badge.
                pass

        # Create short preview of prompt for summary
        prompt_preview = example['prompt'][:80] + "..." if len(example['prompt']) > 80 else example['prompt']

        # Create expandable example card
        # First example is expanded by default
        open_attr = "open" if i == 1 else ""

        html += f"""
        <details {open_attr} style="border: 1px solid #dee2e6; border-radius: 8px; margin-bottom: 15px; background: white; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
            <summary style="
                padding: 15px;
                cursor: pointer;
                font-weight: 600;
                color: #495057;
                background: linear-gradient(90deg, #f8f9fa 0%, #e9ecef 100%);
                border-radius: 8px 8px 0 0;
                border-bottom: 1px solid #dee2e6;
                display: flex;
                align-items: center;
                justify-content: space-between;
            ">
                <span>
                    <span style="background: #6c757d; color: white; padding: 4px 8px; border-radius: 4px; font-size: 12px; margin-right: 10px;">#{i}</span>
                    {prompt_preview}
                </span>
                <span style="font-size: 12px; color: #6c757d;">
                    {example['model']}{score_badge}
                </span>
            </summary>

            <div style="padding: 20px;">
                <div style="margin-bottom: 15px; padding: 15px; background: #f8f9fa; border-radius: 6px; border-left: 4px solid #17a2b8;">

                    <div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px; font-size: 13px; color: #666;">
                        <div><strong>Model:</strong> {example['model']}</div>
                        <div><strong>ID:</strong> {example['id']}</div>
                        {f'<div><strong>Category:</strong> {example["category"]}</div>' if example["category"] not in ["N/A", "None"] else ""}
                        {f'<div><strong>Type:</strong> {example["type"]}</div>' if example["type"] not in ["N/A", "None"] else ""}
                        {f'<div><strong>Impact:</strong> {example["impact"]}</div>' if example["impact"] not in ["N/A", "None"] else ""}
                    </div>

                    <div style="margin-top: 10px;">
                        {f'<div style="margin-top: 10px;"><strong>Property:</strong> {example["property_description"]}</div>' if example["property_description"] not in ["N/A", "None"] else ""}
                        {f'<div style="margin-top: 10px;"><strong>Reason:</strong> {example["reason"]}</div>' if example["reason"] not in ["N/A", "None"] else ""}
                        {f'<div style="margin-top: 10px;"><strong>Evidence:</strong> {example["evidence"]}</div>' if example["evidence"] not in ["N/A", "None"] else ""}
                    </div>
                </div>

                <div style="margin-bottom: 15px;">
                    <h5 style="margin: 0 0 8px 0; color: #333; font-size: 14px;">πŸ’¬ {"Response Comparison" if example.get('is_side_by_side', False) else "Conversation"}</h5>
                    <div style="border-radius: 6px; font-size: 13px; line-height: 1.5;">
                        {conversation_html}
                    </div>
                </div>
            </div>
        </details>
        """

    html += "</div>"
    return html
1618
+
1619
+ # ---------------------------------------------------------------------------
1620
+ # Legacy function aliases (backward compatibility)
1621
+ # ---------------------------------------------------------------------------
1622
+
1623
def compute_model_rankings(*args, **kwargs):
    """Legacy alias: forwards all arguments to ``compute_model_rankings_new``.

    Kept for backward compatibility with callers that still import the old name.
    """
    return compute_model_rankings_new(*args, **kwargs)
1626
+
1627
+
1628
def create_model_summary_card(*args, **kwargs):
    """Legacy alias: forwards all arguments to ``create_model_summary_card_new``.

    Kept for backward compatibility with callers that still import the old name.
    """
    return create_model_summary_card_new(*args, **kwargs)
1631
+
1632
+
1633
def get_total_clusters_count(metrics: Dict[str, Any]) -> int:
    """Return how many clusters appear in the metrics data.

    The placeholder "No properties" bucket is excluded from the count;
    a missing ``cluster_scores`` entry yields 0.
    """
    scores = metrics.get("cluster_scores", {})
    # Count every cluster label except the placeholder bucket.
    return sum(1 for name in scores if name != "No properties")
1639
+
1640
+
1641
def get_light_color_for_cluster(cluster_name: str, index: int) -> str:
    """Return the background colour for a cluster card.

    Every cluster currently shares the same very light dusty-blue shade.
    Both arguments are accepted (and ignored) so that per-cluster colouring
    could be introduced later without touching any call sites.
    """
    return "#f0f4f8"  # Very light dusty blue
1647
+
1648
# Public API of this module: the surface relied upon by the Gradio tab
# modules and exported via ``from lmmvibes.vis_gradio.utils import *``.
__all__ = [
    "get_model_clusters",
    "get_all_models",
    "get_all_clusters",
    "format_confidence_interval",
    "get_confidence_interval_width",
    "has_confidence_intervals",
    "extract_quality_score",
    "get_top_clusters_for_model",
    "compute_model_rankings_new",
    "create_model_summary_card_new",
    "format_cluster_dataframe",
    "truncate_cluster_name",
    "create_frequency_comparison_table",
    "create_frequency_comparison_plots",
    "search_clusters_by_text",
    "search_clusters_only",
    "create_interactive_cluster_viewer",
    "get_cluster_statistics",
    "get_unique_values_for_dropdowns",
    "get_example_data",
    "format_examples_display",
    "compute_model_rankings",          # legacy alias
    "create_model_summary_card",       # legacy alias
    "get_total_clusters_count",
]