Spaces:
Sleeping
Sleeping
Lisa Dunlap
committed on
Commit
·
4862c84
0
Parent(s):
restart
Browse files- .gitattributes +39 -0
- .gitignore +7 -0
- data/aci_bench/cluster_scores.json +3 -0
- data/aci_bench/cluster_scores_df.jsonl +3 -0
- data/aci_bench/clustered_results_lightweight.jsonl +3 -0
- data/aci_bench/clusters.json +3 -0
- data/aci_bench/model_cluster_scores.json +3 -0
- data/aci_bench/model_cluster_scores_df.jsonl +3 -0
- data/aci_bench/model_scores.json +3 -0
- data/aci_bench/model_scores_df.jsonl +3 -0
- data/aci_bench/model_stats.json +3 -0
- data/aci_bench/parsed_properties.jsonl +3 -0
- data/aci_bench/parsing_error_summary.json +3 -0
- data/aci_bench/parsing_failures.jsonl +3 -0
- data/aci_bench/parsing_stats.json +3 -0
- data/aci_bench/summary.txt +33 -0
- data/aci_bench/summary_table.json +3 -0
- data/aci_bench/summary_table.jsonl +3 -0
- data/aci_bench/validated_properties.jsonl +3 -0
- data/aci_bench/validation_stats.json +3 -0
- lmmvibes/__init__.py +0 -0
- lmmvibes/metrics/plotting.py +616 -0
- lmmvibes/utils/__init__.py +1 -0
- lmmvibes/utils/persistent_storage.py +80 -0
- lmmvibes/vis_gradio/__init__.py +13 -0
- lmmvibes/vis_gradio/app.py +697 -0
- lmmvibes/vis_gradio/clusters_tab.py +199 -0
- lmmvibes/vis_gradio/conversation_display.py +507 -0
- lmmvibes/vis_gradio/data_loader.py +189 -0
- lmmvibes/vis_gradio/debug_tab.py +83 -0
- lmmvibes/vis_gradio/demo.py +73 -0
- lmmvibes/vis_gradio/examples_tab.py +129 -0
- lmmvibes/vis_gradio/frequency_tab.py +307 -0
- lmmvibes/vis_gradio/launcher.py +122 -0
- lmmvibes/vis_gradio/load_data_tab.py +147 -0
- lmmvibes/vis_gradio/metrics_adapter.py +46 -0
- lmmvibes/vis_gradio/overview_tab.py +82 -0
- lmmvibes/vis_gradio/plots_tab.py +284 -0
- lmmvibes/vis_gradio/side_by_side_display.py +202 -0
- lmmvibes/vis_gradio/state.py +27 -0
- lmmvibes/vis_gradio/utils.py +1673 -0
.gitattributes
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.jsonl filter=lfs diff=lfs merge=lfs -text
|
37 |
+
*.json filter=lfs diff=lfs merge=lfs -text
|
38 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
39 |
+
results/**/plots/*.png -filter -merge -diff -text
|
.gitignore
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__pycache__/
|
2 |
+
|
3 |
+
# Ignore generated plot images
|
4 |
+
results/**/plots/*.png
|
5 |
+
|
6 |
+
# Ignore large results directories (data now tracked with LFS)
|
7 |
+
results/**
|
data/aci_bench/cluster_scores.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c9800cfb95cb3992d39649d61d01d326f7cd57fdc1e6253cd7a21b83be007762
|
3 |
+
size 35290231
|
data/aci_bench/cluster_scores_df.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:da1c901319ffa8aa23f4e53cfd7bf8f81bf1013c30369e589adb3383136a88cb
|
3 |
+
size 33773423
|
data/aci_bench/clustered_results_lightweight.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:194429736798d0857962dd4b719c23608ae29606137ecd5d0fd979cacb1deb4a
|
3 |
+
size 92743484
|
data/aci_bench/clusters.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1a00c7a0b16723d80fd3490ef658c913b1384f8eb68c8a549e8b50251c4bdf60
|
3 |
+
size 447437
|
data/aci_bench/model_cluster_scores.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4af5a4765a37b003e115b808a09ec4e95ebce3e302854957893f9b563b3cdc1e
|
3 |
+
size 35639398
|
data/aci_bench/model_cluster_scores_df.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cabadb1369aae14d6dbe08dbc4dee6d701891fe9426fbe52588bbc477a1b5995
|
3 |
+
size 33839755
|
data/aci_bench/model_scores.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:93968fdf5604d473d031a4731127603eb3a6f27eba041e7564e52df85dc987f5
|
3 |
+
size 35279538
|
data/aci_bench/model_scores_df.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2d2c024085528e9afeda447a975da35099b9f323a57db7e6695e444f6021dd13
|
3 |
+
size 33766092
|
data/aci_bench/model_stats.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4e7b3e1831735691cb43135355719f8d822deda3b64af9baeb02eb403cfb1546
|
3 |
+
size 127543
|
data/aci_bench/parsed_properties.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:db2a42c37fd60ecd830569cb776973e16da4acbd4ff9581d8a064239e702e66d
|
3 |
+
size 2441177
|
data/aci_bench/parsing_error_summary.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2915c2fa4df41abe202b65cb7f84c1824fd64bad5a993d88c9349e25352b47ff
|
3 |
+
size 27
|
data/aci_bench/parsing_failures.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1b10b336d6d58227d03a5f83fa8e0cbbefaadeb73a497363b67e68e3a01cf742
|
3 |
+
size 3665
|
data/aci_bench/parsing_stats.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:00197f1b62199cf7d8265acb34f073f0938c694b7230827a67086cd901c3f32e
|
3 |
+
size 219
|
data/aci_bench/summary.txt
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
LMM-Vibes Results Summary
|
2 |
+
==================================================
|
3 |
+
|
4 |
+
Total conversations: 720
|
5 |
+
Total properties: 4146
|
6 |
+
Models analyzed: 1
|
7 |
+
|
8 |
+
Output files:
|
9 |
+
- raw_properties.jsonl: Raw LLM responses
|
10 |
+
- extraction_stats.json: Extraction statistics
|
11 |
+
- extraction_samples.jsonl: Sample inputs/outputs
|
12 |
+
- parsed_properties.jsonl: Parsed property objects
|
13 |
+
- parsing_stats.json: Parsing statistics
|
14 |
+
- parsing_failures.jsonl: Failed parsing attempts
|
15 |
+
- validated_properties.jsonl: Validated properties
|
16 |
+
- validation_stats.json: Validation statistics
|
17 |
+
- clustered_results.jsonl: Complete clustered data
|
18 |
+
- embeddings.parquet: Embeddings data
|
19 |
+
- clustered_results_lightweight.jsonl: Data without embeddings
|
20 |
+
- summary_table.jsonl: Clustering summary
|
21 |
+
- model_cluster_scores.json: Per model-cluster combination metrics
|
22 |
+
- cluster_scores.json: Per cluster metrics (aggregated across models)
|
23 |
+
- model_scores.json: Per model metrics (aggregated across clusters)
|
24 |
+
- full_dataset.json: Complete PropertyDataset (JSON format)
|
25 |
+
- full_dataset.parquet: Complete PropertyDataset (parquet format, or .jsonl if mixed data types)
|
26 |
+
|
27 |
+
Model Rankings (by average quality score):
|
28 |
+
1. openai/gpt-4o: 0.833
|
29 |
+
2. google/gemini-1.5-pro-001: 0.828
|
30 |
+
3. openai/gpt-4o-mini: 0.828
|
31 |
+
4. meta/llama-3.3-70b-instruct: 0.827
|
32 |
+
5. qwen/qwen2.5-7b-instruct: 0.818
|
33 |
+
6. microsoft/phi-3.5-mini-instruct: 0.806
|
data/aci_bench/summary_table.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dca91c6976f8751e65d262c12e42451e9880386ae51fe93a62e53e355ac9ba9f
|
3 |
+
size 58069
|
data/aci_bench/summary_table.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:098126fa13c7dd247263c87866cbacbcd583229470a34411022d5af130967d52
|
3 |
+
size 56818
|
data/aci_bench/validated_properties.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:db2a42c37fd60ecd830569cb776973e16da4acbd4ff9581d8a064239e702e66d
|
3 |
+
size 2441177
|
data/aci_bench/validation_stats.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0ba5d5c25ab20c2a8bfa51202ebc7a4c59af49af68fbe385ac0aca9c2960c4ce
|
3 |
+
size 137
|
lmmvibes/__init__.py
ADDED
File without changes
|
lmmvibes/metrics/plotting.py
ADDED
@@ -0,0 +1,616 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Plotting functionality for functional metrics.
|
3 |
+
|
4 |
+
This module provides comprehensive visualization of metrics from functional_metrics.py,
|
5 |
+
"""
|
6 |
+
|
7 |
+
import json
|
8 |
+
import pandas as pd
|
9 |
+
import numpy as np
|
10 |
+
from pathlib import Path
|
11 |
+
from typing import Dict, Any, List, Optional
|
12 |
+
import warnings
|
13 |
+
|
14 |
+
import plotly.graph_objects as go
|
15 |
+
import plotly.express as px
|
16 |
+
from plotly.subplots import make_subplots
|
17 |
+
import plotly.io as pio
|
18 |
+
|
19 |
+
# Set plotly template
|
20 |
+
pio.templates.default = "plotly_white"
|
21 |
+
warnings.filterwarnings('ignore')
|
22 |
+
|
23 |
+
|
24 |
+
def create_model_cluster_dataframe(model_cluster_scores: Dict[str, Any]) -> pd.DataFrame:
    """Convert model-cluster scores to a tidy dataframe.

    Each (model, cluster) pair becomes one row; the "No properties"
    placeholder cluster is skipped. Optional confidence intervals and
    per-metric quality values are flattened into prefixed scalar columns
    (e.g. ``proportion_ci_lower``, ``quality_<name>``, ``quality_delta_<name>``).
    """
    records: List[Dict[str, Any]] = []
    for model_name, cluster_map in model_cluster_scores.items():
        for cluster_name, stats in cluster_map.items():
            # Skip the placeholder bucket for conversations without properties.
            if cluster_name == "No properties":
                continue

            record: Dict[str, Any] = {
                'model': model_name,
                'cluster': cluster_name,
                'size': stats.get('size', 0),
                'proportion': stats.get('proportion', 0),
                'proportion_delta': stats.get('proportion_delta', 0),
            }

            # Flatten the proportion CIs when supplied (insertion order matters
            # for the resulting DataFrame column order).
            for ci_key in ('proportion_ci', 'proportion_delta_ci'):
                if ci_key in stats:
                    bounds = stats[ci_key]
                    record[f'{ci_key}_lower'] = bounds.get('lower', 0)
                    record[f'{ci_key}_upper'] = bounds.get('upper', 0)
                    record[f'{ci_key}_mean'] = bounds.get('mean', 0)

            # Significance flag for the proportion delta.
            record['proportion_delta_significant'] = stats.get('proportion_delta_significant', False)

            # Per-metric quality values, deltas, significance flags and CIs.
            quality = stats.get('quality', {})
            quality_delta = stats.get('quality_delta', {})
            quality_ci = stats.get('quality_ci', {})
            quality_delta_ci = stats.get('quality_delta_ci', {})
            quality_sig = stats.get('quality_delta_significant', {})

            for name, value in quality.items():
                record[f'quality_{name}'] = value
                record[f'quality_delta_{name}'] = quality_delta.get(name, 0)
                record[f'quality_delta_{name}_significant'] = quality_sig.get(name, False)

                if name in quality_ci:
                    bounds = quality_ci[name]
                    record[f'quality_{name}_ci_lower'] = bounds.get('lower', 0)
                    record[f'quality_{name}_ci_upper'] = bounds.get('upper', 0)
                    record[f'quality_{name}_ci_mean'] = bounds.get('mean', 0)

                if name in quality_delta_ci:
                    bounds = quality_delta_ci[name]
                    record[f'quality_delta_{name}_ci_lower'] = bounds.get('lower', 0)
                    record[f'quality_delta_{name}_ci_upper'] = bounds.get('upper', 0)
                    record[f'quality_delta_{name}_ci_mean'] = bounds.get('mean', 0)

            records.append(record)

    return pd.DataFrame(records)
92 |
+
|
93 |
+
|
94 |
+
def create_cluster_dataframe(cluster_scores: Dict[str, Any]) -> pd.DataFrame:
    """Convert cluster scores to a tidy dataframe.

    One row per cluster (the "No properties" placeholder is skipped).
    Optional confidence intervals and per-metric quality values are
    flattened into prefixed scalar columns.
    """
    records: List[Dict[str, Any]] = []
    for cluster_name, stats in cluster_scores.items():
        # Skip the placeholder bucket for conversations without properties.
        if cluster_name == "No properties":
            continue

        record: Dict[str, Any] = {
            'cluster': cluster_name,
            'size': stats.get('size', 0),
            'proportion': stats.get('proportion', 0),
        }

        # Flatten the proportion confidence interval when supplied.
        if 'proportion_ci' in stats:
            bounds = stats['proportion_ci']
            record['proportion_ci_lower'] = bounds.get('lower', 0)
            record['proportion_ci_upper'] = bounds.get('upper', 0)
            record['proportion_ci_mean'] = bounds.get('mean', 0)

        # Per-metric quality values, deltas, significance flags and CIs.
        quality = stats.get('quality', {})
        quality_delta = stats.get('quality_delta', {})
        quality_ci = stats.get('quality_ci', {})
        quality_delta_ci = stats.get('quality_delta_ci', {})
        quality_sig = stats.get('quality_delta_significant', {})

        for name, value in quality.items():
            record[f'quality_{name}'] = value
            record[f'quality_delta_{name}'] = quality_delta.get(name, 0)
            record[f'quality_delta_{name}_significant'] = quality_sig.get(name, False)

            if name in quality_ci:
                bounds = quality_ci[name]
                record[f'quality_{name}_ci_lower'] = bounds.get('lower', 0)
                record[f'quality_{name}_ci_upper'] = bounds.get('upper', 0)
                record[f'quality_{name}_ci_mean'] = bounds.get('mean', 0)

            if name in quality_delta_ci:
                bounds = quality_delta_ci[name]
                record[f'quality_delta_{name}_ci_lower'] = bounds.get('lower', 0)
                record[f'quality_delta_{name}_ci_upper'] = bounds.get('upper', 0)
                record[f'quality_delta_{name}_ci_mean'] = bounds.get('mean', 0)

        records.append(record)

    return pd.DataFrame(records)
148 |
+
|
149 |
+
|
150 |
+
def create_model_dataframe(model_scores: Dict[str, Any]) -> pd.DataFrame:
    """Convert model scores to a tidy dataframe.

    One row per model. Optional confidence intervals and per-metric
    quality values are flattened into prefixed scalar columns.
    """
    records: List[Dict[str, Any]] = []
    for model_name, stats in model_scores.items():
        record: Dict[str, Any] = {
            'model': model_name,
            'size': stats.get('size', 0),
            'proportion': stats.get('proportion', 0),
        }

        # Flatten the proportion confidence interval when supplied.
        if 'proportion_ci' in stats:
            bounds = stats['proportion_ci']
            record['proportion_ci_lower'] = bounds.get('lower', 0)
            record['proportion_ci_upper'] = bounds.get('upper', 0)
            record['proportion_ci_mean'] = bounds.get('mean', 0)

        # Per-metric quality values, deltas, significance flags and CIs.
        quality = stats.get('quality', {})
        quality_delta = stats.get('quality_delta', {})
        quality_ci = stats.get('quality_ci', {})
        quality_delta_ci = stats.get('quality_delta_ci', {})
        quality_sig = stats.get('quality_delta_significant', {})

        for name, value in quality.items():
            record[f'quality_{name}'] = value
            record[f'quality_delta_{name}'] = quality_delta.get(name, 0)
            record[f'quality_delta_{name}_significant'] = quality_sig.get(name, False)

            if name in quality_ci:
                bounds = quality_ci[name]
                record[f'quality_{name}_ci_lower'] = bounds.get('lower', 0)
                record[f'quality_{name}_ci_upper'] = bounds.get('upper', 0)
                record[f'quality_{name}_ci_mean'] = bounds.get('mean', 0)

            if name in quality_delta_ci:
                bounds = quality_delta_ci[name]
                record[f'quality_delta_{name}_ci_lower'] = bounds.get('lower', 0)
                record[f'quality_delta_{name}_ci_upper'] = bounds.get('upper', 0)
                record[f'quality_delta_{name}_ci_mean'] = bounds.get('mean', 0)

        records.append(record)

    return pd.DataFrame(records)
200 |
+
|
201 |
+
|
202 |
+
def get_quality_metrics(df: pd.DataFrame) -> List[str]:
    """Extract quality metric names from dataframe columns.

    Returns the suffix of every ``quality_*`` column that is not a CI
    bound/mean or a significance flag.

    NOTE(review): ``quality_delta_*`` columns also match the prefix, so
    their suffixes (e.g. ``delta_accuracy``) appear in the result as
    well — confirm callers expect this.
    """
    excluded_suffixes = ('_ci_lower', '_ci_upper', '_ci_mean', '_significant')
    names: List[str] = []
    for col in df.columns:
        if not col.startswith('quality_'):
            continue
        if col.endswith(excluded_suffixes):
            continue
        names.append(col.replace('quality_', ''))
    return names
206 |
+
|
207 |
+
|
208 |
+
def create_interactive_cluster_plot(cluster_df: pd.DataFrame, model_cluster_df: pd.DataFrame,
                                    metric_col: str, title: str,
                                    ci_lower_col: Optional[str] = None, ci_upper_col: Optional[str] = None,
                                    significant_col: Optional[str] = None) -> go.Figure:
    """Create an interactive cluster plot with dropdown for view mode.

    Builds one aggregated bar trace (visible by default) from
    ``cluster_df`` plus one hidden trace per model from
    ``model_cluster_df``; a dropdown toggles between the two views by
    flipping trace visibility.

    Args:
        cluster_df: Per-cluster metrics, one row per cluster (or indexed
            by cluster).
        model_cluster_df: Per (model, cluster) metrics; must have
            'model' and 'cluster' columns.
        metric_col: Column plotted on the y-axis.
        title: Figure/subplot title.
        ci_lower_col: Optional CI lower-bound column; error bars are drawn
            only when both CI columns are present.
        ci_upper_col: Optional CI upper-bound column.
        significant_col: Optional boolean column; True clusters get a red
            "*" marker on the aggregated view.

    Returns:
        A plotly Figure.
    """

    # Create the figure with subplots
    fig = make_subplots(
        rows=1, cols=1,
        specs=[[{"secondary_y": False}]],
        subplot_titles=[title]
    )

    # Prepare cluster_df - reset index if cluster is the index
    if 'cluster' not in cluster_df.columns and cluster_df.index.name == 'cluster':
        cluster_df = cluster_df.reset_index()

    # Sort clusters by metric value in descending order for consistent ordering
    cluster_df = cluster_df.sort_values(metric_col, ascending=False)

    # Add aggregated view (default) - using cluster_df
    # Error bars are asymmetric: distance from the value to each CI bound.
    if ci_lower_col and ci_upper_col and ci_lower_col in cluster_df.columns and ci_upper_col in cluster_df.columns:
        fig.add_trace(
            go.Bar(
                x=cluster_df['cluster'],
                y=cluster_df[metric_col],
                name='Aggregated (All Models)',
                error_y=dict(
                    type='data',
                    array=cluster_df[ci_upper_col] - cluster_df[metric_col],
                    arrayminus=cluster_df[metric_col] - cluster_df[ci_lower_col],
                    visible=True
                ),
                visible=True
            )
        )
    else:
        fig.add_trace(
            go.Bar(
                x=cluster_df['cluster'],
                y=cluster_df[metric_col],
                name='Aggregated (All Models)',
                visible=True
            )
        )

    # Grouped by model view - using model_cluster_df
    # One hidden trace per model; the dropdown below toggles them on.
    for model in model_cluster_df['model'].unique():
        model_df = model_cluster_df[model_cluster_df['model'] == model]
        # Sort model_df to match the cluster order
        # (missing clusters for this model become NaN rows after reindex).
        model_df = model_df.set_index('cluster').reindex(cluster_df['cluster']).reset_index()
        if ci_lower_col and ci_upper_col and ci_lower_col in model_cluster_df.columns and ci_upper_col in model_cluster_df.columns:
            fig.add_trace(
                go.Bar(
                    x=model_df['cluster'],
                    y=model_df[metric_col],
                    name=f'Model: {model}',
                    error_y=dict(
                        type='data',
                        array=model_df[ci_upper_col] - model_df[metric_col],
                        arrayminus=model_df[metric_col] - model_df[ci_lower_col],
                        visible=False
                    ),
                    visible=False
                )
            )
        else:
            fig.add_trace(
                go.Bar(
                    x=model_df['cluster'],
                    y=model_df[metric_col],
                    name=f'Model: {model}',
                    visible=False
                )
            )

    # Add significance markers if available (for aggregated view)
    # Red asterisks (*) indicate clusters with statistically significant quality delta values
    # (confidence intervals that do not contain 0)
    if significant_col and significant_col in cluster_df.columns:
        for i, (cluster, is_sig) in enumerate(zip(cluster_df['cluster'], cluster_df[significant_col])):
            if is_sig:
                fig.add_annotation(
                    x=cluster,
                    y=cluster_df[cluster_df['cluster'] == cluster][metric_col].iloc[0],
                    text="*",
                    showarrow=False,
                    font=dict(size=16, color="red"),
                    yshift=10
                )

    # Update layout
    # The footnote annotation explaining "*" is added only when the
    # significance column exists.
    fig.update_layout(
        title=title,
        xaxis_title="Cluster",
        yaxis_title=metric_col.replace('_', ' ').title(),
        barmode='group',
        height=500,
        showlegend=True,
        annotations=[
            dict(
                text="* = Statistically significant (CI does not contain 0)",
                showarrow=False,
                xref="paper", yref="paper",
                x=0.01, y=0.01,
                xanchor="left", yanchor="bottom",
                font=dict(size=10, color="red")
            )
        ] if significant_col and significant_col in cluster_df.columns else []
    )

    # Add dropdown for view selection - only 2 options
    # Visibility masks are ordered: [aggregated trace, then one per model].
    buttons = []

    # Aggregated view button (all models combined)
    visibility = [True] + [False] * len(model_cluster_df['model'].unique())
    buttons.append(
        dict(
            label="Aggregated (All Models)",
            method="update",
            args=[{"visible": visibility, "barmode": "group"}]
        )
    )

    # Grouped by model view (each model as separate bars)
    visibility = [False] + [True] * len(model_cluster_df['model'].unique())
    buttons.append(
        dict(
            label="Grouped by Model",
            method="update",
            args=[{"visible": visibility, "barmode": "group"}]
        )
    )

    fig.update_layout(
        updatemenus=[
            dict(
                buttons=buttons,
                direction="down",
                showactive=True,
                x=0.95,
                xanchor="right",
                y=1.25,
                yanchor="top"
            )
        ]
    )

    return fig
357 |
+
|
358 |
+
|
359 |
+
def create_interactive_heatmap(df: pd.DataFrame, value_col: str, title: str,
                               pivot_index: str = 'model', pivot_columns: str = 'cluster',
                               significant_col: Optional[str] = None) -> go.Figure:
    """Create an interactive heatmap with hover information.

    Args:
        df: Tidy dataframe with one row per (pivot_index, pivot_columns)
            pair. ``DataFrame.pivot`` raises on duplicate pairs.
        value_col: Column holding the cell values. Columns containing
            'delta' get a diverging colorscale centred at 0.
        title: Figure title.
        pivot_index: Column used as the pivot index; when 'model', rows
            are sorted by their mean value across clusters, otherwise
            columns are sorted by their mean across models.
        pivot_columns: Column used as the pivot columns.
        significant_col: Optional boolean column; True cells get a red
            "*" marker.

    Returns:
        A plotly Figure with models on the x-axis and clusters on the
        y-axis (the pivot is transposed for display).
    """
    # Create pivot table
    pivot_df = df.pivot(index=pivot_index, columns=pivot_columns, values=value_col)

    # Sort by mean values for consistent ordering; keep the order around
    # so the significance pivot below can be aligned identically.
    if pivot_index == 'model':
        # Sort models by their mean values across clusters
        sort_order = pivot_df.mean(axis=1).sort_values(ascending=False).index
        pivot_df = pivot_df.reindex(sort_order)
    else:
        # Sort clusters by their mean values across models
        sort_order = pivot_df.mean(axis=0).sort_values(ascending=False).index
        pivot_df = pivot_df.reindex(columns=sort_order)

    # Transpose the data for more intuitive visualization (models on x-axis, clusters on y-axis)
    pivot_df = pivot_df.T

    # Create heatmap
    fig = go.Figure(data=go.Heatmap(
        z=pivot_df.values,
        x=pivot_df.columns,  # Models
        y=pivot_df.index,  # Clusters
        colorscale='RdBu_r' if 'delta' in value_col else 'Viridis',
        zmid=0 if 'delta' in value_col else None,
        text=pivot_df.values.round(3),
        texttemplate="%{text}",
        textfont={"size": 10},
        hoverongaps=False
    ))

    # Add significance markers if available
    if significant_col and significant_col in df.columns:
        sig_pivot = df.pivot(index=pivot_index, columns=pivot_columns, values=significant_col)
        # Apply same sorting as the main pivot
        if pivot_index == 'model':
            sig_pivot = sig_pivot.reindex(sort_order)
        else:
            sig_pivot = sig_pivot.reindex(columns=sort_order)
        sig_pivot = sig_pivot.T  # Transpose to match the main heatmap
        for cluster in pivot_df.index:
            for model in pivot_df.columns:
                flag = sig_pivot.loc[cluster, model]
                # Bug fix: a missing (model, cluster) combination pivots to
                # NaN, and bool(NaN) is True — the old truthiness test marked
                # absent cells as significant. Guard with notna() first.
                if pd.notna(flag) and flag:
                    fig.add_annotation(
                        x=model,
                        y=cluster,
                        text="*",
                        showarrow=False,
                        font=dict(size=16, color="red"),
                        xshift=10,
                        yshift=10
                    )

    # Footnote explaining "*" is only shown when significance is plotted.
    fig.update_layout(
        title=title,
        xaxis_title="Model",
        yaxis_title="Cluster",
        height=500,
        annotations=[
            dict(
                text="* = Statistically significant (CI does not contain 0)",
                showarrow=False,
                xref="paper", yref="paper",
                x=0.01, y=0.01,
                xanchor="left", yanchor="bottom",
                font=dict(size=10, color="red")
            )
        ] if significant_col and significant_col in df.columns else []
    )

    return fig
433 |
+
|
434 |
+
|
435 |
+
def create_interactive_model_plot(model_df: pd.DataFrame, model_cluster_df: pd.DataFrame,
                                  metric_col: str, title: str,
                                  ci_lower_col: Optional[str] = None, ci_upper_col: Optional[str] = None,
                                  significant_col: Optional[str] = None) -> go.Figure:
    """Create an interactive model plot with dropdown for view mode.

    Mirror of ``create_interactive_cluster_plot`` with the roles swapped:
    one aggregated bar trace over all clusters (visible by default) from
    ``model_df`` plus one hidden trace per cluster from
    ``model_cluster_df``; a dropdown toggles between the two views.

    Args:
        model_df: Per-model metrics, one row per model (or indexed by model).
        model_cluster_df: Per (model, cluster) metrics; must have 'model'
            and 'cluster' columns.
        metric_col: Column plotted on the y-axis.
        title: Figure/subplot title.
        ci_lower_col: Optional CI lower-bound column; error bars are drawn
            only when both CI columns are present.
        ci_upper_col: Optional CI upper-bound column.
        significant_col: Optional boolean column; True models get a red
            "*" marker on the aggregated view.

    Returns:
        A plotly Figure.

    NOTE(review): unlike the cluster variant, this layout omits the
    "* = Statistically significant" footnote annotation — confirm whether
    that asymmetry is intentional.
    """

    # Create the figure with subplots
    fig = make_subplots(
        rows=1, cols=1,
        specs=[[{"secondary_y": False}]],
        subplot_titles=[title]
    )

    # Prepare model_df - reset index if model is the index
    if 'model' not in model_df.columns and model_df.index.name == 'model':
        model_df = model_df.reset_index()

    # Add aggregated view (default) - using model_df
    # Error bars are asymmetric: distance from the value to each CI bound.
    if ci_lower_col and ci_upper_col and ci_lower_col in model_df.columns and ci_upper_col in model_df.columns:
        fig.add_trace(
            go.Bar(
                x=model_df['model'],
                y=model_df[metric_col],
                name='Aggregated (All Clusters)',
                error_y=dict(
                    type='data',
                    array=model_df[ci_upper_col] - model_df[metric_col],
                    arrayminus=model_df[metric_col] - model_df[ci_lower_col],
                    visible=True
                ),
                visible=True
            )
        )
    else:
        fig.add_trace(
            go.Bar(
                x=model_df['model'],
                y=model_df[metric_col],
                name='Aggregated (All Clusters)',
                visible=True
            )
        )

    # Grouped by cluster view - using model_cluster_df
    # One hidden trace per cluster; the dropdown below toggles them on.
    # (The loop variable shadows the outer name 'cluster_df' used in the
    # sibling cluster-plot function — local to this loop only.)
    for cluster in model_cluster_df['cluster'].unique():
        cluster_df = model_cluster_df[model_cluster_df['cluster'] == cluster]
        if ci_lower_col and ci_upper_col and ci_lower_col in cluster_df.columns and ci_upper_col in cluster_df.columns:
            fig.add_trace(
                go.Bar(
                    x=cluster_df['model'],
                    y=cluster_df[metric_col],
                    name=f'Cluster: {cluster}',
                    error_y=dict(
                        type='data',
                        array=cluster_df[ci_upper_col] - cluster_df[metric_col],
                        arrayminus=cluster_df[metric_col] - cluster_df[ci_lower_col],
                        visible=False
                    ),
                    visible=False
                )
            )
        else:
            fig.add_trace(
                go.Bar(
                    x=cluster_df['model'],
                    y=cluster_df[metric_col],
                    name=f'Cluster: {cluster}',
                    visible=False
                )
            )

    # Add significance markers if available (for aggregated view)
    if significant_col and significant_col in model_df.columns:
        for i, (model, is_sig) in enumerate(zip(model_df['model'], model_df[significant_col])):
            if is_sig:
                fig.add_annotation(
                    x=model,
                    y=model_df[model_df['model'] == model][metric_col].iloc[0],
                    text="*",
                    showarrow=False,
                    font=dict(size=16, color="red"),
                    yshift=10
                )

    # Update layout
    fig.update_layout(
        title=title,
        xaxis_title="Model",
        yaxis_title=metric_col.replace('_', ' ').title(),
        barmode='group',
        height=500,
        showlegend=True
    )

    # Add dropdown for view selection - only 2 options
    # Visibility masks are ordered: [aggregated trace, then one per cluster].
    buttons = []

    # Aggregated view button (all clusters combined)
    visibility = [True] + [False] * len(model_cluster_df['cluster'].unique())
    buttons.append(
        dict(
            label="Aggregated (All Clusters)",
            method="update",
            args=[{"visible": visibility, "barmode": "group"}]
        )
    )

    # Grouped by cluster view (each cluster as separate bars)
    visibility = [False] + [True] * len(model_cluster_df['cluster'].unique())
    buttons.append(
        dict(
            label="Grouped by Cluster",
            method="update",
            args=[{"visible": visibility, "barmode": "group"}]
        )
    )

    fig.update_layout(
        updatemenus=[
            dict(
                buttons=buttons,
                direction="down",
                showactive=True,
                x=0.95,
                xanchor="right",
                y=1.25,
                yanchor="top"
            )
        ]
    )

    return fig
567 |
+
|
568 |
+
|
569 |
+
def create_interactive_model_cluster_plot(df: pd.DataFrame, metric_col: str, title: str,
                                         ci_lower_col: Optional[str] = None, ci_upper_col: Optional[str] = None,
                                         significant_col: Optional[str] = None) -> go.Figure:
    """Create an interactive model-cluster plot with grouped bars.

    Args:
        df: Long-format frame with one row per (model, cluster) pair.
            Assumes columns 'cluster' and 'model' exist — TODO confirm against callers.
        metric_col: Name of the numeric column to plot on the y-axis.
        ci_lower_col: Optional column holding the lower confidence bound.
        ci_upper_col: Optional column holding the upper confidence bound.
        significant_col: Optional boolean column; rows where it is truthy get a
            red "*" annotation above their bar.

    Returns:
        A Plotly figure with clusters on the x-axis and one bar per model.
    """

    # Create grouped bar chart. Error bars are only drawn when BOTH CI columns
    # were requested and actually exist in the frame.
    if ci_lower_col and ci_upper_col and ci_lower_col in df.columns and ci_upper_col in df.columns:
        fig = px.bar(
            df,
            x='cluster',
            y=metric_col,
            color='model',
            # Plotly expects asymmetric error bars as offsets from y, not as
            # absolute bounds, hence the subtraction in both directions.
            error_y=df[ci_upper_col] - df[metric_col],
            error_y_minus=df[metric_col] - df[ci_lower_col],
            title=title,
            barmode='group'
        )
    else:
        fig = px.bar(
            df,
            x='cluster',
            y=metric_col,
            color='model',
            title=title,
            barmode='group'
        )

    # Add significance markers if available: one "*" per significant row,
    # positioned slightly above the bar top (yshift=10 px).
    if significant_col and significant_col in df.columns:
        for i, row in df.iterrows():
            if row[significant_col]:
                fig.add_annotation(
                    x=row['cluster'],
                    y=row[metric_col],
                    text="*",
                    showarrow=False,
                    font=dict(size=16, color="red"),
                    yshift=10
                )

    # Human-readable axis label derived from the metric column name
    # (e.g. "quality_score" -> "Quality Score").
    fig.update_layout(
        height=500,
        xaxis_title="Cluster",
        yaxis_title=metric_col.replace('_', ' ').title()
    )

    return fig
|
616 |
+
|
lmmvibes/utils/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
"""Utilities package for LMM-Vibes."""
|
lmmvibes/utils/persistent_storage.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Utilities for persistent storage in Hugging Face Spaces.
|
3 |
+
"""
|
4 |
+
import os
|
5 |
+
from pathlib import Path
|
6 |
+
from typing import Optional
|
7 |
+
|
8 |
+
|
9 |
+
def get_persistent_data_dir() -> Optional[Path]:
    """Return the persistent data directory if one is available.

    On Hugging Face Spaces a persistent volume is mounted at ``/data``;
    when present, an ``app_data`` subdirectory is created (if needed)
    and returned.

    Returns:
        Path to the persistent storage directory, or ``None`` when the
        ``/data`` volume is not mounted.
    """
    if not os.path.isdir("/data"):
        return None
    app_data = Path("/data/app_data")
    app_data.mkdir(exist_ok=True)
    return app_data
|
20 |
+
|
21 |
+
|
22 |
+
def get_cache_dir() -> Path:
    """Get the appropriate cache directory (persistent if available, temp otherwise).

    Prefers the Hugging Face Spaces persistent volume at ``/data``; falls
    back to a directory under the system temp dir when it is not mounted.
    In both cases the directory is created if it does not exist, so the
    returned path is always usable.

    Returns:
        Path to an existing cache directory.
    """
    if os.path.isdir("/data"):
        cache_dir = Path("/data/.cache")
        cache_dir.mkdir(exist_ok=True)
        return cache_dir
    else:
        # Fallback to temp directory.
        import tempfile
        cache_dir = Path(tempfile.gettempdir()) / "app_cache"
        # Bug fix: previously this path was returned without being created,
        # unlike the persistent branch above, so callers could receive a
        # non-existent directory. Create it for consistency.
        cache_dir.mkdir(parents=True, exist_ok=True)
        return cache_dir
|
36 |
+
|
37 |
+
|
38 |
+
def save_uploaded_file(uploaded_file, filename: str) -> Optional[Path]:
    """Copy an uploaded file into persistent storage.

    Args:
        uploaded_file: Gradio uploaded file object (path-like source).
        filename: Name to save the file as, relative to the storage root.

    Returns:
        Path to the saved file, or ``None`` when persistent storage is
        unavailable or no file was provided.
    """
    storage_root = get_persistent_data_dir()
    if not storage_root or not uploaded_file:
        return None

    destination = storage_root / filename
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Copy the uploaded file (metadata included) into persistent storage.
    import shutil
    shutil.copy2(uploaded_file, destination)
    return destination
|
58 |
+
|
59 |
+
|
60 |
+
def is_persistent_storage_available() -> bool:
    """Report whether the Spaces persistent volume is mounted.

    Returns:
        ``True`` when the ``/data`` directory exists, ``False`` otherwise.
    """
    return Path("/data").is_dir()
|
67 |
+
|
68 |
+
|
69 |
+
def get_persistent_results_dir() -> Optional[Path]:
    """Get the persistent results directory for storing pipeline results.

    Returns:
        Path to the ``results`` subdirectory of persistent storage
        (created on demand), or ``None`` when persistent storage is
        unavailable.
    """
    base = get_persistent_data_dir()
    if base is None:
        return None
    results = base / "results"
    results.mkdir(exist_ok=True)
    return results
|
lmmvibes/vis_gradio/__init__.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Gradio-based visualization for LMM-Vibes pipeline results.
|
2 |
+
|
3 |
+
This module provides a Gradio interface for exploring model performance,
|
4 |
+
cluster analysis, and detailed examples from pipeline output.
|
5 |
+
|
6 |
+
Usage:
|
7 |
+
from lmmvibes.vis_gradio import launch_app
|
8 |
+
launch_app(results_dir="path/to/results")
|
9 |
+
"""
|
10 |
+
|
11 |
+
from .app import launch_app, create_app
|
12 |
+
|
13 |
+
__all__ = ["launch_app", "create_app"]
|
lmmvibes/vis_gradio/app.py
ADDED
@@ -0,0 +1,697 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Main Gradio application for LMM-Vibes pipeline results visualization.
|
3 |
+
|
4 |
+
This module creates a comprehensive Gradio interface for exploring model performance,
|
5 |
+
cluster analysis, and detailed examples from pipeline output.
|
6 |
+
"""
|
7 |
+
|
8 |
+
import gradio as gr
|
9 |
+
import pandas as pd
|
10 |
+
import numpy as np
|
11 |
+
import plotly.graph_objects as go
|
12 |
+
from pathlib import Path
|
13 |
+
from typing import Dict, List, Any, Optional, Tuple
|
14 |
+
import os
|
15 |
+
|
16 |
+
from .data_loader import (
|
17 |
+
load_pipeline_results,
|
18 |
+
load_property_examples,
|
19 |
+
scan_for_result_subfolders,
|
20 |
+
validate_results_directory,
|
21 |
+
get_available_models
|
22 |
+
)
|
23 |
+
from .utils import (
|
24 |
+
compute_model_rankings,
|
25 |
+
create_model_summary_card,
|
26 |
+
format_cluster_dataframe,
|
27 |
+
create_frequency_comparison_table,
|
28 |
+
create_frequency_comparison_plots,
|
29 |
+
search_clusters_by_text,
|
30 |
+
get_top_clusters_for_model,
|
31 |
+
create_interactive_cluster_viewer,
|
32 |
+
get_cluster_statistics,
|
33 |
+
get_unique_values_for_dropdowns,
|
34 |
+
get_example_data,
|
35 |
+
format_examples_display,
|
36 |
+
get_total_clusters_count
|
37 |
+
)
|
38 |
+
|
39 |
+
# ---------------------------------------------------------------------------
|
40 |
+
# NEW: centralised state + logic split into per-tab modules
|
41 |
+
# ---------------------------------------------------------------------------
|
42 |
+
from .state import app_state, BASE_RESULTS_DIR
|
43 |
+
|
44 |
+
# Tab-specific logic (moved out of this file)
|
45 |
+
from .load_data_tab import (
|
46 |
+
load_data,
|
47 |
+
get_available_experiments,
|
48 |
+
get_experiment_choices,
|
49 |
+
refresh_experiment_dropdown,
|
50 |
+
load_experiment_data,
|
51 |
+
)
|
52 |
+
from .overview_tab import create_overview
|
53 |
+
from .clusters_tab import view_clusters_interactive, view_clusters_table
|
54 |
+
from .examples_tab import (
|
55 |
+
get_dropdown_choices,
|
56 |
+
update_example_dropdowns,
|
57 |
+
view_examples,
|
58 |
+
)
|
59 |
+
# Frequency and debug remain
|
60 |
+
from .frequency_tab import create_frequency_comparison, create_frequency_plots
|
61 |
+
from .debug_tab import debug_data_structure
|
62 |
+
from .plots_tab import create_plots_tab, create_plot_with_toggle, update_quality_metric_dropdown, update_quality_metric_visibility
|
63 |
+
|
64 |
+
# app_state and BASE_RESULTS_DIR now come from vis_gradio.state
|
65 |
+
|
66 |
+
|
67 |
+
def update_top_n_slider_maximum():
    """Rebuild the "Top N Clusters per Model" slider from the loaded data.

    Returns a default slider (max 10) when no metrics are loaded;
    otherwise the slider maximum tracks the total cluster count, with a
    floor of 10 so the control stays usable on small datasets.
    """
    from .state import app_state

    metrics = app_state.get("metrics")
    if not metrics:
        return gr.Slider(minimum=1, maximum=10, value=3, step=1)

    total_clusters = get_total_clusters_count(app_state["metrics"])
    upper_bound = max(10, total_clusters)  # at least 10, or total clusters if more

    return gr.Slider(
        label="Top N Clusters per Model",
        minimum=1,
        maximum=upper_bound,
        value=min(3, upper_bound),
        step=1,
        info=f"Number of top clusters to show per model (max: {total_clusters})"
    )
|
85 |
+
|
86 |
+
|
87 |
+
def create_app() -> gr.Blocks:
    """Create the main Gradio application.

    Builds the full Blocks UI: a sidebar for loading experiment data and
    selecting models, plus tabs for Overview, Cluster viewing, Examples,
    Functional Metrics tables, Plots, and Debug. All event handlers are
    wired here; the tab-specific logic lives in the per-tab modules
    imported at the top of this file.

    Returns:
        The assembled (but not yet launched) gr.Blocks application.
    """

    # Custom CSS for reduced margins and better sidebar layout
    custom_css = """
    .main-container {
        max-width: 100% !important;
        margin: 0 !important;
        padding: 0 !important;
    }
    .gradio-container {
        max-width: 100% !important;
        margin: 0 !important;
        padding: 0 10px !important;
    }
    .tabs {
        margin: 0 !important;
        padding: 0 !important;
    }
    .tab-nav {
        margin: 0 !important;
        padding: 0 !important;
    }
    .tab-content {
        margin: 0 !important;
        padding: 10px !important;
    }
    .sidebar {
        border-right: 1px solid #e0e0e0;
        background-color: #f8f9fa;
        padding: 15px !important;
    }
    .main-content {
        padding: 10px !important;
    }
    """

    with gr.Blocks(title="LMM-Vibes Pipeline Results Explorer", theme=gr.themes.Soft(), css=custom_css) as app:
        gr.Markdown("""
        **Comprehensive analysis of model behavioral properties and performance**

        Upload your pipeline results directory to explore model performance, cluster analysis, and detailed examples.
        """)

        with gr.Row():
            # Sidebar for data loading and model selection
            with gr.Column(scale=1, min_width=300, elem_classes=["sidebar"]):
                gr.Markdown("### Load Data")
                # Two modes: BASE_RESULTS_DIR set -> experiment dropdown;
                # otherwise -> free-text path entry.
                if BASE_RESULTS_DIR:
                    gr.Markdown(f"**Base Results Directory:** `{BASE_RESULTS_DIR}`")
                    gr.Markdown("**WARNING: this might take a while to load**")
                    gr.Markdown("Select an experiment from the dropdown below to load its results.")
                else:
                    gr.Markdown("Provide the path to your pipeline results directory containing either:")
                    gr.Markdown("β’ **Legacy format**: `model_stats.json` + `clustered_results.jsonl`")
                    gr.Markdown("β’ **Functional format**: `model_cluster_scores.json` + `cluster_scores.json` + `model_scores.json` + `clustered_results.jsonl`")
                    gr.Markdown("*The app will automatically detect which format you're using.*")

                if BASE_RESULTS_DIR:
                    experiment_dropdown = gr.Dropdown(
                        label="Select Experiment",
                        choices=get_experiment_choices(),
                        value="Select an experiment...",
                        info="Choose an experiment to load its results"
                    )
                else:
                    results_dir_input = gr.Textbox(
                        label="Results Directory Path",
                        placeholder="/path/to/your/results/directory",
                        info="Directory containing pipeline results (legacy or functional format)"
                    )

                load_btn = gr.Button("Load Data", variant="primary")

                data_status = gr.Markdown("")
                models_info = gr.Markdown("")

                # Model selection (will be updated after loading)
                selected_models = gr.CheckboxGroup(
                    label="Select Models for Analysis",
                    choices=[],
                    value=[],
                    info="Choose which models to include in comparisons"
                )

            # Main content area with reduced margins
            with gr.Column(scale=4, elem_classes=["main-content"]):
                with gr.Tabs():
                    # Tab 1: Overview
                    with gr.TabItem("π Overview"):
                        with gr.Row():
                            min_cluster_size = gr.Slider(
                                label="Minimum Cluster Size",
                                minimum=1, maximum=50, value=5, step=1,
                                info="Hide clusters with fewer than this many examples"
                            )
                            score_significant_only = gr.Checkbox(
                                label="Show Only Frequency Significant Clusters",
                                value=False,
                                info="Only show clusters where the distinctiveness score is statistically significant"
                            )
                            quality_significant_only = gr.Checkbox(
                                label="Show Only Quality Significant Clusters",
                                value=False,
                                info="Only show clusters where the quality score is statistically significant"
                            )

                        with gr.Row():
                            sort_by = gr.Dropdown(
                                label="Sort Clusters By",
                                choices=[
                                    ("Proportion Delta (Descending)", "salience_desc"),
                                    ("Proportion Delta (Ascending)", "salience_asc"),
                                    ("Quality (Ascending)", "quality_asc"),
                                    ("Quality (Descending)", "quality_desc"),
                                    ("Frequency (Descending)", "frequency_desc"),
                                    ("Frequency (Ascending)", "frequency_asc")
                                ],
                                value="quality_asc",
                                info="How to sort clusters within each model card"
                            )
                            top_n_overview = gr.Slider(
                                label="Top N Clusters per Model",
                                minimum=1, maximum=10, value=3, step=1,
                                info="Number of top clusters to show per model"
                            )

                        overview_display = gr.HTML(label="Model Overview")

                        refresh_overview_btn = gr.Button("Refresh Overview")

                    # Tab 2: View Clusters
                    with gr.TabItem("π View Clusters"):
                        gr.Markdown("### Interactive Cluster Viewer")
                        gr.Markdown("Explore clusters with detailed property descriptions. Click on clusters to expand and view all properties within each cluster.")

                        with gr.Row():
                            search_clusters = gr.Textbox(
                                label="Search Clusters",
                                placeholder="Search in cluster descriptions...",
                                info="Search for specific terms in cluster descriptions only"
                            )

                        clusters_display = gr.HTML(
                            label="Interactive Cluster Viewer",
                            value="<p style='color: #666; padding: 20px;'>Load data and select models to view clusters</p>"
                        )

                        refresh_clusters_btn = gr.Button("Refresh Clusters")

                    # Tab 3: View Examples
                    with gr.TabItem("π View Examples"):
                        # gr.Markdown("### Individual Example Viewer")
                        # gr.Markdown("Explore individual examples with full prompts, model responses, and property information. Click on examples to expand and view full details.")

                        with gr.Row():
                            search_examples = gr.Textbox(
                                label="Search Clusters",
                                placeholder="Search in cluster descriptions...",
                                info="Search for specific terms in cluster descriptions to filter examples"
                            )

                        with gr.Row():
                            with gr.Column(scale=1):
                                example_prompt_dropdown = gr.Dropdown(
                                    label="Select Prompt",
                                    choices=["All Prompts"],
                                    value="All Prompts",
                                    info="Choose a specific prompt or 'All Prompts'"
                                )
                            with gr.Column(scale=1):
                                example_model_dropdown = gr.Dropdown(
                                    label="Select Model",
                                    choices=["All Models"],
                                    value="All Models",
                                    info="Choose a specific model or 'All Models'"
                                )
                            with gr.Column(scale=1):
                                example_property_dropdown = gr.Dropdown(
                                    label="Select Cluster (Optional)",
                                    choices=["All Clusters"],
                                    value="All Clusters",
                                    info="Choose a specific cluster or 'All Clusters'"
                                )

                        with gr.Row():
                            max_examples_slider = gr.Slider(
                                label="Max Examples",
                                minimum=1, maximum=20, value=5, step=1,
                                info="Maximum number of examples to display"
                            )
                            use_accordion_checkbox = gr.Checkbox(
                                label="Use Accordion for System/Info Messages",
                                value=True,
                                info="Group system and info messages in collapsible sections"
                            )
                            pretty_print_checkbox = gr.Checkbox(
                                label="Pretty-print dictionaries",
                                value=True,
                                info="Format embedded dictionaries for readability"
                            )
                            show_unexpected_behavior_checkbox = gr.Checkbox(
                                label="Show Unexpected Behavior Only",
                                value=False,
                                info="Filter to show only examples with unexpected behavior"
                            )
                        view_examples_btn = gr.Button("View Examples", variant="primary")

                        examples_display = gr.HTML(
                            label="Examples",
                            value="<p style='color: #666; padding: 20px;'>Load data and select filters to view examples</p>"
                        )

                    # Tab 4: Frequency Comparison
                    with gr.TabItem("π Functional Metrics Tables"):
                        gr.Markdown("View the three tables created by the functional metrics pipeline:")
                        gr.Markdown("β’ **Model-Cluster Scores**: Per model-cluster combination metrics")
                        gr.Markdown("β’ **Cluster Scores**: Per cluster metrics (aggregated across all models)")
                        gr.Markdown("β’ **Model Scores**: Per model metrics (aggregated across all clusters)")

                        frequency_table_info = gr.Markdown("")

                        # Three separate tables for the functional metrics
                        gr.Markdown("### Model-Cluster Scores")
                        gr.Markdown("Per model-cluster combination metrics")
                        model_cluster_table = gr.Dataframe(
                            label="Model-Cluster Scores",
                            interactive=False,
                            wrap=True,
                            max_height=600,
                            elem_classes=["frequency-comparison-table"],
                            show_search="search",
                            pinned_columns=2
                        )

                        gr.Markdown("### Cluster Scores")
                        gr.Markdown("Per cluster metrics (aggregated across all models)")
                        cluster_table = gr.Dataframe(
                            label="Cluster Scores",
                            interactive=False,
                            wrap=True,
                            max_height=600,
                            elem_classes=["frequency-comparison-table"],
                            show_search="search",
                            pinned_columns=2
                        )

                        gr.Markdown("### Model Scores")
                        gr.Markdown("Per model metrics (aggregated across all clusters)")
                        model_table = gr.Dataframe(
                            label="Model Scores",
                            interactive=False,
                            wrap=True,
                            max_height=600,
                            elem_classes=["frequency-comparison-table"],
                            show_search="search"
                        )

                        # Plots section has been removed

                    # Remove all custom CSS styling - use Gradio defaults

                    # Tab 5: Plots
                    with gr.TabItem("π Plots"):
                        plot_display, plot_info, show_ci_checkbox, plot_type_dropdown, quality_metric_dropdown = create_plots_tab()

                    # (Search Examples tab removed)
                    # Tab 6: Debug Data
                    with gr.TabItem("π Debug Data"):
                        gr.Markdown("### Data Structure Debug")
                        gr.Markdown("If tables aren't loading correctly, use this tab to inspect your data structure and identify issues.")

                        debug_display = gr.HTML(
                            label="Debug Information",
                            value="<p style='color: #666; padding: 20px;'>Load data to see debug information</p>"
                        )

                        debug_btn = gr.Button("Show Debug Info", variant="secondary")

        # Event handlers.
        # Data-load flow: load data -> refresh example dropdowns -> render
        # examples -> resize the top-N slider -> rebuild metric tables ->
        # redraw the current plot -> refresh the quality-metric dropdown.
        if BASE_RESULTS_DIR:
            # Use dropdown for experiment selection
            if 'experiment_dropdown' in locals():
                (experiment_dropdown.change(
                    fn=load_experiment_data,
                    inputs=[experiment_dropdown],
                    outputs=[data_status, models_info, selected_models]
                ).then(
                    fn=update_example_dropdowns,
                    outputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown]
                ).then(
                    fn=view_examples,
                    inputs=[
                        example_prompt_dropdown,
                        example_model_dropdown,
                        example_property_dropdown,
                        max_examples_slider,
                        use_accordion_checkbox,
                        pretty_print_checkbox,
                        search_examples,
                        show_unexpected_behavior_checkbox,
                    ],
                    outputs=[examples_display]
                ).then(
                    fn=update_top_n_slider_maximum,
                    outputs=[top_n_overview]
                ).then(
                    fn=create_frequency_comparison,
                    inputs=[selected_models],
                    outputs=[model_cluster_table, cluster_table, model_table, frequency_table_info]
                ).then(
                    fn=create_plot_with_toggle,
                    inputs=[plot_type_dropdown, quality_metric_dropdown, show_ci_checkbox],
                    outputs=[plot_display, plot_info]
                ).then(
                    fn=update_quality_metric_dropdown,
                    outputs=[quality_metric_dropdown]
                ))
        else:
            # Use textbox for manual path entry (same chain as above, but
            # triggered by the Load button instead of the dropdown).
            if 'load_btn' in locals() and 'results_dir_input' in locals():
                (load_btn.click(
                    fn=load_data,
                    inputs=[results_dir_input],
                    outputs=[data_status, models_info, selected_models]
                ).then(
                    fn=update_example_dropdowns,
                    outputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown]
                ).then(
                    fn=view_examples,
                    inputs=[
                        example_prompt_dropdown,
                        example_model_dropdown,
                        example_property_dropdown,
                        max_examples_slider,
                        use_accordion_checkbox,
                        pretty_print_checkbox,
                        search_examples,
                        show_unexpected_behavior_checkbox,
                    ],
                    outputs=[examples_display]
                ).then(
                    fn=update_top_n_slider_maximum,
                    outputs=[top_n_overview]
                ).then(
                    fn=create_frequency_comparison,
                    inputs=[selected_models],
                    outputs=[model_cluster_table, cluster_table, model_table, frequency_table_info]
                ).then(
                    fn=create_plot_with_toggle,
                    inputs=[plot_type_dropdown, quality_metric_dropdown, show_ci_checkbox],
                    outputs=[plot_display, plot_info]
                ).then(
                    fn=update_quality_metric_dropdown,
                    outputs=[quality_metric_dropdown]
                ))

        refresh_overview_btn.click(
            fn=create_overview,
            inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size],
            outputs=[overview_display]
        )

        refresh_clusters_btn.click(
            fn=view_clusters_interactive,
            inputs=[selected_models, search_clusters],
            outputs=[clusters_display]
        )

        # View Examples handlers
        view_examples_btn.click(
            fn=view_examples,
            inputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown, max_examples_slider, use_accordion_checkbox, pretty_print_checkbox, search_examples, show_unexpected_behavior_checkbox],
            outputs=[examples_display]
        )

        # Auto-refresh examples when dropdowns change
        example_prompt_dropdown.change(
            fn=view_examples,
            inputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown, max_examples_slider, use_accordion_checkbox, pretty_print_checkbox, search_examples, show_unexpected_behavior_checkbox],
            outputs=[examples_display]
        )

        example_model_dropdown.change(
            fn=view_examples,
            inputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown, max_examples_slider, use_accordion_checkbox, pretty_print_checkbox, search_examples, show_unexpected_behavior_checkbox],
            outputs=[examples_display]
        )

        example_property_dropdown.change(
            fn=view_examples,
            inputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown, max_examples_slider, use_accordion_checkbox, pretty_print_checkbox, search_examples, show_unexpected_behavior_checkbox],
            outputs=[examples_display]
        )

        # Auto-refresh examples when search term changes
        search_examples.change(
            fn=view_examples,
            inputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown, max_examples_slider, use_accordion_checkbox, pretty_print_checkbox, search_examples, show_unexpected_behavior_checkbox],
            outputs=[examples_display]
        )

        # Auto-refresh examples when unexpected behavior checkbox changes
        show_unexpected_behavior_checkbox.change(
            fn=view_examples,
            inputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown, max_examples_slider, use_accordion_checkbox, pretty_print_checkbox, search_examples, show_unexpected_behavior_checkbox],
            outputs=[examples_display]
        )

        # Frequency Tab Handlers
        freq_inputs = [selected_models]
        freq_outputs = [model_cluster_table, cluster_table, model_table, frequency_table_info]

        selected_models.change(fn=create_frequency_comparison, inputs=freq_inputs, outputs=freq_outputs)

        # (Search Examples tab removed β no search_btn handler required)

        debug_btn.click(
            fn=debug_data_structure,
            outputs=[debug_display]
        )

        # Plots Tab Handlers
        show_ci_checkbox.change(
            fn=create_plot_with_toggle,
            inputs=[plot_type_dropdown, quality_metric_dropdown, show_ci_checkbox],
            outputs=[plot_display, plot_info]
        )

        # Quality metric dropdown handlers (only for quality plots)
        quality_metric_dropdown.change(
            fn=create_plot_with_toggle,
            inputs=[plot_type_dropdown, quality_metric_dropdown, show_ci_checkbox],
            outputs=[plot_display, plot_info]
        )

        # Update quality metric visibility and plot based on plot type
        plot_type_dropdown.change(
            fn=update_quality_metric_visibility,
            inputs=[plot_type_dropdown],
            outputs=[quality_metric_dropdown]
        ).then(
            fn=create_plot_with_toggle,
            inputs=[plot_type_dropdown, quality_metric_dropdown, show_ci_checkbox],
            outputs=[plot_display, plot_info]
        )

        # Auto-refresh on model selection change
        # NOTE(review): selected_models has multiple .change listeners
        # registered (frequency tables above, overview here, clusters below);
        # Gradio runs them all — confirm the combined latency is acceptable.
        selected_models.change(
            fn=create_overview,
            inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size],
            outputs=[overview_display]
        )

        # Auto-refresh on significance filter changes
        score_significant_only.change(
            fn=create_overview,
            inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size],
            outputs=[overview_display]
        )

        quality_significant_only.change(
            fn=create_overview,
            inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size],
            outputs=[overview_display]
        )

        # Auto-refresh on sort dropdown change
        sort_by.change(
            fn=create_overview,
            inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size],
            outputs=[overview_display]
        )

        # Auto-refresh on cluster level change
        # cluster_level.change(
        #     fn=create_overview,
        #     inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size],
        #     outputs=[overview_display]
        # )

        # Auto-refresh on top N change
        top_n_overview.change(
            fn=create_overview,
            inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size],
            outputs=[overview_display]
        )

        # Auto-refresh on minimum cluster size change
        min_cluster_size.change(
            fn=create_overview,
            inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size],
            outputs=[overview_display]
        )

        # NOTE(review): these two handlers pass THREE inputs (including
        # gr.State("fine")) to view_clusters_interactive, while the
        # refresh_clusters_btn handler above passes only TWO
        # ([selected_models, search_clusters]). One of the call sites is
        # almost certainly wrong — confirm view_clusters_interactive's
        # signature and make the input lists consistent.
        selected_models.change(
            fn=view_clusters_interactive,
            inputs=[selected_models, gr.State("fine"), search_clusters],
            outputs=[clusters_display]
        )

        # Auto-refresh clusters when search term changes (with debouncing)
        search_clusters.change(
            fn=view_clusters_interactive,
            inputs=[selected_models, gr.State("fine"), search_clusters],
            outputs=[clusters_display]
        )

    return app
|
596 |
+
|
597 |
+
|
598 |
+
def launch_app(results_dir: Optional[str] = None,
               share: bool = False,
               server_name: str = "127.0.0.1",
               server_port: int = 7860,
               **kwargs) -> None:
    """Launch the Gradio application.

    Args:
        results_dir: Optional path to base results directory containing experiment subfolders
        share: Whether to create a public link
        server_name: Server address
        server_port: Server port
        **kwargs: Additional arguments for gr.Blocks.launch()

    Raises:
        Exception: re-raised when neither the requested port nor any of the
            fallback ports (8080-8089) can be bound.
    """
    # Module-level override so helpers that resolve experiment paths see it.
    global BASE_RESULTS_DIR

    # Set the global base results directory
    if results_dir:
        BASE_RESULTS_DIR = results_dir
        print(f"π Base results directory set to: {results_dir}")

        # Check if it's a valid directory
        if not os.path.exists(results_dir):
            print(f"β οΈ Warning: Base results directory does not exist: {results_dir}")
            BASE_RESULTS_DIR = None
        else:
            # Scan for available experiments
            experiments = get_available_experiments(results_dir)
            print(f"π Found {len(experiments)} experiments: {experiments}")

    # Build the UI only after BASE_RESULTS_DIR is settled.
    app = create_app()

    # Auto-load data if results_dir is provided and contains a single experiment
    if results_dir and os.path.exists(results_dir):
        experiments = get_available_experiments(results_dir)
        if len(experiments) == 1:
            # Auto-load the single experiment
            experiment_path = os.path.join(results_dir, experiments[0])
            try:
                clustered_df, model_stats, model_cluster_df, results_path = load_pipeline_results(experiment_path)
                # Populate the shared app_state dict so every tab sees the data
                # without requiring an explicit "Load Data" step.
                app_state['clustered_df'] = clustered_df
                app_state['model_stats'] = model_stats
                app_state['model_cluster_df'] = model_cluster_df
                app_state['results_path'] = results_path
                app_state['available_models'] = get_available_models(model_stats)
                app_state['current_results_dir'] = experiment_path
                print(f"β Auto-loaded data from: {experiment_path}")
            except Exception as e:
                # Best-effort: a failed auto-load leaves the app usable via the
                # manual Load Data tab, so only log here.
                print(f"β Failed to auto-load data: {e}")
        elif len(experiments) > 1:
            print(f"π Multiple experiments found. Please select one from the dropdown.")

    print(f"π Launching Gradio app on {server_name}:{server_port}")
    print(f"Share mode: {share}")
    print(f"π§ Additional kwargs: {kwargs}")

    try:
        app.launch(
            share=share,
            server_name=server_name,
            server_port=server_port,
            show_error=True,  # Show detailed error messages
            quiet=False,  # Show more verbose output
            **kwargs
        )
    except Exception as e:
        print(f"β Failed to launch on port {server_port}: {e}")
        print("π Trying alternative port configuration...")

        # Try with a port range instead of port 0
        try:
            # Try ports in a reasonable range
            for alt_port in [8080, 8081, 8082, 8083, 8084, 8085, 8086, 8087, 8088, 8089]:
                try:
                    print(f"π Trying port {alt_port}...")
                    app.launch(
                        share=share,
                        server_name=server_name,
                        server_port=alt_port,
                        show_error=True,
                        quiet=False,
                        **kwargs
                    )
                    break  # If successful, break out of the loop
                except Exception as port_error:
                    # Only keep probing when the failure is a busy port; any
                    # other error is real and must surface immediately.
                    if "Cannot find empty port" in str(port_error):
                        print(f"   Port {alt_port} is busy, trying next...")
                        continue
                    else:
                        raise port_error
            else:
                # If we get here, all ports in our range were busy
                raise Exception("All attempted ports (8080-8089) are busy")

        except Exception as e2:
            print(f"β Failed to launch with alternative ports: {e2}")
            print("π‘ Try specifying a different port manually:")
            print(f"   python -m lmmvibes.vis_gradio.launcher --port 9000")
            print(f"   python -m lmmvibes.vis_gradio.launcher --auto_port")
            raise e2
|
lmmvibes/vis_gradio/clusters_tab.py
ADDED
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Helpers for the **View Clusters** tab β both the interactive HTML and
|
2 |
+
fallback dataframe view."""
|
3 |
+
from typing import List
|
4 |
+
|
5 |
+
import pandas as pd
|
6 |
+
|
7 |
+
from .state import app_state
|
8 |
+
from .utils import (
|
9 |
+
search_clusters_by_text,
|
10 |
+
search_clusters_only,
|
11 |
+
create_interactive_cluster_viewer,
|
12 |
+
get_cluster_statistics,
|
13 |
+
format_cluster_dataframe,
|
14 |
+
)
|
15 |
+
|
16 |
+
__all__ = ["view_clusters_interactive", "view_clusters_table"]
|
17 |
+
|
18 |
+
|
19 |
+
# ---------------------------------------------------------------------------
|
20 |
+
# Interactive HTML view
|
21 |
+
# ---------------------------------------------------------------------------
|
22 |
+
|
23 |
+
def view_clusters_interactive(
    selected_models: List[str],
    cluster_level: str,
    search_term: str = "",
) -> str:
    """Render the interactive cluster viewer tab as a single HTML string.

    Args:
        selected_models: Model names passed through to the viewer/statistics
            helpers to restrict the display.
        cluster_level: Clustering granularity key ("fine" or "coarse").
        search_term: Optional substring filter applied before rendering.

    Returns:
        HTML composed of a statistics banner, active-filter chips, and the
        interactive cluster viewer — or an error paragraph when no data is
        loaded / no cluster statistics are available.
    """
    if app_state["clustered_df"] is None:
        return (
            "<p style='color: #e74c3c; padding: 20px;'>β Please load data first "
            "using the 'Load Data' tab</p>"
        )

    # Rows without a property description cannot be clustered/displayed.
    df = app_state["clustered_df"].dropna(subset=["property_description"]).copy()

    # Apply search filter first
    if search_term and search_term.strip():
        df = search_clusters_only(df, search_term.strip(), cluster_level)

    # Build interactive viewer
    cluster_html = create_interactive_cluster_viewer(df, selected_models, cluster_level)

    # Statistics summary at the top
    stats = get_cluster_statistics(df, selected_models)
    if not stats:
        return (
            "<p style='color: #e74c3c; padding: 20px;'>β No cluster data available</p>"
        )

    # Get additional metrics from cluster_scores
    cluster_scores = app_state.get("metrics", {}).get("cluster_scores", {})

    # Calculate average quality scores and frequency
    # NOTE(review): total_frequency is accumulated below but never used in the
    # rendered HTML — confirm whether it can be removed.
    total_frequency = 0
    quality_scores_list = []
    metric_names = set()

    for cluster_name, cluster_data in cluster_scores.items():
        # "proportion" is stored as a fraction; scaled to percent here.
        total_frequency += cluster_data.get("proportion", 0) * 100
        quality_scores = cluster_data.get("quality", {})
        if quality_scores:
            quality_scores_list.extend(quality_scores.values())
            metric_names.update(quality_scores.keys())

    # Grand mean over every (cluster, metric) quality value; 0 when none exist.
    avg_quality = sum(quality_scores_list) / len(quality_scores_list) if quality_scores_list else 0
    metrics_suffix = f" ({', '.join(sorted(metric_names))})" if metric_names else ""

    # Gradient banner with headline statistics.
    stats_html = f"""
    <div style="
        background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
        color: white;
        padding: 20px;
        border-radius: 8px;
        margin-bottom: 20px;
        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
    ">
        <h3 style="margin: 0 0 15px 0;">Cluster Statistics</h3>
        <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
            <div>
                <div style="font-size: 24px; font-weight: bold;">{stats['total_properties']:,}</div>
                <div style="opacity: 0.9;">Total Properties</div>
            </div>
            <div>
                <div style="font-size: 24px; font-weight: bold;">{stats['total_models']}</div>
                <div style="opacity: 0.9;">Models</div>
            </div>
            <div>
                <div style="font-size: 24px; font-weight: bold;">{avg_quality:.3f}</div>
                <div style="opacity: 0.9;">Avg Quality{metrics_suffix}</div>
            </div>
    """

    # Per-level cluster counts; keys are only present when that level exists.
    if cluster_level == "fine" and "fine_clusters" in stats:
        stats_html += f"""
            <div>
                <div style="font-size: 24px; font-weight: bold;">{stats['fine_clusters']}</div>
                <div style="opacity: 0.9;">Fine Clusters</div>
            </div>
            <div>
                <div style="font-size: 24px; font-weight: bold;">{stats['avg_properties_per_fine_cluster']:.1f}</div>
                <div style="opacity: 0.9;">Avg Properties/Cluster</div>
            </div>
        """
    elif cluster_level == "coarse" and "coarse_clusters" in stats:
        stats_html += f"""
            <div>
                <div style="font-size: 24px; font-weight: bold;">{stats['coarse_clusters']}</div>
                <div style="opacity: 0.9;">Coarse Clusters</div>
            </div>
            <div>
                <div style="font-size: 24px; font-weight: bold;">{stats['avg_properties_per_coarse_cluster']:.1f}</div>
                <div style="opacity: 0.9;">Avg Properties/Cluster</div>
            </div>
        """

    stats_html += """
        </div>
    </div>
    """

    # Add a note if coarse clusters were requested but not available
    if cluster_level == "coarse" and "coarse_clusters" not in stats and "fine_clusters" in stats:
        stats_html += """
        <div style="
            background: #fff3cd;
            border-left: 4px solid #ffc107;
            padding: 10px 15px;
            margin-bottom: 15px;
            border-radius: 4px;
        ">
            β οΈ <strong>Note:</strong> Coarse clusters not available in this dataset. Showing fine clusters instead.
        </div>
        """

    # Additional filter chips
    filter_info = ""
    if search_term and search_term.strip():
        filter_info += f"""
        <div style="
            background: #e3f2fd;
            border-left: 4px solid #2196f3;
            padding: 10px 15px;
            margin-bottom: 15px;
            border-radius: 4px;
        ">
            π <strong>Search Filter:</strong> "{search_term}"
        </div>
        """

    if selected_models:
        filter_info += f"""
        <div style="
            background: #f3e5f5;
            border-left: 4px solid #9c27b0;
            padding: 10px 15px;
            margin-bottom: 15px;
            border-radius: 4px;
        ">
            π― <strong>Selected Models:</strong> {', '.join(selected_models)}
        </div>
        """

    return stats_html + filter_info + cluster_html
|
164 |
+
|
165 |
+
|
166 |
+
# ---------------------------------------------------------------------------
|
167 |
+
# Dataframe fallback view
|
168 |
+
# ---------------------------------------------------------------------------
|
169 |
+
|
170 |
+
def view_clusters_table(
    selected_models: List[str],
    cluster_level: str,
    search_term: str = "",
) -> pd.DataFrame:
    """Tabular fallback view of clusters.

    Applies the optional search filter, then formats the clustered dataframe
    for display. When the result is empty, a single-column "Message" frame
    explains the most likely cause.
    """
    if app_state["clustered_df"] is None:
        return pd.DataFrame({"Message": ["Please load data first using the 'Load Data' tab"]})

    df = app_state["clustered_df"].copy()

    term = search_term.strip() if search_term else ""
    if term:
        df = search_clusters_only(df, term, cluster_level)

    table = format_cluster_dataframe(df, selected_models, cluster_level)
    if not table.empty:
        return table

    # Empty result: build a diagnostic message matching what filtered it out.
    if term:
        message = f"No results found for search term '{search_term}'. Try a different search term."
    elif selected_models:
        available_models = df["model"].unique().tolist() if "model" in df.columns else []
        message = (
            f"No data found for selected models: {', '.join(selected_models)}. "
            f"Available models: {', '.join(available_models)}"
        )
    else:
        message = "No data available. Please check your data files and try reloading."
    return pd.DataFrame({"Message": [message]})
|
lmmvibes/vis_gradio/conversation_display.py
ADDED
@@ -0,0 +1,507 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
"""Conversation display helpers for vis_gradio.
|
4 |
+
|
5 |
+
This module encapsulates everything related to:
|
6 |
+
β’ safely parsing model responses (lists / dicts / JSON strings)
|
7 |
+
β’ pretty-printing embedded dictionaries for readability
|
8 |
+
β’ converting multiple conversation formats to the OpenAI chat list format
|
9 |
+
β’ rendering that list as HTML (including accordion grouping + raw JSON viewer).
|
10 |
+
|
11 |
+
Moving this logic out of utils.py keeps the latter lean and focussed on general
|
12 |
+
analytics utilities.
|
13 |
+
"""
|
14 |
+
|
15 |
+
from typing import List, Dict, Any
|
16 |
+
import ast
|
17 |
+
import json
|
18 |
+
import html
|
19 |
+
import markdown
|
20 |
+
import re
|
21 |
+
|
22 |
+
__all__: List[str] = [
|
23 |
+
"convert_to_openai_format",
|
24 |
+
"display_openai_conversation_html",
|
25 |
+
"pretty_print_embedded_dicts",
|
26 |
+
]
|
27 |
+
|
28 |
+
# ---------------------------------------------------------------------------
|
29 |
+
# Pretty-printing helpers
|
30 |
+
# ---------------------------------------------------------------------------
|
31 |
+
|
32 |
+
def _find_balanced_spans(text: str):
|
33 |
+
"""Return (start, end) spans of balanced {...} or [...] regions in *text*."""
|
34 |
+
spans, stack = [], []
|
35 |
+
for i, ch in enumerate(text):
|
36 |
+
if ch in "{[":
|
37 |
+
stack.append((ch, i))
|
38 |
+
elif ch in "]}" and stack:
|
39 |
+
opener, start = stack.pop()
|
40 |
+
if (opener, ch) in {("{", "}"), ("[", "]")} and not stack:
|
41 |
+
spans.append((start, i + 1))
|
42 |
+
return spans
|
43 |
+
|
44 |
+
|
45 |
+
def _try_parse_slice(slice_: str):
|
46 |
+
"""Attempt to parse *slice_* into a Python object; return None on failure."""
|
47 |
+
try:
|
48 |
+
return ast.literal_eval(slice_)
|
49 |
+
except Exception:
|
50 |
+
try:
|
51 |
+
return json.loads(slice_)
|
52 |
+
except Exception:
|
53 |
+
return None
|
54 |
+
|
55 |
+
|
56 |
+
def _find_code_spans(text: str) -> List[tuple]:
|
57 |
+
"""Return spans for markdown code regions to be preserved as-is.
|
58 |
+
|
59 |
+
Includes:
|
60 |
+
- fenced code blocks delimited by ``` ... ```
|
61 |
+
- inline code segments delimited by `...`
|
62 |
+
"""
|
63 |
+
spans: List[tuple] = []
|
64 |
+
|
65 |
+
# Fenced blocks ``` ... ``` (language spec allowed after opening fence)
|
66 |
+
idx = 0
|
67 |
+
while True:
|
68 |
+
start = text.find("```", idx)
|
69 |
+
if start == -1:
|
70 |
+
break
|
71 |
+
# Find the end fence
|
72 |
+
end = text.find("```", start + 3)
|
73 |
+
if end == -1:
|
74 |
+
# Unclosed fence: treat rest of string as code
|
75 |
+
spans.append((start, len(text)))
|
76 |
+
break
|
77 |
+
spans.append((start, end + 3))
|
78 |
+
idx = end + 3
|
79 |
+
|
80 |
+
# Inline code `...`
|
81 |
+
for m in re.finditer(r"`[^`]*`", text, flags=re.DOTALL):
|
82 |
+
spans.append((m.start(), m.end()))
|
83 |
+
|
84 |
+
# Sort and merge overlapping spans
|
85 |
+
spans.sort()
|
86 |
+
merged: List[tuple] = []
|
87 |
+
for s, e in spans:
|
88 |
+
if not merged or s > merged[-1][1]:
|
89 |
+
merged.append((s, e))
|
90 |
+
else:
|
91 |
+
merged[-1] = (merged[-1][0], max(merged[-1][1], e))
|
92 |
+
return merged
|
93 |
+
|
94 |
+
|
95 |
+
def _is_inside_any_span(start: int, end: int, spans: List[tuple]) -> bool:
|
96 |
+
for s, e in spans:
|
97 |
+
if start >= s and end <= e:
|
98 |
+
return True
|
99 |
+
return False
|
100 |
+
|
101 |
+
|
102 |
+
def pretty_print_embedded_dicts(text: str) -> str:
    """Replace dicts or list-of-dicts with a `<pre>` block, except inside code.

    Dict-like regions that fall within markdown code spans (inline backticks
    or fenced code blocks) are left untouched so code examples render verbatim.
    All surrounding text is HTML-escaped.
    """
    if not text:
        return text

    protected = _find_code_spans(text)

    pieces = []
    cursor = 0
    for begin, end in _find_balanced_spans(text):
        parsed = _try_parse_slice(text[begin:end])
        looks_like_dicts = isinstance(parsed, dict) or (
            isinstance(parsed, list) and bool(parsed) and all(isinstance(item, dict) for item in parsed)
        )
        if not looks_like_dicts or _is_inside_any_span(begin, end, protected):
            continue
        # Escape the plain text preceding the region, then emit the region
        # itself as formatted JSON inside a <pre> block.
        pieces.append(html.escape(text[cursor:begin], quote=False))
        rendered = json.dumps(parsed, indent=2, ensure_ascii=False)
        pieces.append(
            f"<pre style='background:#f8f9fa;padding:10px;border-radius:4px;overflow-x:auto;'>{rendered}</pre>"
        )
        cursor = end
    pieces.append(html.escape(text[cursor:], quote=False))
    return "".join(pieces)
|
129 |
+
|
130 |
+
# ---------------------------------------------------------------------------
|
131 |
+
# Format conversion
|
132 |
+
# ---------------------------------------------------------------------------
|
133 |
+
|
134 |
+
def convert_to_openai_format(response_data: Any):
    """Convert various response payloads into the OpenAI chat format list."""
    if isinstance(response_data, list):
        return response_data

    if isinstance(response_data, str):
        # Try a Python literal first (tolerates single quotes), then JSON.
        for decode in (ast.literal_eval, json.loads):
            try:
                candidate = decode(response_data)
            except (ValueError, SyntaxError):
                continue
            if isinstance(candidate, list):
                return candidate
        # Plain text: wrap as a single assistant message.
        return [{"role": "assistant", "content": response_data}]

    # Any other type: stringify and wrap.
    return [{"role": "assistant", "content": str(response_data)}]
|
157 |
+
|
158 |
+
# ---------------------------------------------------------------------------
|
159 |
+
# HTML rendering
|
160 |
+
# ---------------------------------------------------------------------------
|
161 |
+
|
162 |
+
def _markdown(text: str, *, pretty_print_dicts: bool = True) -> str:
    """Render markdown, optionally pretty-printing any embedded dicts.

    Args:
        text: Raw markdown / plain-text message content.
        pretty_print_dicts: When True, dict-like regions outside code spans
            are replaced with formatted ``<pre>`` blocks before rendering;
            otherwise the whole text is HTML-escaped verbatim.

    Returns:
        An HTML fragment. Single newlines outside ``<pre>``/``<code>`` regions
        are converted to ``<br>`` so chat messages keep their line breaks.
    """
    processed = pretty_print_embedded_dicts(text) if pretty_print_dicts else html.escape(text, quote=False)

    # fenced_code handles ``` blocks; codehilite only adds value when Pygments
    # is installed, so probe for it instead of failing at render time.
    extensions = ["fenced_code"]
    extension_configs = {}
    try:
        import pygments  # noqa: F401 -- availability probe only
        extensions.append("codehilite")
        extension_configs['codehilite'] = {
            'css_class': 'highlight',
            'use_pygments': True,
            'guess_lang': True,
            'linenums': False
        }
    except ImportError:
        pass

    result = markdown.markdown(processed, extensions=extensions, extension_configs=extension_configs)

    # Convert single newlines to <br> without touching code regions: split on
    # <pre>/<code> chunks (captured groups land at odd indices) and rewrite
    # only the even-indexed, non-code chunks. Double newlines (paragraph
    # breaks) are left for the markdown renderer's <p> handling.
    # Note: the redundant local `import re` and the always-true
    # `if i < len(parts)` guard from the original were removed; `re` is
    # already imported at module level.
    code_block_pattern = r'(<pre[^>]*>.*?</pre>|<code[^>]*>.*?</code>)'
    parts = re.split(code_block_pattern, result, flags=re.DOTALL)
    for i in range(0, len(parts), 2):
        parts[i] = re.sub(r'(?<!\n)\n(?!\n)', '<br>\n', parts[i])

    return ''.join(parts)
|
200 |
+
|
201 |
+
|
202 |
+
def display_openai_conversation_html(conversation_data: List[Dict[str, Any]], *, use_accordion: bool = True, pretty_print_dicts: bool = True) -> str:
    """Convert an OpenAI-style conversation list into styled HTML for Gradio.

    Args:
        conversation_data: List of message dicts (``role``/``content`` keys).
        use_accordion: When True, system and info messages are grouped into
            collapsed <details> accordions; other roles render inline.
        pretty_print_dicts: Forwarded to the message renderer — dict content
            is shown as indented JSON when True.

    Returns:
        A self-contained HTML string (CSS included) with a collapsed raw-JSON
        viewer followed by the formatted messages.
    """

    if not conversation_data:
        return "<p>No conversation data available</p>"

    # Collapsed raw JSON section for debugging
    raw_json = json.dumps(conversation_data, indent=2, ensure_ascii=False)
    html_out = f"""
    <details style="margin: 8px 0;">
        <summary style="cursor: pointer; font-weight: 600;">
            Click to see raw response ({len(conversation_data)})
        </summary>
        <div style="padding: 8px 15px;">
            <pre style="white-space: pre-wrap; word-wrap: break-word; background: #f8f9fa; padding: 10px; border-radius: 4px; overflow-x: auto;">{raw_json}</pre>
        </div>
    </details>
    """

    # Accent colour per speaker role (left border + role label).
    role_colors = {
        "system": "#ff6b6b",
        "info": "#4ecdc4",
        "assistant": "#45b7d1",
        "tool": "#96ceb4",
        "user": "#feca57",
    }

    def _format_msg(role: str, content: Any) -> str:
        # Render one message card. Dict / list-of-dict content is shown as
        # JSON; strings go through the markdown renderer; None gets a
        # placeholder; anything else is stringified.
        if isinstance(content, dict) or (isinstance(content, list) and content and all(isinstance(d, dict) for d in content)):
            if pretty_print_dicts:
                content_html = (
                    f"<pre style='background: #f8f9fa; padding: 10px; border-radius: 4px; overflow-x: auto;'>{json.dumps(content, indent=2, ensure_ascii=False)}</pre>"
                )
            else:
                content_html = f"<code>{html.escape(json.dumps(content, ensure_ascii=False))}</code>"
        elif isinstance(content, str):
            content_html = _markdown(content, pretty_print_dicts=pretty_print_dicts)
        elif content is None:
            content_html = "<em>(No content)</em>"
        else:
            content_html = str(content)
        # Unknown roles fall back to a neutral grey accent.
        color = role_colors.get(role.lower(), "#95a5a6")
        return (
            f"<div style='border-left: 4px solid {color}; margin: 8px 0; background-color: #ffffff; padding: 12px; border-radius: 0 8px 8px 0;'>"
            f"<div style='font-weight: 600; color: {color}; margin-bottom: 8px; text-transform: capitalize; font-size: 14px;'>{role}</div>"
            f"<div style='color: #333; line-height: 1.6; font-family: \"Segoe UI\", Tahoma, Geneva, Verdana, sans-serif;'>{content_html}</div>"
            "</div>"
        )

    if use_accordion:
        # Bucket messages: system/info go into collapsed accordions, every
        # other role renders inline in original order.
        system_msgs, info_msgs, other_msgs = [], [], []
        for m in conversation_data:
            if not isinstance(m, dict):
                continue  # tolerate malformed entries
            role = m.get("role", "unknown").lower()
            content = m.get("content", "")
            # Some payloads wrap text as {"text": ...} — unwrap for display.
            if isinstance(content, dict) and "text" in content:
                content = content["text"]
            if role == "system":
                system_msgs.append((role, content))
            elif role == "info":
                info_msgs.append((role, content))
            else:
                other_msgs.append((role, content))

        def _accordion(title: str, items: List):
            # Wrap a group of messages in a collapsed <details> element;
            # returns "" when the group is empty so nothing renders.
            if not items:
                return ""
            inner = "".join(_format_msg(r, c) for r, c in items)
            return (
                f"<details style='margin: 8px 0;'>"
                f"<summary style='cursor: pointer; font-weight: 600;'>"
                f"{html.escape(title)} ({len(items)})"  # e.g. "Click to see system messages (3)"
                f"</summary>"
                f"<div style='padding: 8px 15px;'>{inner}</div>"
                "</details>"
            )

        html_out += _accordion("Click to see system messages", system_msgs)
        html_out += _accordion("Click to see info messages", info_msgs)
        for r, c in other_msgs:
            html_out += _format_msg(r, c)
    else:
        # No accordion: just render everything
        for m in conversation_data:
            if not isinstance(m, dict):
                continue
            role = m.get("role", "unknown").lower()
            content = m.get("content", "")
            if isinstance(content, dict) and "text" in content:
                content = content["text"]
            html_out += _format_msg(role, content)

    # CSS for proper code block styling and summary hover effects
    css_styles = """
    <style>
    :root {
        /* Code block color palette - GitHub Light inspired */
        --code-bg: #f6f8fa;
        --code-text: #24292f;
        --code-comment: #6a737d;
        --code-keyword: #d73a49;
        --code-string: #032f62;
        --code-number: #005cc5;
        --code-operator: #24292f;
        --code-function: #6f42c1;
        --code-border: #d0d7de;

        /* Inline code colors - same light theme */
        --inline-code-bg: #f3f4f6;
        --inline-code-text: #24292f;
        --inline-code-border: #d1d5db;

        /* Code block structure */
        --code-border-radius: 8px;
        --code-padding: 16px;
        --code-font-size: 14px;
        --code-line-height: 1.5;
        --code-font-family: 'JetBrains Mono', 'Fira Code', 'Cascadia Code', 'SF Mono', Consolas, 'Liberation Mono', Menlo, Courier, monospace;
    }

    /* Base code styling */
    pre, code {
        font-family: var(--code-font-family) !important;
        font-size: var(--code-font-size) !important;
        line-height: var(--code-line-height) !important;
        font-variant-ligatures: normal !important;
        -webkit-font-smoothing: antialiased !important;
        -moz-osx-font-smoothing: grayscale !important;
    }

    /* Fenced code blocks - light theme */
    .highlight, .codehilite, pre.highlight, pre.codehilite,
    .language-python, .language-text, .language-bash {
        background: var(--code-bg) !important;
        color: var(--code-text) !important;
        border: 1px solid var(--code-border) !important;
        border-radius: var(--code-border-radius) !important;
        padding: var(--code-padding) !important;
        margin: 12px 0 !important;
        overflow-x: auto !important;
        box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05) !important;
        position: relative !important;
        white-space: pre !important;
        display: block !important;
    }

    .highlight pre, .codehilite pre {
        background: transparent !important;
        color: inherit !important;
        margin: 0 !important;
        padding: 0 !important;
        border: none !important;
        border-radius: 0 !important;
        overflow: visible !important;
        white-space: pre !important;
        display: block !important;
    }

    /* Ensure code blocks preserve formatting */
    .highlight code, .codehilite code {
        white-space: pre !important;
        display: block !important;
        padding: 0 !important;
        margin: 0 !important;
        background: transparent !important;
        border: none !important;
        font-size: inherit !important;
        line-height: inherit !important;
    }

    /* Add language label for fenced blocks */
    .highlight::before, .codehilite::before {
        content: 'python';
        position: absolute;
        top: 8px;
        right: 12px;
        background: rgba(0, 0, 0, 0.05);
        color: #586069;
        padding: 2px 8px;
        border-radius: 4px;
        font-size: 11px;
        font-weight: 500;
        text-transform: uppercase;
        letter-spacing: 0.5px;
    }

    /* Syntax highlighting for Python - Light theme */
    .highlight .k, .codehilite .k, /* keywords */
    .highlight .kn, .codehilite .kn, /* keyword.namespace */
    .highlight .kp, .codehilite .kp, /* keyword.pseudo */
    .highlight .kr, .codehilite .kr, /* keyword.reserved */
    .highlight .kt, .codehilite .kt /* keyword.type */
    {
        color: var(--code-keyword) !important;
        font-weight: 600 !important;
    }

    .highlight .s, .codehilite .s, /* strings */
    .highlight .s1, .codehilite .s1, /* string.single */
    .highlight .s2, .codehilite .s2, /* string.double */
    .highlight .se, .codehilite .se /* string.escape */
    {
        color: var(--code-string) !important;
    }

    .highlight .c, .codehilite .c, /* comments */
    .highlight .c1, .codehilite .c1, /* comment.single */
    .highlight .cm, .codehilite .cm /* comment.multiline */
    {
        color: var(--code-comment) !important;
        font-style: italic !important;
    }

    .highlight .m, .codehilite .m, /* numbers */
    .highlight .mi, .codehilite .mi, /* number.integer */
    .highlight .mf, .codehilite .mf, /* number.float */
    .highlight .mo, .codehilite .mo /* number.octal */
    {
        color: var(--code-number) !important;
        font-weight: 600 !important;
    }

    .highlight .nf, .codehilite .nf, /* function names */
    .highlight .fm, .codehilite .fm /* function.magic */
    {
        color: var(--code-function) !important;
        font-weight: 600 !important;
    }

    .highlight .o, .codehilite .o, /* operators */
    .highlight .ow, .codehilite .ow /* operator.word */
    {
        color: var(--code-operator) !important;
    }

    /* Inline code - light theme */
    p code, li code, div code, span code,
    h1 code, h2 code, h3 code, h4 code, h5 code, h6 code {
        background: var(--inline-code-bg) !important;
        color: var(--inline-code-text) !important;
        border: 1px solid var(--inline-code-border) !important;
        padding: 2px 6px !important;
        border-radius: 4px !important;
        font-size: 0.9em !important;
        font-weight: 600 !important;
        white-space: nowrap !important;
        box-shadow: none !important;
        display: inline !important;
    }

    /* Code blocks inside paragraphs should not be treated as inline */
    p pre, li pre, div pre {
        background: var(--code-bg) !important;
        color: var(--code-text) !important;
        border: 1px solid var(--code-border) !important;
        border-radius: var(--code-border-radius) !important;
        padding: var(--code-padding) !important;
        margin: 8px 0 !important;
        white-space: pre !important;
        overflow-x: auto !important;
        display: block !important;
    }

    /* Scrollbar styling for code blocks - light theme */
    .highlight::-webkit-scrollbar, .codehilite::-webkit-scrollbar,
    pre::-webkit-scrollbar {
        height: 8px !important;
        background: #f1f3f4 !important;
        border-radius: 4px !important;
    }

    .highlight::-webkit-scrollbar-thumb, .codehilite::-webkit-scrollbar-thumb,
    pre::-webkit-scrollbar-thumb {
        background: #c1c8cd !important;
        border-radius: 4px !important;
    }

    .highlight::-webkit-scrollbar-thumb:hover, .codehilite::-webkit-scrollbar-thumb:hover,
    pre::-webkit-scrollbar-thumb:hover {
        background: #a8b3ba !important;
    }
    """

    if use_accordion:
        # Only suppress the default <summary> markers/hover when the
        # accordion layout is actually in use.
        css_styles += """
    /* Accordion styling */
    details > summary {
        list-style: none !important;
        cursor: pointer !important;
    }
    details > summary:hover {
        background-color: transparent !important;
        box-shadow: none !important;
        transform: none !important;
    }
    details > summary::-webkit-details-marker,
    details > summary::marker {
        display: none !important;
    }
    """

    css_styles += "</style>"
    # Prepend so styles apply to everything rendered above.
    html_out = css_styles + html_out

    return html_out
|
lmmvibes/vis_gradio/data_loader.py
ADDED
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Data loading functionality for the LMM-Vibes Gradio app.
|
3 |
+
|
4 |
+
This module handles loading pipeline results and converting them to formats
|
5 |
+
suitable for the Gradio interface.
|
6 |
+
"""
|
7 |
+
|
8 |
+
import json
|
9 |
+
import pandas as pd
|
10 |
+
from pathlib import Path
|
11 |
+
from typing import Dict, List, Any, Tuple, Optional
|
12 |
+
import os
|
13 |
+
|
14 |
+
from .state import app_state
|
15 |
+
from lmmvibes.metrics.plotting import create_model_cluster_dataframe
|
16 |
+
|
17 |
+
|
18 |
+
class DataCache:
    """Process-wide in-memory cache keyed by string.

    Avoids re-reading result files from disk within one session. All state
    lives on the class itself, so every caller shares the same cache.
    """

    _cache: dict = {}

    @classmethod
    def get(cls, key: str):
        """Return the cached value for *key*, or None when absent."""
        return cls._cache.get(key)

    @classmethod
    def set(cls, key: str, value: Any):
        """Store *value* under *key*, replacing any previous entry."""
        cls._cache[key] = value

    @classmethod
    def clear(cls):
        """Drop every cached entry."""
        cls._cache.clear()
|
33 |
+
|
34 |
+
|
35 |
+
def scan_for_result_subfolders(base_dir: str) -> List[str]:
    """Return names of immediate subdirectories that look like pipeline results.

    A subfolder qualifies when it contains every file the FunctionalMetrics
    pipeline writes. A missing base directory yields an empty list.
    """
    root = Path(base_dir)
    if not root.exists():
        return []

    # Files a complete pipeline run always produces.
    needed = (
        "model_cluster_scores.json",
        "cluster_scores.json",
        "model_scores.json",
        "clustered_results_lightweight.jsonl",
    )

    found = []
    for entry in root.iterdir():
        if not entry.is_dir():
            continue
        if all((entry / name).exists() for name in needed):
            found.append(entry.name)

    return found
|
56 |
+
|
57 |
+
|
58 |
+
def validate_results_directory(results_dir: str) -> Tuple[bool, str]:
    """Check that *results_dir* holds a complete set of pipeline outputs.

    Returns (True, "") when everything is present, otherwise (False, reason).
    """
    path = Path(results_dir)

    # Guard clauses for a bad path.
    if not path.exists():
        return False, f"Directory does not exist: {results_dir}"
    if not path.is_dir():
        return False, f"Path is not a directory: {results_dir}"

    # The three FunctionalMetrics score files plus the per-conversation dump.
    expected = [
        "model_cluster_scores.json",
        "cluster_scores.json",
        "model_scores.json",
        "clustered_results_lightweight.jsonl",
    ]
    missing_files = [name for name in expected if not (path / name).exists()]

    if missing_files:
        return False, f"Missing required files: {', '.join(missing_files)}"

    return True, ""
|
88 |
+
|
89 |
+
|
90 |
+
def get_available_models(metrics: Dict[str, Any]) -> List[str]:
    """Return the model names present in the model_cluster_scores table."""
    return list(metrics.get("model_cluster_scores", {}).keys())
|
94 |
+
|
95 |
+
|
96 |
+
def get_all_models(metrics: Dict[str, Any]) -> List[str]:
    """Alias for :func:`get_available_models`, kept for API compatibility."""
    return get_available_models(metrics)
|
99 |
+
|
100 |
+
|
101 |
+
def load_pipeline_results(results_dir: str) -> Tuple[pd.DataFrame, Dict[str, Any], pd.DataFrame, Path]:
    """Load pipeline outputs (FunctionalMetrics format only).

    Returns:
        clustered_df: per-conversation rows from clustered_results_lightweight.jsonl
        metrics: dict with the three FunctionalMetrics score dictionaries
        model_cluster_df: DataFrame built from model_cluster_scores for plotting/analysis
        results_path: Path to the results directory
    """
    cache_key = f"pipeline_results_{results_dir}"
    hit = DataCache.get(cache_key)
    if hit:
        return hit

    results_path = Path(results_dir)
    if not results_path.exists():
        raise FileNotFoundError(f"Results directory does not exist: {results_dir}")

    # ------------------------------------------------------------------
    # 1. FunctionalMetrics score files (all three are mandatory)
    # ------------------------------------------------------------------
    score_files = (
        "model_cluster_scores.json",
        "cluster_scores.json",
        "model_scores.json",
    )
    missing = [name for name in score_files if not (results_path / name).exists()]
    if missing:
        raise FileNotFoundError(
            f"Missing required metrics files in {results_dir}: {', '.join(missing)}"
        )

    loaded = {}
    for name in score_files:
        with open(results_path / name) as fh:
            loaded[name] = json.load(fh)

    metrics = {
        "model_cluster_scores": loaded["model_cluster_scores.json"],
        "cluster_scores": loaded["cluster_scores.json"],
        "model_scores": loaded["model_scores.json"],
    }

    # ------------------------------------------------------------------
    # 2. Per-conversation cluster assignments (JSON-Lines)
    # ------------------------------------------------------------------
    clustered_path = results_path / "clustered_results_lightweight.jsonl"
    if not clustered_path.exists():
        raise FileNotFoundError(f"clustered_results_lightweight.jsonl not found in {results_dir}")

    try:
        clustered_df = pd.read_json(clustered_path, lines=True)
    except Exception as e:
        raise ValueError(f"Could not load clustered results: {e}")

    # ------------------------------------------------------------------
    # 3. Flatten model_cluster_scores into a DataFrame for plotting
    # ------------------------------------------------------------------
    model_cluster_df = create_model_cluster_dataframe(metrics["model_cluster_scores"])

    result = (clustered_df, metrics, model_cluster_df, results_path)
    DataCache.set(cache_key, result)
    return result
|
165 |
+
|
166 |
+
|
167 |
+
def load_property_examples(results_path: Path, property_ids: List[str]) -> pd.DataFrame:
    """Load the full example rows for the given property ids on demand."""
    if not property_ids:
        return pd.DataFrame()

    # Cache key is order-insensitive in the id list.
    cache_key = f"examples_{results_path}_{hash(tuple(sorted(property_ids)))}"
    cached = DataCache.get(cache_key)
    if cached is not None:
        return cached

    # The lightweight clustered dump carries the prompt/response details.
    clustered_path = results_path / "clustered_results_lightweight.jsonl"
    if not clustered_path.exists():
        raise FileNotFoundError("Could not load example data - clustered_results_lightweight.jsonl not found")

    try:
        full_df = pd.read_json(clustered_path, lines=True)
        subset = full_df[full_df['id'].isin(property_ids)]
        DataCache.set(cache_key, subset)
        return subset
    except Exception as e:
        raise ValueError(f"Failed to load examples: {e}")
|
lmmvibes/vis_gradio/debug_tab.py
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Logic for the **Debug Data** tab."""
|
2 |
+
from __future__ import annotations
|
3 |
+
|
4 |
+
from .state import app_state
|
5 |
+
|
6 |
+
__all__ = ["debug_data_structure"]
|
7 |
+
|
8 |
+
|
9 |
+
def debug_data_structure() -> str:
    """Render an HTML debug report describing the loaded clustered DataFrame.

    Shows row/column counts, whether fine/coarse cluster columns are present,
    every column with its dtype, and a small sample of rows.

    Returns:
        An HTML fragment (string); an error paragraph when no data is loaded.
    """
    if app_state["clustered_df"] is None:
        return "<p style='color: #e74c3c;'>β No data loaded</p>"

    df = app_state["clustered_df"]

    n_rows = len(df)
    n_cols = len(df.columns)

    # Check for both naming patterns (prefixed and unprefixed cluster id columns)
    has_fine_clusters = ("property_description_fine_cluster_id" in df.columns or
                        "fine_cluster_id" in df.columns)
    has_coarse_clusters = ("property_description_coarse_cluster_id" in df.columns or
                          "coarse_cluster_id" in df.columns)

    # Render up to three sample rows as an HTML table (styled below).
    sample_rows = min(3, len(df))
    sample_data = df.head(sample_rows).to_html(
        escape=False,
        classes="table table-striped",
        table_id="debug-table",
    )

    html = f"""
    <div style="max-width: 1200px; margin: 0 auto;">
        <h3>π Data Structure Debug Info</h3>

        <div style="background: #f8f9fa; padding: 15px; border-radius: 8px; margin: 15px 0;">
            <h4>Basic Statistics</h4>
            <ul>
                <li><strong>Rows:</strong> {n_rows:,}</li>
                <li><strong>Columns:</strong> {n_cols}</li>
                <li><strong>Fine Clusters Available:</strong> {'β Yes' if has_fine_clusters else 'β No'}</li>
                <li><strong>Coarse Clusters Available:</strong> {'β Yes' if has_coarse_clusters else 'β No'}</li>
            </ul>
        </div>

        <div style="background: #f8f9fa; padding: 15px; border-radius: 8px; margin: 15px 0;">
            <h4>Available Columns</h4>
            <div style="max-height: 200px; overflow-y: auto; background: white; padding: 10px; border-radius: 4px;">
                <ul>
    """

    # One <li> per column; nunique is only computed for object-dtype columns.
    for col in sorted(df.columns):
        unique_values = df[col].nunique() if df[col].dtype == "object" else "N/A"
        html += f"<li><code>{col}</code> - {df[col].dtype} (unique values: {unique_values})</li>"

    html += f"""
                </ul>
            </div>
        </div>

        <div style="background: #f8f9fa; padding: 15px; border-radius: 8px; margin: 15px 0;">
            <h4>Sample Data (First {sample_rows} rows)</h4>
            <div style="max-height: 400px; overflow: auto; background: white; padding: 10px; border-radius: 4px;">
                {sample_data}
            </div>
        </div>
    </div>

    <style>
    #debug-table {{
        font-size: 12px;
        width: 100%;
    }}
    #debug-table th, #debug-table td {{
        padding: 4px 8px;
        border: 1px solid #ddd;
    }}
    #debug-table th {{
        background: #f1f1f1;
    }}
    </style>
    """

    return html
|
lmmvibes/vis_gradio/demo.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Demo script showing different ways to use the LMM-Vibes Gradio visualization.
|
3 |
+
|
4 |
+
This demonstrates the Python API for launching the Gradio app.
|
5 |
+
"""
|
6 |
+
|
7 |
+
import argparse
|
8 |
+
from pathlib import Path
|
9 |
+
from lmmvibes.vis_gradio import launch_app, create_app
|
10 |
+
|
11 |
+
|
12 |
+
def demo_basic_launch():
    """Demo: start the app with no data; users load results through the UI."""
    print("π Demo: Basic launch - data can be loaded through the UI")
    launch_app()
|
16 |
+
|
17 |
+
|
18 |
+
def demo_preload_data(results_dir: str):
    """Demo: start the app with a results directory already loaded."""
    print(f"π Demo: Launch with pre-loaded data from {results_dir}")
    launch_app(results_dir=results_dir)
|
22 |
+
|
23 |
+
|
24 |
+
def demo_custom_settings(results_dir: str = None):
    """Demo: start the app with a public link, LAN binding and a custom port."""
    print("π Demo: Launch with custom settings")
    launch_app(
        results_dir=results_dir,
        share=True,             # create a public shareable link
        server_name="0.0.0.0",  # allow access from other machines
        server_port=8080,       # custom port
    )
|
33 |
+
|
34 |
+
|
35 |
+
def demo_programmatic_access():
    """Demo: build the app object first, then launch it manually."""
    print("π Demo: Programmatic app creation")

    # Build the app object without starting a server.
    app = create_app()

    # The app could be customised here before launching,
    # e.g. app.title = "My Custom Title".

    print("Launching app...")
    app.launch(share=False, server_port=7861)
|
48 |
+
|
49 |
+
|
50 |
+
def main():
    """Command-line entry point: parse arguments and run the chosen demo."""
    parser = argparse.ArgumentParser(description="LMM-Vibes Gradio Visualization Demo")
    parser.add_argument("--results_dir", help="Path to results directory for demos")
    parser.add_argument(
        "--demo",
        choices=["basic", "preload", "custom", "programmatic"],
        default="basic",
        help="Which demo to run",
    )
    args = parser.parse_args()

    demo = args.demo
    if demo == "basic":
        demo_basic_launch()
    elif demo == "preload":
        # The preload demo cannot run without a directory to load from.
        if not args.results_dir:
            print("β Error: --results_dir required for preload demo")
            return
        demo_preload_data(args.results_dir)
    elif demo == "custom":
        demo_custom_settings(args.results_dir)
    elif demo == "programmatic":
        demo_programmatic_access()
|
70 |
+
|
71 |
+
|
72 |
+
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
lmmvibes/vis_gradio/examples_tab.py
ADDED
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Logic for the **View Examples** tab β dropdown population + example renderer."""
|
2 |
+
from __future__ import annotations
|
3 |
+
|
4 |
+
from typing import Any, List, Tuple
|
5 |
+
|
6 |
+
import gradio as gr
|
7 |
+
|
8 |
+
from .state import app_state
|
9 |
+
from .utils import (
|
10 |
+
get_unique_values_for_dropdowns,
|
11 |
+
get_example_data,
|
12 |
+
format_examples_display,
|
13 |
+
search_clusters_only,
|
14 |
+
)
|
15 |
+
|
16 |
+
__all__: List[str] = [
|
17 |
+
"get_dropdown_choices",
|
18 |
+
"update_example_dropdowns",
|
19 |
+
"view_examples",
|
20 |
+
"get_filter_options",
|
21 |
+
"update_filter_dropdowns",
|
22 |
+
]
|
23 |
+
|
24 |
+
|
25 |
+
# ---------------------------------------------------------------------------
|
26 |
+
# Dropdown helpers
|
27 |
+
# ---------------------------------------------------------------------------
|
28 |
+
|
29 |
+
def get_dropdown_choices() -> Tuple[List[str], List[str], List[str]]:
    """Build the (prompts, models, clusters) choice lists for the dropdowns.

    Each list is prefixed with an "All ..." wildcard entry; three empty lists
    are returned when no data has been loaded yet.
    """
    if app_state["clustered_df"] is None:
        return [], [], []

    values = get_unique_values_for_dropdowns(app_state["clustered_df"])
    return (
        ["All Prompts"] + values["prompts"],
        ["All Models"] + values["models"],
        ["All Clusters"] + values["properties"],
    )
|
38 |
+
|
39 |
+
|
40 |
+
def update_example_dropdowns() -> Tuple[Any, Any, Any]:
    """Return gr.update payloads that refresh the three example dropdowns."""
    prompts, models, clusters = get_dropdown_choices()

    # Select the wildcard entry by default; None when there are no choices.
    prompt_default = "All Prompts" if prompts else None
    model_default = "All Models" if models else None
    cluster_default = "All Clusters" if clusters else None

    return (
        gr.update(choices=prompts, value=prompt_default),
        gr.update(choices=models, value=model_default),
        gr.update(choices=clusters, value=cluster_default),
    )
|
47 |
+
|
48 |
+
|
49 |
+
# ---------------------------------------------------------------------------
|
50 |
+
# Example viewer
|
51 |
+
# ---------------------------------------------------------------------------
|
52 |
+
|
53 |
+
def view_examples(
    selected_prompt: str,
    selected_model: str,
    selected_property: str,
    max_examples: int = 5,
    use_accordion: bool = True,
    pretty_print_dicts: bool = True,
    search_term: str = "",
    show_unexpected_behavior: bool = False,
) -> str:
    """Render example conversations matching the selected filters as HTML.

    Wildcard selections ("All Prompts" / "All Models" / "All Clusters") are
    treated as "no filter". Examples are shown in random order only when no
    filter and no search term is active.
    """
    if app_state["clustered_df"] is None:
        return (
            "<p style='color: #e74c3c; padding: 20px;'>β Please load data first "
            "using the 'Load Data' tab</p>"
        )

    df = app_state["clustered_df"]

    # Narrow to clusters matching the search term, when one was provided.
    if search_term and isinstance(search_term, str) and search_term.strip():
        df = search_clusters_only(df, search_term.strip(), 'fine')  # Default to fine clusters
        if df.empty:
            return f"<p style='color: #e74c3c; padding: 20px;'>β No clusters found matching '{search_term}'</p>"

    # Map wildcard selections to None (= no filtering on that dimension).
    prompt_filter = None if selected_prompt == "All Prompts" else selected_prompt
    model_filter = None if selected_model == "All Models" else selected_model
    cluster_filter = None if selected_property == "All Clusters" else selected_property

    # Randomize only when absolutely nothing is filtered or searched.
    no_filters_active = (
        prompt_filter is None
        and model_filter is None
        and cluster_filter is None
        and (not search_term or not str(search_term).strip())
    )

    examples = get_example_data(
        df,
        prompt_filter,
        model_filter,
        cluster_filter,
        max_examples,
        show_unexpected_behavior=show_unexpected_behavior,
        randomize=no_filters_active,
    )

    return format_examples_display(
        examples,
        selected_prompt,
        selected_model,
        selected_property,
        use_accordion=use_accordion,
        pretty_print_dicts=pretty_print_dicts,
    )
|
99 |
+
|
100 |
+
|
101 |
+
# ---------------------------------------------------------------------------
|
102 |
+
# Filter dropdown helpers for frequency comparison
|
103 |
+
# ---------------------------------------------------------------------------
|
104 |
+
|
105 |
+
def get_filter_options() -> Tuple[List[str], List[str]]:
    """Return (model choices, quality-metric choices) for the filter dropdowns.

    Both lists are prefixed with an "All ..." wildcard. Metric names are
    collected as the union of quality_score keys across every model's fine
    and coarse clusters; non-dict quality_score entries are skipped.
    """
    if not app_state["model_stats"]:
        return ["All Models"], ["All Metrics"]

    available_models = ["All Models"] + list(app_state["model_stats"].keys())

    # Union of quality-metric names seen in any cluster of any model.
    quality_metrics = set()
    for model_data in app_state["model_stats"].values():
        clusters = model_data.get("fine", []) + model_data.get("coarse", [])
        for cluster in clusters:
            quality_score = cluster.get("quality_score", {})
            if isinstance(quality_score, dict):
                quality_metrics.update(quality_score.keys())

    # sorted() accepts any iterable; wrapping the set in list() was redundant.
    available_metrics = ["All Metrics"] + sorted(quality_metrics)

    return available_models, available_metrics
|
122 |
+
|
123 |
+
|
124 |
+
def update_filter_dropdowns() -> Tuple[Any, Any]:
    """Return gr.update payloads that refresh the model/metric filter dropdowns."""
    model_choices, metric_choices = get_filter_options()
    model_default = "All Models" if model_choices else None
    metric_default = "All Metrics" if metric_choices else None
    return (
        gr.update(choices=model_choices, value=model_default),
        gr.update(choices=metric_choices, value=metric_default),
    )
|
lmmvibes/vis_gradio/frequency_tab.py
ADDED
@@ -0,0 +1,307 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Logic for the **Frequency Comparison** tab."""
|
2 |
+
from typing import List, Tuple, Dict, Any
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
from .state import app_state
|
7 |
+
|
8 |
+
|
9 |
+
# ---------------------------------------------------------------------------
|
10 |
+
# NOTE: app_state currently stores metrics under the legacy key 'model_stats'.
|
11 |
+
# During later cleanup this module will switch to 'metrics'. For now we treat
|
12 |
+
# the value as already being the new FunctionalMetrics dict.
|
13 |
+
# ---------------------------------------------------------------------------
|
14 |
+
|
15 |
+
__all__ = ["create_frequency_comparison", "create_frequency_plots"]
|
16 |
+
|
17 |
+
|
18 |
+
def create_frequency_comparison(
    selected_models: List[str],
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
    """Create frequency comparison tables for the 3 functional metrics tables.

    Args:
        selected_models: model names to include in the model-level tables.

    Returns:
        (model_cluster_df, cluster_df, model_df, info_text) where info_text
        is a markdown summary of the three row counts. When no data is loaded
        or no model is selected, three single-cell placeholder tables and an
        empty info string are returned instead.
    """
    # Guard clauses: placeholder tables when there is nothing to show.
    if not app_state["model_stats"]:
        empty_df = pd.DataFrame({"Message": ["Please load data first"]})
        return empty_df, empty_df, empty_df, ""
    if not selected_models:
        empty_df = pd.DataFrame({"Message": ["Please select at least one model"]})
        return empty_df, empty_df, empty_df, ""

    # app_state['model_stats'] holds the FunctionalMetrics dict (legacy key).
    metrics_data = app_state["model_stats"]

    # Removed the verbose DEBUG print statements that logged the full data
    # structure to stdout on every UI interaction.
    model_cluster_df = create_model_cluster_table(metrics_data, selected_models)
    cluster_df = create_cluster_table(metrics_data, selected_models)
    model_df = create_model_table(metrics_data, selected_models)

    info_text = (
        f"**Model-Cluster Scores:** {len(model_cluster_df)} rows | "
        f"**Cluster Scores:** {len(cluster_df)} rows | "
        f"**Model Scores:** {len(model_df)} rows"
    )
    return model_cluster_df, cluster_df, model_df, info_text
|
65 |
+
|
66 |
+
|
67 |
+
def create_model_cluster_table(metrics_data: Dict[str, Any], selected_models: List[str]) -> pd.DataFrame:
    """Create table for model-cluster scores.

    Builds one row per (model, cluster) pair for the selected models, with
    size, proportion metrics (as percentages), per-metric quality columns,
    confidence intervals and significance flags. "No properties" clusters
    are excluded. Leftover DEBUG print statements were removed.
    """
    model_cluster_scores = metrics_data.get("model_cluster_scores", {})

    rows = []
    for model_name, clusters in model_cluster_scores.items():
        if model_name not in selected_models:
            continue

        for cluster_name, metrics in clusters.items():
            # Filter out "No properties" clusters
            if cluster_name == "No properties":
                continue

            # Basic metrics (proportions reported as percentages)
            size = metrics.get("size", 0)
            proportion = metrics.get("proportion", 0) * 100
            proportion_delta = metrics.get("proportion_delta", 0) * 100

            # Quality metrics - one column per individual metric name
            quality = metrics.get("quality", {})
            quality_delta = metrics.get("quality_delta", {})

            row = {
                "Model": model_name,
                "Cluster": cluster_name,
                "Size": size,
                "Proportion (%)": f"{proportion:.1f}",
                "Proportion Delta (%)": f"{proportion_delta:.1f}",
            }

            for metric_name, quality_val in quality.items():
                row[f"Quality_{metric_name.title()}"] = f"{quality_val:.3f}"
            for metric_name, delta_val in quality_delta.items():
                row[f"Quality_Delta_{metric_name.title()}"] = f"{delta_val:+.3f}"

            # Confidence intervals and significance flags
            proportion_ci = metrics.get("proportion_ci", {})
            proportion_delta_ci = metrics.get("proportion_delta_ci", {})
            proportion_delta_significant = metrics.get("proportion_delta_significant", False)
            quality_delta_significant = metrics.get("quality_delta_significant", {})

            row.update({
                "Proportion CI": format_ci(proportion_ci),
                "Proportion Delta CI": format_ci(proportion_delta_ci),
                "Proportion Delta Significant": "Yes" if proportion_delta_significant else "No",
            })

            # Per-metric significance of the quality deltas
            for metric_name, is_significant in quality_delta_significant.items():
                row[f"Quality_Delta_{metric_name.title()}_Significant"] = "Yes" if is_significant else "No"

            rows.append(row)

    return pd.DataFrame(rows)
|
140 |
+
|
141 |
+
|
142 |
+
def create_cluster_table(metrics_data: Dict[str, Any], selected_models: List[str]) -> pd.DataFrame:
    """Create table for cluster scores (aggregated across all models).

    Note: *selected_models* is accepted for signature parity with the other
    table builders but does not affect the output, since cluster scores are
    already aggregated over every model. Leftover DEBUG prints and two
    unused CI-string locals were removed.
    """
    cluster_scores = metrics_data.get("cluster_scores", {})

    rows = []
    for cluster_name, metrics in cluster_scores.items():
        # Filter out "No properties" clusters
        if cluster_name == "No properties":
            continue

        # Basic metrics (proportion reported as a percentage)
        size = metrics.get("size", 0)
        proportion = metrics.get("proportion", 0) * 100

        # Quality metrics - one column per individual metric name
        quality = metrics.get("quality", {})
        quality_delta = metrics.get("quality_delta", {})

        row = {
            "Cluster": cluster_name,
            "Size": size,
            "Proportion (%)": f"{proportion:.1f}",
        }

        for metric_name, quality_val in quality.items():
            row[f"Quality_{metric_name.title()}"] = f"{quality_val:.3f}"
        for metric_name, delta_val in quality_delta.items():
            row[f"Quality_Delta_{metric_name.title()}"] = f"{delta_val:+.3f}"

        # Confidence intervals and significance flags
        proportion_ci = metrics.get("proportion_ci", {})
        quality_ci = metrics.get("quality_ci", {})
        quality_delta_ci = metrics.get("quality_delta_ci", {})
        quality_delta_significant = metrics.get("quality_delta_significant", {})

        row.update({
            "Proportion CI": format_ci(proportion_ci),
        })

        # Per-metric CI columns (only for metrics that have a CI entry)
        for metric_name in quality.keys():
            if metric_name in quality_ci:
                row[f"Quality_{metric_name.title()}_CI"] = format_ci(quality_ci[metric_name])

        for metric_name in quality_delta.keys():
            if metric_name in quality_delta_ci:
                row[f"Quality_Delta_{metric_name.title()}_CI"] = format_ci(quality_delta_ci[metric_name])
                row[f"Quality_Delta_{metric_name.title()}_Significant"] = "Yes" if quality_delta_significant.get(metric_name, False) else "No"

        rows.append(row)

    return pd.DataFrame(rows)
|
213 |
+
|
214 |
+
|
215 |
+
def create_model_table(metrics_data: Dict[str, Any], selected_models: List[str]) -> pd.DataFrame:
    """Create table for model scores (aggregated across all clusters).

    Args:
        metrics_data: FunctionalMetrics output containing a ``"model_scores"``
            mapping of model name -> per-model aggregate metrics.
        selected_models: Only models in this list produce a row; every other
            model in ``model_scores`` is skipped.

    Returns:
        A DataFrame with one row per selected model: model name, sample size,
        one formatted column per quality metric, the proportion confidence
        interval, and a CI column for every quality metric that has one.
    """
    model_scores = metrics_data.get("model_scores", {})

    print("DEBUG: Creating model table")
    print(f" - Available models in model_scores: {list(model_scores.keys())}")
    print(f" - Selected models: {selected_models}")

    rows = []
    for model_name, metrics in model_scores.items():
        # Filter by selected models
        if model_name not in selected_models:
            print(f" - Skipping {model_name} (not in selected_models)")
            continue

        print(f" - Processing {model_name}")

        # Basic metrics
        size = metrics.get("size", 0)

        # Quality metrics - one formatted column per individual metric
        quality = metrics.get("quality", {})

        row = {
            "Model": model_name,
            "Size": size,
        }

        for metric_name, quality_val in quality.items():
            row[f"Quality_{metric_name.title()}"] = f"{quality_val:.3f}"

        # Confidence intervals (quality-delta CIs are intentionally omitted
        # from the model-level table; see the cluster-level table for those).
        proportion_ci = metrics.get("proportion_ci", {})
        quality_ci = metrics.get("quality_ci", {})

        row["Proportion CI"] = format_ci(proportion_ci)

        # Add a CI column for each quality metric that has interval data.
        for metric_name in quality.keys():
            if metric_name in quality_ci:
                row[f"Quality_{metric_name.title()}_CI"] = format_ci(quality_ci[metric_name])

        rows.append(row)

    print(f" - Created {len(rows)} rows for model table")
    return pd.DataFrame(rows)
|
286 |
+
|
287 |
+
|
288 |
+
def format_ci(ci_dict: Dict[str, Any]) -> str:
    """Render a confidence-interval dict as a short display string.

    Accepts a dict with optional ``lower``/``upper``/``mean`` keys and
    returns ``"[lo, hi]"`` when both bounds exist, ``"Mean: m"`` when only
    a mean is available, and ``"N/A"`` otherwise (including non-dict or
    empty input).
    """
    if not isinstance(ci_dict, dict) or not ci_dict:
        return "N/A"

    lo = ci_dict.get("lower")
    hi = ci_dict.get("upper")
    if lo is not None and hi is not None:
        return f"[{lo:.3f}, {hi:.3f}]"

    center = ci_dict.get("mean")
    if center is not None:
        return f"Mean: {center:.3f}"

    return "N/A"
|
303 |
+
|
304 |
+
|
305 |
+
def create_frequency_plots(*_args, **_kwargs):
    """Backward-compatibility stub: the frequency plots were removed.

    Accepts and ignores any arguments; always yields an empty result pair.
    """
    return (None, None)
|
lmmvibes/vis_gradio/launcher.py
ADDED
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
CLI launcher for LMM-Vibes Gradio visualization app.
|
4 |
+
|
5 |
+
Usage:
|
6 |
+
python -m lmmvibes.vis_gradio.launcher --results_dir path/to/results
|
7 |
+
|
8 |
+
Or directly:
|
9 |
+
python lmmvibes/vis_gradio/launcher.py --results_dir path/to/results
|
10 |
+
"""
|
11 |
+
|
12 |
+
import argparse
|
13 |
+
import sys
|
14 |
+
from pathlib import Path
|
15 |
+
|
16 |
+
def main() -> None:
    """Parse CLI arguments and launch the Gradio visualization app.

    Validates the optional ``--results_dir`` path, then delegates to
    ``lmmvibes.vis_gradio.app.launch_app``. Exits with status 1 on a bad
    path, a missing dependency, or any launch failure.
    """
    parser = argparse.ArgumentParser(
        description="Launch LMM-Vibes Gradio visualization app",
        # RawDescriptionHelpFormatter preserves the epilog's manual layout.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Launch with auto-loaded data from a base results directory
  python -m lmmvibes.vis_gradio.launcher --results_dir /path/to/results

  # Launch with public sharing enabled
  python -m lmmvibes.vis_gradio.launcher --results_dir /path/to/results --share

  # Launch on specific port
  python -m lmmvibes.vis_gradio.launcher --results_dir /path/to/results --port 8080

  # Launch with automatic port selection
  python -m lmmvibes.vis_gradio.launcher --results_dir /path/to/results --auto_port

  # Launch without auto-loading (manual selection in app)
  python -m lmmvibes.vis_gradio.launcher
        """
    )

    parser.add_argument(
        "--results_dir",
        type=str,
        help="Path to base results directory containing experiment subfolders (optional - can be loaded in the app)"
    )

    parser.add_argument(
        "--share",
        action="store_true",
        help="Create a public shareable link"
    )

    parser.add_argument(
        "--server_name",
        type=str,
        default="127.0.0.1",
        help="Server address (default: 127.0.0.1)"
    )

    parser.add_argument(
        "--port",
        type=int,
        default=7860,
        help="Server port (default: 7860). Use --auto_port to automatically find an available port."
    )

    parser.add_argument(
        "--auto_port",
        action="store_true",
        help="Automatically find an available port by trying ports 8080-8089"
    )

    parser.add_argument(
        "--debug",
        action="store_true",
        help="Enable debug mode"
    )

    args = parser.parse_args()

    # Handle auto_port option
    # NOTE(review): only the starting port is set here; the 8080-8089 probing
    # presumably happens inside launch_app — confirm against app.py.
    if args.auto_port:
        # Use a high port range for auto-port mode
        args.port = 8080
        print("🔄 Auto-port mode enabled - will try ports 8080-8089")

    # Validate results directory if provided (fail fast before importing app)
    if args.results_dir:
        results_path = Path(args.results_dir)
        if not results_path.exists():
            print(f"❌ Error: Results directory does not exist: {args.results_dir}")
            sys.exit(1)
        if not results_path.is_dir():
            print(f"❌ Error: Path is not a directory: {args.results_dir}")
            sys.exit(1)

    # Import and launch the app; import is deferred so a missing gradio
    # dependency produces a friendly message instead of a traceback.
    try:
        from .app import launch_app

        print("🚀 Launching LMM-Vibes Gradio Visualization App...")
        print(f"📍 Server: http://{args.server_name}:{args.port}")
        if args.share:
            print("🌐 Public sharing enabled")

        launch_app(
            results_dir=args.results_dir,
            share=args.share,
            server_name=args.server_name,
            server_port=args.port,
            debug=args.debug
        )

    except ImportError as e:
        print(f"❌ Error: Failed to import required modules: {e}")
        print("💡 Make sure you have gradio installed: pip install gradio")
        sys.exit(1)
    except Exception as e:
        print(f"❌ Error launching app: {e}")
        sys.exit(1)
|
119 |
+
|
120 |
+
|
121 |
+
# Script entry point: only run the CLI when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|
lmmvibes/vis_gradio/load_data_tab.py
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Utilities for the "Load Data" tab β loading pipeline results and scanning for
|
3 |
+
available experiment folders.
|
4 |
+
"""
|
5 |
+
from __future__ import annotations
|
6 |
+
|
7 |
+
import os
|
8 |
+
from pathlib import Path
|
9 |
+
from typing import List, Tuple
|
10 |
+
|
11 |
+
import gradio as gr
|
12 |
+
|
13 |
+
# ---------------------------------------------------------------------------
|
14 |
+
# Loading utilities updated for FunctionalMetrics
|
15 |
+
# ---------------------------------------------------------------------------
|
16 |
+
|
17 |
+
from .state import app_state, BASE_RESULTS_DIR
|
18 |
+
from .data_loader import (
|
19 |
+
load_pipeline_results,
|
20 |
+
scan_for_result_subfolders,
|
21 |
+
validate_results_directory,
|
22 |
+
)
|
23 |
+
|
24 |
+
# Metrics helpers
|
25 |
+
from .metrics_adapter import get_all_models
|
26 |
+
|
27 |
+
__all__ = [
|
28 |
+
"load_data",
|
29 |
+
"get_available_experiments",
|
30 |
+
"get_experiment_choices",
|
31 |
+
"refresh_experiment_dropdown",
|
32 |
+
"load_experiment_data",
|
33 |
+
]
|
34 |
+
|
35 |
+
|
36 |
+
def load_data(results_dir: str) -> Tuple[str, str, str]:
    """Load pipeline results from *results_dir* and update the shared *app_state*.

    Returns a tuple of (summary_markdown, models_info_markdown, models_checkbox_update).
    """
    try:
        # 1. Validate directory structure
        is_valid, error_msg = validate_results_directory(results_dir)
        if not is_valid:
            return "", f"❌ Error: {error_msg}", ""

        # 2. Handle optional sub-folder selection (first match for now)
        # NOTE(review): when several result subfolders exist only the first is
        # loaded — confirm this is the intended disambiguation policy.
        subfolders = scan_for_result_subfolders(results_dir)
        final_dir = results_dir
        if subfolders and "." not in subfolders:
            final_dir = str(Path(results_dir) / subfolders[0])

        # 3. Load results into memory
        clustered_df, metrics, model_cluster_df, results_path = load_pipeline_results(final_dir)

        # 4. Stash in global state so other tabs can use it
        app_state["clustered_df"] = clustered_df
        app_state["metrics"] = metrics
        app_state["model_cluster_df"] = model_cluster_df
        # Temporary alias for legacy modules
        app_state["model_stats"] = metrics
        app_state["results_path"] = results_path
        app_state["available_models"] = get_all_models(metrics)
        app_state["current_results_dir"] = final_dir

        # 5. Compose status messages
        n_models = len(metrics.get("model_cluster_scores", {}))
        n_properties = len(clustered_df)

        summary = f"""
✅ **Successfully loaded pipeline results!**

**Data Summary:**
- **Models:** {n_models}
- **Properties:** {n_properties:,}
- **Results Directory:** {Path(final_dir).name}
        """
        # Check for both naming patterns for fine clusters
        # (older runs used bare column names, newer ones are prefixed).
        if ("fine_cluster_id" in clustered_df.columns or
            "property_description_fine_cluster_id" in clustered_df.columns):
            fine_id_col = ("fine_cluster_id" if "fine_cluster_id" in clustered_df.columns
                           else "property_description_fine_cluster_id")
            n_fine_clusters = clustered_df[fine_id_col].nunique()
            summary += f"\n- **Fine Clusters:** {n_fine_clusters}"

        # Check for both naming patterns for coarse clusters
        if ("coarse_cluster_id" in clustered_df.columns or
            "property_description_coarse_cluster_id" in clustered_df.columns):
            coarse_id_col = ("coarse_cluster_id" if "coarse_cluster_id" in clustered_df.columns
                             else "property_description_coarse_cluster_id")
            n_coarse_clusters = clustered_df[coarse_id_col].nunique()
            summary += f"\n- **Coarse Clusters:** {n_coarse_clusters}"

        model_choices = app_state["available_models"]
        models_info = f"Available models: {', '.join(model_choices)}"

        # Gradio update object for the CheckboxGroup
        return summary, models_info, gr.update(choices=model_choices, value=model_choices)

    except Exception as e:
        # Broad catch is deliberate: any load failure is reported in the UI
        # as markdown instead of crashing the Gradio event handler.
        error_msg = f"❌ Error loading results: {e}"
        return "", error_msg, gr.update(choices=[], value=[])
|
103 |
+
|
104 |
+
|
105 |
+
def get_available_experiments(base_dir: str) -> List[str]:
    """List experiment sub-directories of *base_dir* that hold result files.

    A directory counts as an experiment when it contains either
    ``model_stats.json`` or ``clustered_results_lightweight.jsonl``.
    Returns a sorted list of directory names; an empty list for a missing
    or empty *base_dir*.
    """
    if not base_dir or not os.path.exists(base_dir):
        return []

    marker_files = ("model_stats.json", "clustered_results_lightweight.jsonl")
    found: List[str] = []
    try:
        for entry in os.listdir(base_dir):
            candidate = os.path.join(base_dir, entry)
            if not os.path.isdir(candidate):
                continue
            if any(os.path.exists(os.path.join(candidate, marker)) for marker in marker_files):
                found.append(entry)
    except Exception as e:
        # Best-effort scan: report and fall through with whatever was found.
        print(f"Error scanning experiments: {e}")

    return sorted(found)
|
124 |
+
|
125 |
+
|
126 |
+
def get_experiment_choices() -> List[str]:
    """Build the option list for the experiment selector dropdown.

    Prepends a placeholder entry to the scanned experiment names; returns
    an empty list when no base results directory is configured.
    """
    if not BASE_RESULTS_DIR:
        return []
    return ["Select an experiment...", *get_available_experiments(BASE_RESULTS_DIR)]
|
132 |
+
|
133 |
+
|
134 |
+
def refresh_experiment_dropdown() -> gr.update:
    """Recompute the experiment list and push it into the dropdown widget."""
    return gr.update(
        choices=get_experiment_choices(),
        value="Select an experiment...",
    )
|
138 |
+
|
139 |
+
|
140 |
+
def load_experiment_data(experiment_name: str) -> Tuple[str, str, str]:
    """Gradio event wrapper: resolve the chosen experiment and delegate to ``load_data``."""
    if not BASE_RESULTS_DIR or experiment_name == "Select an experiment...":
        # Nothing selected (or no base dir configured) — clear the checkbox group.
        return "", "Please select a valid experiment", gr.update(choices=[], value=[])

    experiment_path = os.path.join(BASE_RESULTS_DIR, experiment_name)
    print(f"🔍 Loading experiment: {experiment_name} from {experiment_path}")
    return load_data(experiment_path)
|
lmmvibes/vis_gradio/metrics_adapter.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Lightweight access helpers for FunctionalMetrics score dictionaries.
|
2 |
+
|
3 |
+
The Gradio UI now receives the *raw* FunctionalMetrics output as a
|
4 |
+
```
|
5 |
+
metrics = {
|
6 |
+
"model_cluster_scores": {...},
|
7 |
+
"cluster_scores": {...},
|
8 |
+
"model_scores": {...},
|
9 |
+
}
|
10 |
+
```
|
11 |
+
This module centralises the most common look-ups so that the rest of the
|
12 |
+
codebase does *not* need to know the exact key names. If the format
|
13 |
+
changes again we only need to update these helpers.
|
14 |
+
"""
|
15 |
+
from typing import Dict, Any, List
|
16 |
+
|
17 |
+
__all__ = [
|
18 |
+
"get_model_clusters",
|
19 |
+
"get_all_models",
|
20 |
+
"get_all_clusters",
|
21 |
+
]
|
22 |
+
|
23 |
+
def get_model_clusters(metrics: Dict[str, Any], model_name: str) -> Dict[str, Any]:
    """Look up the per-cluster score dictionary for *model_name*.

    The sentinel name ``"all"`` selects ``cluster_scores`` (aggregated
    across every model); any other name indexes into
    ``model_cluster_scores``. Missing keys yield an empty dict.
    """
    if model_name == "all":
        return metrics.get("cluster_scores", {})
    per_model = metrics.get("model_cluster_scores", {})
    return per_model.get(model_name, {})
|
35 |
+
|
36 |
+
|
37 |
+
def get_all_models(metrics: Dict[str, Any]) -> List[str]:
    """List every model name in the metrics dict, with ``"all"`` prepended.

    The leading ``"all"`` entry exposes the cross-model aggregate view.
    """
    per_model = metrics.get("model_cluster_scores", {})
    return ["all", *per_model]
|
42 |
+
|
43 |
+
|
44 |
+
def get_all_clusters(metrics: Dict[str, Any]) -> List[str]:
    """Return every cluster name from the aggregated ``cluster_scores`` section."""
    return [name for name in metrics.get("cluster_scores", {})]
|
lmmvibes/vis_gradio/overview_tab.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Logic helpers for the **Overview** tab."""
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
from .state import app_state
|
5 |
+
from .utils import compute_model_rankings_new, create_model_summary_card_new
|
6 |
+
|
7 |
+
__all__ = ["create_overview"]
|
8 |
+
|
9 |
+
|
10 |
+
def create_overview(
    selected_models: List[str],
    top_n: int,
    score_significant_only: bool = False,
    quality_significant_only: bool = False,
    sort_by: str = "quality_asc",
    min_cluster_size: int = 1,
) -> str:
    """Return the HTML snippet that summarises model performance.

    Renders one summary card per selected model (via
    ``create_model_summary_card_new``), preceded by an explanatory
    collapsible legend. Returns a plain instruction string when no data
    is loaded or no model is selected.
    """
    # Guard clauses: a loaded metrics dict and a non-empty selection are
    # both required before anything can be rendered.
    if not app_state["metrics"]:
        return "Please load data first using the 'Load Data' tab."

    if not selected_models:
        return "Please select at least one model to display."

    # 1. Compute global rankings and filter to selection
    model_rankings = compute_model_rankings_new(app_state["metrics"])
    filtered_rankings = [
        (name, stats) for name, stats in model_rankings if name in selected_models
    ]

    # Sort so "all" appears first, then the rest by their rankings
    all_models = [(name, stats) for name, stats in filtered_rankings if name == "all"]
    other_models = [(name, stats) for name, stats in filtered_rankings if name != "all"]
    filtered_rankings = all_models + other_models

    if not filtered_rankings:
        return "No data available for selected models."

    # 2. Assemble HTML: static header + legend, then one card per model.
    overview_html = """
    <div style="max-width: 1600px; margin: 0 auto;">
        <p style="color: #666; margin-bottom: 10px;">
            Top distinctive clusters where each model shows unique behavioural patterns.
            Frequency shows what percentage of a model's battles resulted in that behavioural pattern.
        </p>

        <details style="margin-bottom:25px;">
            <summary style="cursor:pointer; color:#4c6ef5; font-weight:600;">ℹ️ What do "proportion delta", "Quality Δ", and significance tags mean?</summary>
            <div style="margin-top:12px; font-size:14px; line-height:1.5; color:#333;">
                <strong>Proportion Delta</strong><br>
                For each cluster we compute how often <em>this model</em> appears in that cluster compared with the average across all models.<br>
                • A positive value (e.g. <code>+0.15</code>) means the model hits the behaviour more often than average.<br>
                • A negative value (e.g. <code>-0.08</code>) means it appears less often.<br>
                It is derived from the <code>proportion_delta</code> field in <code>model_cluster_scores.json</code>.<br><br>
                <strong>Quality Δ</strong><br>
                The difference between the cluster's quality score(s) for this model and the model's <em>overall</em> quality baseline, shown for each individual metric (e.g., helpfulness, accuracy).<br>
                Positive values (green) indicate the model performs better than its average in that behaviour; negative values (red) indicate worse.<br>
                This is derived from the <code>quality_delta</code> metric dictionary in <code>model_cluster_scores.json</code>.<br><br>
                <strong>Significance Tags (FREQ/QUAL)</strong><br>
                The <span style="background: #28a745; color: white; padding: 2px 6px; border-radius: 4px; font-size: 10px; font-weight: bold;">FREQ</span> and <span style="background: #007bff; color: white; padding: 2px 6px; border-radius: 4px; font-size: 10px; font-weight: bold;">QUAL</span> tags indicate <em>statistical significance</em> based on confidence intervals:<br>
                • <strong>FREQ</strong> (green): The proportion delta is statistically significant (confidence interval doesn't include zero)<br>
                • <strong>QUAL</strong> (blue): At least one quality metric delta is statistically significant<br>
                These tags help identify which behavioral patterns are reliably different from the model's baseline performance.
            </div>
        </details>
    """

    for model_name, _ in filtered_rankings:
        card_html = create_model_summary_card_new(
            model_name,
            app_state["metrics"],
            # top_n etc.
            top_n,
            score_significant_only=score_significant_only,
            quality_significant_only=quality_significant_only,
            sort_by=sort_by,
            min_cluster_size=min_cluster_size,
        )
        overview_html += card_html

    overview_html += "</div>"
    return overview_html
|
lmmvibes/vis_gradio/plots_tab.py
ADDED
@@ -0,0 +1,284 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Plots tab for the LMM-Vibes Gradio app.
|
3 |
+
|
4 |
+
This module provides functionality to display the model cluster proportion and quality plots.
|
5 |
+
"""
|
6 |
+
|
7 |
+
import gradio as gr
|
8 |
+
import pandas as pd
|
9 |
+
import plotly.express as px
|
10 |
+
import plotly.graph_objects as go
|
11 |
+
from typing import Tuple, List
|
12 |
+
|
13 |
+
from .state import app_state
|
14 |
+
|
15 |
+
|
16 |
+
def create_proportion_plot(show_ci: bool = False) -> Tuple[go.Figure, str]:
    """Create a grouped bar plot of proportion by property and model.

    Reads ``app_state["model_cluster_df"]``, abbreviates cluster names to
    P1..Pn, and returns (figure, markdown legend). Returns (None, message)
    when no data is loaded. Also writes the figure to
    ``model_cluster_proportion_plot.html`` in the working directory.
    """
    if app_state.get("model_cluster_df") is None:
        return None, "No model cluster data loaded. Please load data first."

    model_cluster_df = app_state["model_cluster_df"]
    # Debug output: surface shape / value-range problems in the console.
    print("DataFrame shape:", model_cluster_df.shape)
    print("Columns:", model_cluster_df.columns.tolist())
    print("Proportion range:", model_cluster_df['proportion'].min(), "to", model_cluster_df['proportion'].max())
    print("Sample data:")
    print(model_cluster_df[['model', 'cluster', 'proportion']].head(10))

    if model_cluster_df.empty:
        return None, "No model cluster data available."

    # Ensure proportion values are numeric and in reasonable range
    # (copy first so the shared app_state frame is never mutated).
    model_cluster_df = model_cluster_df.copy()
    model_cluster_df['proportion'] = pd.to_numeric(model_cluster_df['proportion'], errors='coerce')

    # Check for any unreasonable values
    print("After conversion - Proportion range:", model_cluster_df['proportion'].min(), "to", model_cluster_df['proportion'].max())
    print("Proportion values > 1:", (model_cluster_df['proportion'] > 1).sum())
    print("Proportion values < 0:", (model_cluster_df['proportion'] < 0).sum())

    # Create property name mapping with proper ordering
    # NOTE(review): the mapping is built BEFORE "No properties" rows are
    # filtered below, so if that cluster exists its P-label is assigned but
    # never plotted, leaving a gap in the axis labels — confirm intended.
    unique_properties = sorted(model_cluster_df['cluster'].unique())
    property_mapping = {prop: f"P{i+1}" for i, prop in enumerate(unique_properties)}

    # Create abbreviated property column for plotting
    model_cluster_df['property_abbr'] = model_cluster_df['cluster'].map(property_mapping)

    # Filter out "No properties" clusters
    model_cluster_df = model_cluster_df[model_cluster_df['cluster'] != "No properties"]

    # Prepare confidence interval data if requested
    error_y_data = None
    if show_ci and 'proportion_ci_lower' in model_cluster_df.columns and 'proportion_ci_upper' in model_cluster_df.columns:
        # Calculate asymmetric error bar extents from the CI bounds.
        model_cluster_df['y_error'] = model_cluster_df['proportion_ci_upper'] - model_cluster_df['proportion']
        model_cluster_df['y_error_minus'] = model_cluster_df['proportion'] - model_cluster_df['proportion_ci_lower']
        # Replace NaN values with 0
        model_cluster_df['y_error'] = model_cluster_df['y_error'].fillna(0)
        model_cluster_df['y_error_minus'] = model_cluster_df['y_error_minus'].fillna(0)
        # NOTE(review): these locals only act as a "CI available" flag —
        # px.bar below is given the column NAMES, not these Series.
        error_y_data = model_cluster_df['y_error']
        error_y_minus_data = model_cluster_df['y_error_minus']

    # Create a grouped bar plot of 'proportion' by property (x) and model (hue)
    fig = px.bar(
        model_cluster_df,
        x="property_abbr",
        y="proportion",
        color="model",
        barmode="group",
        title="Proportion by Property and Model",
        labels={"proportion": "Proportion", "property_abbr": "Property", "model": "Model"},
        error_y="y_error" if error_y_data is not None else None,
        error_y_minus="y_error_minus" if error_y_data is not None else None
    )

    # Set the x-axis order to ensure P1, P2, P3, etc.
    property_order = [f"P{i+1}" for i in range(len(unique_properties))]
    fig.update_xaxes(categoryorder='array', categoryarray=property_order)
    fig.update_layout(xaxis_tickangle=45)

    # save figure to file
    # NOTE(review): writes into the process working directory as a side
    # effect on every call — confirm this is wanted in a hosted Space.
    fig.write_html("model_cluster_proportion_plot.html")

    # Create property mapping string (markdown legend shown next to the plot)
    mapping_text = "**Property Mapping:**\n\n"
    for prop, abbr in property_mapping.items():
        mapping_text += f"**{abbr}:** {prop}\n\n"

    # Add confidence interval info if enabled
    if show_ci:
        if 'proportion_ci_lower' in model_cluster_df.columns and 'proportion_ci_upper' in model_cluster_df.columns:
            mapping_text += "---\n\n**Confidence Intervals:**\n"
            mapping_text += "Error bars show 95% confidence intervals for proportion values.\n"
        else:
            mapping_text += "---\n\n**Note:** Confidence interval data not available in the loaded dataset.\n"

    return fig, mapping_text
|
97 |
+
|
98 |
+
|
99 |
+
def create_quality_plot(quality_metric: str = "helpfulness", show_ci: bool = False) -> Tuple[go.Figure, str]:
    """Create a grouped bar plot of quality by property and model.

    Mirrors ``create_proportion_plot`` but plots the ``quality_<metric>``
    column selected by *quality_metric*. Returns (figure, markdown legend)
    or (None, error message) when data / the metric column is missing.
    Writes the figure to ``model_cluster_quality_<metric>_plot.html``.
    """
    if app_state.get("model_cluster_df") is None:
        return None, "No model cluster data loaded. Please load data first."

    model_cluster_df = app_state["model_cluster_df"]

    if model_cluster_df.empty:
        return None, "No model cluster data available."

    # Check if the quality metric exists in the data
    quality_col = f"quality_{quality_metric}"
    if quality_col not in model_cluster_df.columns:
        # Get available quality metrics for better error message
        # (exclude CI / significance / delta helper columns).
        available_metrics = [col.replace("quality_", "") for col in model_cluster_df.columns
                             if col.startswith("quality_")
                             and not col.endswith(("_ci_lower", "_ci_upper", "_ci_mean", "_significant", "_delta"))]
        if not available_metrics:
            return None, f"No quality metrics found in the data. Available columns: {list(model_cluster_df.columns)}"
        return None, f"Quality metric '{quality_metric}' not found. Available metrics: {available_metrics}"

    # Create a copy for plotting (never mutate the shared app_state frame)
    plot_df = model_cluster_df.copy()

    # Ensure quality values are numeric
    plot_df[quality_col] = pd.to_numeric(plot_df[quality_col], errors='coerce')

    # Check if we have any valid quality data
    if plot_df[quality_col].isna().all():
        return None, f"No valid quality data found for metric '{quality_metric}'. All values are missing or invalid."

    # Create property name mapping with proper ordering (same as proportion plot)
    # NOTE(review): built before "No properties" is filtered — same potential
    # labelling gap as in create_proportion_plot; confirm intended.
    unique_properties = sorted(plot_df['cluster'].unique())
    property_mapping = {prop: f"P{i+1}" for i, prop in enumerate(unique_properties)}

    # Create abbreviated property column for plotting
    plot_df['property_abbr'] = plot_df['cluster'].map(property_mapping)

    # Filter out "No properties" clusters
    plot_df = plot_df[plot_df['cluster'] != "No properties"]

    # Prepare confidence interval data if requested
    error_y_data = None
    if show_ci:
        ci_lower_col = f"{quality_col}_ci_lower"
        ci_upper_col = f"{quality_col}_ci_upper"
        if ci_lower_col in plot_df.columns and ci_upper_col in plot_df.columns:
            # Calculate asymmetric error bar extents from the CI bounds.
            plot_df['y_error'] = plot_df[ci_upper_col] - plot_df[quality_col]
            plot_df['y_error_minus'] = plot_df[quality_col] - plot_df[ci_lower_col]
            # Replace NaN values with 0
            plot_df['y_error'] = plot_df['y_error'].fillna(0)
            plot_df['y_error_minus'] = plot_df['y_error_minus'].fillna(0)
            # NOTE(review): used only as a "CI available" flag below;
            # px.bar receives the column names, not these Series.
            error_y_data = plot_df['y_error']
            error_y_minus_data = plot_df['y_error_minus']

    # Create a grouped bar plot of quality by property (x) and model (hue)
    fig = px.bar(
        plot_df,
        x="property_abbr",
        y=quality_col,
        color="model",
        barmode="group",
        title=f"Quality ({quality_metric.title()}) by Property and Model",
        labels={quality_col: f"Quality ({quality_metric.title()})", "property_abbr": "Property", "model": "Model"},
        error_y="y_error" if error_y_data is not None else None,
        error_y_minus="y_error_minus" if error_y_data is not None else None
    )

    # Set the x-axis order to ensure P1, P2, P3, etc. (same as proportion plot)
    property_order = [f"P{i+1}" for i in range(len(unique_properties))]
    fig.update_xaxes(categoryorder='array', categoryarray=property_order)
    fig.update_layout(xaxis_tickangle=45)

    # save figure to file
    # NOTE(review): side-effecting write into the working directory.
    fig.write_html(f"model_cluster_quality_{quality_metric}_plot.html")

    # Create property mapping string (same as proportion plot)
    mapping_text = "**Property Mapping:**\n\n"
    for prop, abbr in property_mapping.items():
        mapping_text += f"**{abbr}:** {prop}\n\n"

    # Add confidence interval info if enabled
    if show_ci:
        ci_lower_col = f"{quality_col}_ci_lower"
        ci_upper_col = f"{quality_col}_ci_upper"
        if ci_lower_col in plot_df.columns and ci_upper_col in plot_df.columns:
            mapping_text += "---\n\n**Confidence Intervals:**\n"
            mapping_text += f"Error bars show 95% confidence intervals for {quality_metric} values.\n"
        else:
            mapping_text += "---\n\n**Note:** Confidence interval data not available for this quality metric.\n"

    return fig, mapping_text
|
192 |
+
|
193 |
+
|
194 |
+
def get_available_quality_metrics() -> List[str]:
    """Return the quality-metric names available in the loaded model-cluster DataFrame.

    Metric names come from columns of the form ``quality_<metric>``, skipping
    derived columns (CI bounds, significance flags, deltas). Falls back to a
    default metric list when no DataFrame is loaded or no quality columns exist.
    """
    defaults = ["helpfulness", "accuracy", "harmlessness", "honesty"]

    df = app_state.get("model_cluster_df")
    if df is None:
        return defaults

    # Suffixes marking derived quality columns that are not metrics themselves.
    derived_suffixes = ("_ci_lower", "_ci_upper", "_ci_mean", "_significant", "_delta")
    metrics = [
        col.replace("quality_", "")
        for col in df.columns
        if col.startswith("quality_") and not col.endswith(derived_suffixes)
    ]
    return metrics or defaults
|
212 |
+
|
213 |
+
|
214 |
+
def update_quality_metric_dropdown() -> gr.Dropdown:
    """Rebuild the quality-metric dropdown from the currently loaded data."""
    metrics = get_available_quality_metrics()
    default_choice = metrics[0] if metrics else "helpfulness"
    return gr.Dropdown(
        label="Quality Metric",
        choices=metrics,
        value=default_choice,
        info="Select which quality metric to display",
    )
|
223 |
+
|
224 |
+
|
225 |
+
def update_quality_metric_visibility(plot_type: str) -> gr.Dropdown:
    """Rebuild the quality-metric dropdown, showing it only for quality plots."""
    metrics = get_available_quality_metrics()
    default_choice = metrics[0] if metrics else "helpfulness"
    return gr.Dropdown(
        label="Quality Metric",
        choices=metrics,
        value=default_choice,
        info="Select which quality metric to display",
        visible=plot_type == "quality",
    )
|
235 |
+
|
236 |
+
|
237 |
+
def create_plot_with_toggle(plot_type: str, quality_metric: str = "helpfulness", show_ci: bool = False) -> Tuple[go.Figure, str]:
    """Dispatch to the frequency or quality plot builder based on *plot_type*.

    Returns a ``(figure, info_markdown)`` pair; for an unrecognised plot type
    the figure is ``None`` and the message names the bad value.
    """
    if plot_type == "frequency":
        return create_proportion_plot(show_ci)
    if plot_type == "quality":
        return create_quality_plot(quality_metric, show_ci)
    return None, f"Unknown plot type: {plot_type}"
|
245 |
+
|
246 |
+
|
247 |
+
def create_plots_tab() -> Tuple[gr.Plot, gr.Markdown, gr.Checkbox, gr.Dropdown, gr.Dropdown]:
    """Create the plots tab interface with a toggle between frequency and quality plots.

    Returns:
        Tuple of (plot_display, plot_info, show_ci_checkbox, plot_type_dropdown,
        quality_metric_dropdown) so the caller can wire up event handlers.
    """
    gr.Markdown("Interactive grouped bar plot showing either frequency (proportion) or quality metrics by property and model. **If the plot looks wonky, just unclick and re-click the significance checkbox to have it resize**")

    # Compute the metric list once instead of calling the (potentially
    # DataFrame-scanning) helper three times for a single dropdown.
    available_metrics = get_available_quality_metrics()

    # Plot controls in a row
    with gr.Row():
        # Plot type toggle
        plot_type_dropdown = gr.Dropdown(
            label="Plot Type",
            choices=["frequency", "quality"],
            value="frequency",
            info="Choose between frequency (proportion) or quality metrics"
        )

        # Quality metric dropdown (only visible for quality plots)
        quality_metric_dropdown = gr.Dropdown(
            label="Quality Metric",
            choices=available_metrics,
            value=available_metrics[0] if available_metrics else "helpfulness",
            info="Select which quality metric to display",
            visible=False  # Initially hidden, shown when quality is selected
        )

        # Add checkbox for confidence intervals
        show_ci_checkbox = gr.Checkbox(
            label="Show Confidence Intervals",
            value=True,
            info="Display 95% confidence intervals as error bars (if available in data)"
        )

    plot_display = gr.Plot(
        label="Model-Cluster Analysis Plot",
        value=None
    )

    plot_info = gr.Markdown("")

    return plot_display, plot_info, show_ci_checkbox, plot_type_dropdown, quality_metric_dropdown
|
lmmvibes/vis_gradio/side_by_side_display.py
ADDED
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Side-by-side display component for comparing model responses.
|
3 |
+
|
4 |
+
This module provides functionality to display two model responses side by side
|
5 |
+
for comparison, specifically designed for datasets with model_a_response and
|
6 |
+
model_b_response fields.
|
7 |
+
"""
|
8 |
+
|
9 |
+
from typing import Dict, Any, Optional
|
10 |
+
from .conversation_display import convert_to_openai_format, display_openai_conversation_html
|
11 |
+
import html
|
12 |
+
|
13 |
+
def display_side_by_side_responses(
    model_a: str,
    model_b: str,
    model_a_response: Any,
    model_b_response: Any,
    use_accordion: bool = True,
    pretty_print_dicts: bool = True,
    score: Optional[float] = None,
    winner: Optional[str] = None
) -> str:
    """
    Display two model responses side by side for comparison.

    Args:
        model_a: Name of model A
        model_b: Name of model B
        model_a_response: Response data from model A
        model_b_response: Response data from model B
        use_accordion: If True, group system and info messages in collapsible accordions
        pretty_print_dicts: If True, pretty-print embedded dictionaries
        score: Optional score for the comparison
        winner: Optional winner indication ('model_a', 'model_b', or 'tie')

    Returns:
        HTML string for side-by-side display
    """

    # Convert responses to OpenAI format.  The literal string 'N/A' is the
    # dataset's "missing response" sentinel, so it maps to no conversation.
    conversation_a = convert_to_openai_format(model_a_response) if model_a_response != 'N/A' else None
    conversation_b = convert_to_openai_format(model_b_response) if model_b_response != 'N/A' else None

    # Generate conversation HTML for each model; an empty/missing conversation
    # falls back to an italic "No response data available" placeholder.
    if conversation_a:
        html_a = display_openai_conversation_html(
            conversation_a,
            use_accordion=use_accordion,
            pretty_print_dicts=pretty_print_dicts
        )
    else:
        html_a = "<p style='color: #dc3545; font-style: italic;'>No response data available</p>"

    if conversation_b:
        html_b = display_openai_conversation_html(
            conversation_b,
            use_accordion=use_accordion,
            pretty_print_dicts=pretty_print_dicts
        )
    else:
        html_b = "<p style='color: #dc3545; font-style: italic;'>No response data available</p>"

    # Create winner badges if winner is specified.  A tie shows the same grey
    # badge on both columns; a win shows a green badge on the winning column.
    winner_badge_a = ""
    winner_badge_b = ""
    if winner:
        if winner == 'model_a':
            winner_badge_a = """
            <span style="
                background: #28a745;
                color: white;
                padding: 4px 8px;
                border-radius: 12px;
                font-size: 12px;
                font-weight: bold;
                margin-left: 10px;
            ">
                🏆 Winner
            </span>
            """
        elif winner == 'model_b':
            winner_badge_b = """
            <span style="
                background: #28a745;
                color: white;
                padding: 4px 8px;
                border-radius: 12px;
                font-size: 12px;
                font-weight: bold;
                margin-left: 10px;
            ">
                🏆 Winner
            </span>
            """
        elif winner == 'tie':
            tie_badge = """
            <span style="
                background: #6c757d;
                color: white;
                padding: 4px 8px;
                border-radius: 12px;
                font-size: 12px;
                font-weight: bold;
                margin-left: 10px;
            ">
                🤝 Tie
            </span>
            """
            winner_badge_a = tie_badge
            winner_badge_b = tie_badge

    # Add score badge if available.  Non-numeric scores (and the 'N/A'
    # sentinel) are silently skipped rather than crashing the render.
    score_info = ""
    if score is not None and score != 'N/A':
        try:
            score_val = float(score)
            # Green for non-negative scores, red for negative ones.
            score_color = '#28a745' if score_val >= 0 else '#dc3545'
            score_info = f"""
            <div style="text-align: center; margin-bottom: 15px;">
                <span style="
                    background: {score_color};
                    color: white;
                    padding: 6px 12px;
                    border-radius: 15px;
                    font-size: 14px;
                    font-weight: bold;
                ">
                    Comparison Score: {score_val:.3f}
                </span>
            </div>
            """
        except (ValueError, TypeError):
            pass

    # Create the side-by-side layout: two flex columns (A = blue tag,
    # B = orange tag) with the optional score badge centred above them.
    side_by_side_html = f"""
    <div style="margin-bottom: 20px;">
        {score_info}
        <div style="display: flex; gap: 20px; margin-top: 10px;">
            <!-- Model A Column -->
            <div style="flex: 1; border: 2px solid #e9ecef; border-radius: 8px; padding: 15px; background-color: #f8f9fa;">
                <h4 style="margin: 0 0 15px 0; padding-bottom: 10px; border-bottom: 2px solid #dee2e6; color: #495057; display: flex; align-items: center;">
                    <span style="background: #007bff; color: white; padding: 4px 8px; border-radius: 4px; font-size: 12px; margin-right: 10px;">A</span>
                    {html.escape(model_a)}
                    {winner_badge_a}
                </h4>
                <div style="font-size: 13px; line-height: 1.5;">
                    {html_a}
                </div>
            </div>

            <!-- Model B Column -->
            <div style="flex: 1; border: 2px solid #e9ecef; border-radius: 8px; padding: 15px; background-color: #f8f9fa;">
                <h4 style="margin: 0 0 15px 0; padding-bottom: 10px; border-bottom: 2px solid #dee2e6; color: #495057; display: flex; align-items: center;">
                    <span style="background: #fd7e14; color: white; padding: 4px 8px; border-radius: 4px; font-size: 12px; margin-right: 10px;">B</span>
                    {html.escape(model_b)}
                    {winner_badge_b}
                </h4>
                <div style="font-size: 13px; line-height: 1.5;">
                    {html_b}
                </div>
            </div>
        </div>
    </div>
    """

    return side_by_side_html
|
168 |
+
|
169 |
+
|
170 |
+
def is_side_by_side_dataset(example: Dict[str, Any]) -> bool:
    """Return True when *example* holds a valid side-by-side comparison.

    An example qualifies only if both ``model_a_response`` and
    ``model_b_response`` keys are present and neither value is ``None``.
    """
    return all(
        key in example and example.get(key) is not None
        for key in ('model_a_response', 'model_b_response')
    )
|
183 |
+
|
184 |
+
|
185 |
+
def extract_side_by_side_data(row: Dict[str, Any]) -> Dict[str, Any]:
    """Collect the side-by-side comparison fields from *row*.

    Missing model names fall back to generic labels, missing responses to the
    literal 'N/A' sentinel, and winner/score to ``None``.
    """
    defaults = {
        'model_a': 'Model A',
        'model_b': 'Model B',
        'model_a_response': 'N/A',
        'model_b_response': 'N/A',
        'winner': None,
        'score': None,
    }
    return {field: row.get(field, fallback) for field, fallback in defaults.items()}
|
lmmvibes/vis_gradio/state.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Shared application state for the LMM-Vibes Gradio viewer.
|
3 |
+
|
4 |
+
This module centralises mutable globals so they can be imported from any other
|
5 |
+
sub-module without circular-import problems.
|
6 |
+
"""
|
7 |
+
from typing import Any, Dict, Optional
|
8 |
+
import os
|
9 |
+
from pathlib import Path
|
10 |
+
|
11 |
+
# Global runtime state β mutable and shared across all tabs
|
12 |
+
app_state: Dict[str, Any] = {
|
13 |
+
"clustered_df": None,
|
14 |
+
# NEW canonical key for the FunctionalMetrics dict
|
15 |
+
"metrics": None,
|
16 |
+
# DEPRECATED alias kept temporarily so that untouched modules continue to work
|
17 |
+
"model_stats": None,
|
18 |
+
"results_path": None,
|
19 |
+
"available_models": [],
|
20 |
+
"current_results_dir": None,
|
21 |
+
}
|
22 |
+
|
23 |
+
# Base directory that contains experiment result folders. Can be changed at
|
24 |
+
# runtime via launch_app(results_dir=β¦). A value of None means "not set".
|
25 |
+
# Prefer persistent storage in Spaces at /data/data when available.
|
26 |
+
_default_base = "/data/data" if Path("/data/data").exists() else "data"
|
27 |
+
BASE_RESULTS_DIR: Optional[str] = os.getenv("BASE_RESULTS_DIR", _default_base)
|
lmmvibes/vis_gradio/utils.py
ADDED
@@ -0,0 +1,1673 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Utility functions for Gradio pipeline results app.
|
3 |
+
|
4 |
+
This module contains common utility functions used across different components.
|
5 |
+
"""
|
6 |
+
|
7 |
+
import numpy as np
|
8 |
+
import pandas as pd
|
9 |
+
import json
|
10 |
+
import markdown
|
11 |
+
import plotly.graph_objects as go
|
12 |
+
import plotly.express as px
|
13 |
+
from typing import Dict, List, Any, Optional, Tuple
|
14 |
+
import html
|
15 |
+
import ast
|
16 |
+
|
17 |
+
# Conversation rendering helpers are now in a dedicated module for clarity
|
18 |
+
from . import conversation_display as _convdisp
|
19 |
+
from .conversation_display import (
|
20 |
+
convert_to_openai_format,
|
21 |
+
display_openai_conversation_html,
|
22 |
+
pretty_print_embedded_dicts,
|
23 |
+
)
|
24 |
+
|
25 |
+
# NEW IMPLEMENTATION ---------------------------------------------------
|
26 |
+
from .metrics_adapter import get_model_clusters, get_all_models
|
27 |
+
|
28 |
+
# ---------------------------------------------------------------------------
|
29 |
+
# NEW helper utilities for FunctionalMetrics format
|
30 |
+
# ---------------------------------------------------------------------------
|
31 |
+
|
32 |
+
|
33 |
+
def format_confidence_interval(ci: dict | None, decimals: int = 3) -> str:
|
34 |
+
"""Return a pretty string for a CI dict of the form {"lower": x, "upper": y}."""
|
35 |
+
if not ci or not isinstance(ci, dict):
|
36 |
+
return "N/A"
|
37 |
+
lower, upper = ci.get("lower"), ci.get("upper")
|
38 |
+
if lower is None or upper is None:
|
39 |
+
return "N/A"
|
40 |
+
return f"[{lower:.{decimals}f}, {upper:.{decimals}f}]"
|
41 |
+
|
42 |
+
|
43 |
+
def get_confidence_interval_width(ci: dict | None) -> float | None:
|
44 |
+
"""Return CI width (upper-lower) if possible."""
|
45 |
+
if not ci or not isinstance(ci, dict):
|
46 |
+
return None
|
47 |
+
lower, upper = ci.get("lower"), ci.get("upper")
|
48 |
+
if lower is None or upper is None:
|
49 |
+
return None
|
50 |
+
return upper - lower
|
51 |
+
|
52 |
+
|
53 |
+
def has_confidence_intervals(record: dict | None) -> bool:
|
54 |
+
"""Simple check whether any *_ci key with lower/upper exists in a metrics record."""
|
55 |
+
if not record or not isinstance(record, dict):
|
56 |
+
return False
|
57 |
+
for k, v in record.items():
|
58 |
+
if k.endswith("_ci") and isinstance(v, dict) and {"lower", "upper"}.issubset(v.keys()):
|
59 |
+
return True
|
60 |
+
return False
|
61 |
+
|
62 |
+
|
63 |
+
def extract_quality_score(quality_field: Any) -> float | None:
|
64 |
+
"""Given a quality field that may be a dict of metric values or a scalar, return its mean."""
|
65 |
+
if quality_field is None:
|
66 |
+
return None
|
67 |
+
if isinstance(quality_field, (int, float)):
|
68 |
+
return float(quality_field)
|
69 |
+
if isinstance(quality_field, dict) and quality_field:
|
70 |
+
return float(np.mean(list(quality_field.values())))
|
71 |
+
return None
|
72 |
+
|
73 |
+
# ---------------------------------------------------------------------------
|
74 |
+
# UPDATED: get_top_clusters_for_model for FunctionalMetrics format
|
75 |
+
# ---------------------------------------------------------------------------
|
76 |
+
|
77 |
+
|
78 |
+
def get_top_clusters_for_model(metrics: Dict[str, Any], model_name: str, top_n: int = 10) -> List[Tuple[str, Dict[str, Any]]]:
    """Return the top N clusters (by salience) for a given model.

    Args:
        metrics: The FunctionalMetrics dictionary (3-file format) loaded via data_loader.
        model_name: Name of the model to inspect.
        top_n: Number of clusters to return.

    Returns:
        List of (cluster_name, cluster_dict) tuples sorted by descending proportion_delta.
    """
    clusters = get_model_clusters(metrics, model_name)
    if not clusters:
        return []

    # Drop the placeholder bucket before ranking, then sort by salience.
    ranked = sorted(
        ((name, data) for name, data in clusters.items() if name != "No properties"),
        key=lambda item: item[1].get("proportion_delta", 0),
        reverse=True,
    )
    return ranked[:top_n]
|
100 |
+
|
101 |
+
|
102 |
+
def compute_model_rankings_new(metrics: Dict[str, Any]) -> List[tuple]:
    """Rank models by their mean cluster salience (proportion_delta).

    Args:
        metrics: The FunctionalMetrics dict loaded by data_loader.

    Returns:
        List[Tuple[str, Dict[str, float]]]: (model_name, summary) pairs sorted
        by descending avg_salience. Models with no clusters left after dropping
        the "No properties" placeholder are omitted.
    """
    rankings: Dict[str, Dict[str, float]] = {}
    for model_name in get_all_models(metrics):
        clusters = get_model_clusters(metrics, model_name)
        clusters = {name: data for name, data in clusters.items() if name != "No properties"}
        if not clusters:
            continue
        deltas = [data.get("proportion_delta", 0.0) for data in clusters.values()]
        rankings[model_name] = {
            "avg_salience": float(np.mean(deltas)),
            "median_salience": float(np.median(deltas)),
            "num_clusters": len(deltas),
            "top_salience": float(max(deltas)),
            "std_salience": float(np.std(deltas)),
        }
    return sorted(rankings.items(), key=lambda item: item[1]["avg_salience"], reverse=True)
|
127 |
+
|
128 |
+
|
129 |
+
def create_model_summary_card_new(
|
130 |
+
model_name: str,
|
131 |
+
metrics: Dict[str, Any],
|
132 |
+
top_n: int = 3,
|
133 |
+
score_significant_only: bool = False,
|
134 |
+
quality_significant_only: bool = False,
|
135 |
+
sort_by: str = "quality_asc",
|
136 |
+
min_cluster_size: int = 1,
|
137 |
+
) -> str:
|
138 |
+
"""Generate a **styled** HTML summary card for a single model.
|
139 |
+
|
140 |
+
The new implementation recreates the legacy card design the user prefers:
|
141 |
+
β’ Card header with battle count
|
142 |
+
β’ Each cluster displayed as a vertically-spaced block (NOT a table)
|
143 |
+
β’ Frequency, distinctiveness factor and CI inline; quality score right-aligned
|
144 |
+
"""
|
145 |
+
|
146 |
+
clusters_dict = get_model_clusters(metrics, model_name)
|
147 |
+
if not clusters_dict:
|
148 |
+
return f"<div style='padding:20px'>No cluster data for {model_name}</div>"
|
149 |
+
|
150 |
+
# Filter out "No properties" clusters
|
151 |
+
clusters_dict = {k: v for k, v in clusters_dict.items() if k != "No properties"}
|
152 |
+
|
153 |
+
# Filter clusters ----------------------------------------------------
|
154 |
+
all_clusters = [c for c in clusters_dict.values() if c.get("size", 0) >= min_cluster_size]
|
155 |
+
|
156 |
+
if score_significant_only:
|
157 |
+
if model_name == "all":
|
158 |
+
# For "all" model, we don't have proportion_delta_significant, so skip this filter
|
159 |
+
pass
|
160 |
+
else:
|
161 |
+
all_clusters = [c for c in all_clusters if c.get("proportion_delta_significant", False)]
|
162 |
+
if quality_significant_only:
|
163 |
+
all_clusters = [c for c in all_clusters if any(c.get("quality_delta_significant", {}).values())]
|
164 |
+
|
165 |
+
if not all_clusters:
|
166 |
+
return f"<div style='padding:20px'>No clusters pass filters for {model_name}</div>"
|
167 |
+
|
168 |
+
# Count significant properties ---------------------------------------
|
169 |
+
significant_frequency_count = 0
|
170 |
+
significant_quality_count = 0
|
171 |
+
|
172 |
+
for cluster in clusters_dict.values():
|
173 |
+
if cluster.get("size", 0) >= min_cluster_size:
|
174 |
+
# Count frequency significance
|
175 |
+
if model_name != "all" and cluster.get("proportion_delta_significant", False):
|
176 |
+
significant_frequency_count += 1
|
177 |
+
|
178 |
+
# Count quality significance (sum across all metrics)
|
179 |
+
quality_delta_significant = cluster.get("quality_delta_significant", {})
|
180 |
+
significant_quality_count += sum(quality_delta_significant.values())
|
181 |
+
|
182 |
+
# Sort ---------------------------------------------------------------
|
183 |
+
def _mean_quality(c: dict[str, Any]) -> float:
|
184 |
+
vals = list(c.get("quality", {}).values())
|
185 |
+
return float(np.mean(vals)) if vals else 0.0
|
186 |
+
|
187 |
+
sort_key_map = {
|
188 |
+
"quality_asc": (_mean_quality, False),
|
189 |
+
"quality_desc": (_mean_quality, True),
|
190 |
+
"frequency_desc": (lambda c: c.get("proportion", 0), True),
|
191 |
+
"frequency_asc": (lambda c: c.get("proportion", 0), False),
|
192 |
+
"salience_desc": (lambda c: c.get("proportion_delta", 0) if model_name != "all" else c.get("proportion", 0), True),
|
193 |
+
"salience_asc": (lambda c: c.get("proportion_delta", 0) if model_name != "all" else c.get("proportion", 0), False),
|
194 |
+
}
|
195 |
+
|
196 |
+
key_fn, reverse = sort_key_map.get(sort_by, (lambda c: c.get("proportion_delta", 0) if model_name != "all" else c.get("proportion", 0), True))
|
197 |
+
sorted_clusters = sorted(all_clusters, key=key_fn, reverse=reverse)[:top_n]
|
198 |
+
|
199 |
+
# Determine total conversations for this model ----------------
|
200 |
+
if model_name == "all":
|
201 |
+
# For "all" model, sum the individual model totals to avoid double-counting
|
202 |
+
model_scores = metrics.get("model_scores", {})
|
203 |
+
total_battles = sum(model_data.get("size", 0) for model_data in model_scores.values())
|
204 |
+
else:
|
205 |
+
model_scores_entry = metrics.get("model_scores", {}).get(model_name, {})
|
206 |
+
total_battles = model_scores_entry.get("size")
|
207 |
+
if total_battles is None:
|
208 |
+
# Fallback: deduplicate example IDs across clusters
|
209 |
+
total_battles = sum(c.get("size", 0) for c in clusters_dict.values())
|
210 |
+
|
211 |
+
# Card header --------------------------------------------------------
|
212 |
+
html_parts: list[str] = [f"""
|
213 |
+
<div style="padding: 20px; border:1px solid #e0e0e0; border-radius:8px; margin-bottom:25px;">
|
214 |
+
<h3 style="margin-top:0; font-size: 20px;">{html.escape(model_name)}</h3>
|
215 |
+
<p style="margin: 4px 0 8px 0; color:#555; font-size:13px;">
|
216 |
+
{total_battles} battles Β· Top clusters by frequency
|
217 |
+
</p>
|
218 |
+
<p style="margin: 0 0 18px 0; color:#666; font-size:12px;">
|
219 |
+
π {significant_frequency_count} significant frequency properties Β· {significant_quality_count} significant quality properties
|
220 |
+
</p>
|
221 |
+
"""]
|
222 |
+
|
223 |
+
# Cluster blocks -----------------------------------------------------
|
224 |
+
for i, cluster in enumerate(sorted_clusters):
|
225 |
+
name = html.escape(next(k for k, v in clusters_dict.items() if v is cluster))
|
226 |
+
prop = cluster.get("proportion", 0)
|
227 |
+
freq_pct = prop * 100
|
228 |
+
size = cluster.get("size", 0)
|
229 |
+
|
230 |
+
# Check significance flags
|
231 |
+
is_proportion_significant = False
|
232 |
+
if model_name != "all":
|
233 |
+
is_proportion_significant = cluster.get("proportion_delta_significant", False)
|
234 |
+
|
235 |
+
quality_delta_significant = cluster.get("quality_delta_significant", {})
|
236 |
+
is_quality_significant = any(quality_delta_significant.values())
|
237 |
+
|
238 |
+
# Create significance indicators
|
239 |
+
significance_indicators = []
|
240 |
+
if is_proportion_significant:
|
241 |
+
significance_indicators.append('<span style="background: #28a745; color: white; padding: 2px 6px; border-radius: 4px; font-size: 10px; font-weight: bold;">FREQ</span>')
|
242 |
+
if is_quality_significant:
|
243 |
+
significance_indicators.append('<span style="background: #007bff; color: white; padding: 2px 6px; border-radius: 4px; font-size: 10px; font-weight: bold;">QUAL</span>')
|
244 |
+
|
245 |
+
significance_html = " ".join(significance_indicators) if significance_indicators else ""
|
246 |
+
|
247 |
+
# Distinctiveness factor heuristic
|
248 |
+
if model_name == "all":
|
249 |
+
# For "all" model, proportion_delta doesn't make sense, so show proportion instead
|
250 |
+
distinct_factor = prop
|
251 |
+
distinct_text = f"{freq_pct:.1f}% of all conversations"
|
252 |
+
else:
|
253 |
+
sal = cluster.get("proportion_delta", 0)
|
254 |
+
distinct_factor = 1 + (sal / prop) if prop else 1
|
255 |
+
distinct_text = f"proportion delta: {sal:+.3f}"
|
256 |
+
|
257 |
+
# Confidence interval (frequency based)
|
258 |
+
ci = cluster.get("proportion_ci")
|
259 |
+
ci_str = format_confidence_interval(ci) if ci else "N/A"
|
260 |
+
|
261 |
+
# Quality delta β show each metric separately
|
262 |
+
quality_delta = cluster.get("quality_delta", {})
|
263 |
+
quality_delta_html = ""
|
264 |
+
|
265 |
+
if quality_delta:
|
266 |
+
quality_delta_parts = []
|
267 |
+
for metric_name, delta_value in quality_delta.items():
|
268 |
+
color = "#28a745" if delta_value >= 0 else "#dc3545"
|
269 |
+
quality_delta_parts.append(f'<div style="color:{color}; font-weight:500;">{metric_name}: {delta_value:+.3f}</div>')
|
270 |
+
quality_delta_html = "".join(quality_delta_parts)
|
271 |
+
else:
|
272 |
+
quality_delta_html = '<span style="color:#666;">No quality data</span>'
|
273 |
+
|
274 |
+
# Get light color for this cluster
|
275 |
+
cluster_color = get_light_color_for_cluster(name, i)
|
276 |
+
|
277 |
+
html_parts.append(f"""
|
278 |
+
<div style="border-left: 4px solid #4c6ef5; padding: 12px 16px; margin-bottom: 10px; background:{cluster_color}; border-radius: 4px;">
|
279 |
+
<div style="display:flex; justify-content:space-between; align-items:flex-start;">
|
280 |
+
<div style="max-width:80%;">
|
281 |
+
<div style="margin-bottom:4px;">
|
282 |
+
<strong style="font-size:14px;">{name}</strong>
|
283 |
+
</div>
|
284 |
+
<span style="font-size:12px; color:#555;">{freq_pct:.1f}% frequency ({size} out of {total_battles} total) Β· {distinct_text}</span>
|
285 |
+
</div>
|
286 |
+
<div style="font-size:12px; font-weight:normal; white-space:nowrap; text-align:right;">
|
287 |
+
{quality_delta_html}
|
288 |
+
{significance_html}
|
289 |
+
</div>
|
290 |
+
</div>
|
291 |
+
</div>
|
292 |
+
""")
|
293 |
+
|
294 |
+
# Close card div -----------------------------------------------------
|
295 |
+
html_parts.append("</div>")
|
296 |
+
|
297 |
+
return "\n".join(html_parts)
|
298 |
+
|
299 |
+
|
300 |
+
def format_cluster_dataframe(clustered_df: pd.DataFrame,
                             selected_models: Optional[List[str]] = None,
                             cluster_level: str = 'fine') -> pd.DataFrame:
    """Format cluster DataFrame for display in Gradio.

    Parameters
    ----------
    clustered_df:
        Raw clustered results, one row per extracted property.
    selected_models:
        If given, keep only rows whose ``model`` value is in this list.
    cluster_level:
        ``'fine'`` or ``'coarse'`` — picks which cluster id/label columns
        to include (both the pipeline-prefixed and the bare naming pattern
        are tried).

    Returns
    -------
    pd.DataFrame
        A copy of the input restricted to the display columns that
        actually exist in the frame.
    """
    df = clustered_df.copy()

    # Debug information
    print(f"DEBUG: format_cluster_dataframe called")
    print(f" - Input DataFrame shape: {df.shape}")
    print(f" - Selected models: {selected_models}")
    print(f" - Available models in data: {df['model'].unique().tolist() if 'model' in df.columns else 'No model column'}")

    # Filter by models if specified.  Guard on the column's existence: the
    # original filter raised KeyError on frames without a 'model' column,
    # even though the debug print above already anticipates that case.
    if selected_models and 'model' in df.columns:
        print(f" - Filtering by {len(selected_models)} selected models")
        df = df[df['model'].isin(selected_models)]
        print(f" - After filtering shape: {df.shape}")
        print(f" - Models after filtering: {df['model'].unique().tolist()}")
    else:
        print(f" - No model filtering applied")

    # Select relevant columns based on cluster level using correct column names from pipeline
    if cluster_level == 'fine':
        id_col = 'property_description_fine_cluster_id'
        label_col = 'property_description_fine_cluster_label'
        # Also check for alternative naming without prefix
        alt_id_col = 'fine_cluster_id'
        alt_label_col = 'fine_cluster_label'
    else:
        id_col = 'property_description_coarse_cluster_id'
        label_col = 'property_description_coarse_cluster_label'
        # Also check for alternative naming without prefix
        alt_id_col = 'coarse_cluster_id'
        alt_label_col = 'coarse_cluster_label'

    # Try both naming patterns
    if id_col in df.columns and label_col in df.columns:
        # Use the expected naming pattern
        cols = ['question_id', 'model', 'property_description', id_col, label_col, 'score']
    elif alt_id_col in df.columns and alt_label_col in df.columns:
        # Use the alternative naming pattern
        cols = ['question_id', 'model', 'property_description', alt_id_col, alt_label_col, 'score']
    else:
        # Fall back to basic columns if cluster columns are missing
        cols = ['question_id', 'model', 'property_description', 'score']

    # Keep only existing columns
    available_cols = [col for col in cols if col in df.columns]
    df = df[available_cols]

    print(f" - Final DataFrame shape: {df.shape}")
    print(f" - Final columns: {df.columns.tolist()}")

    return df
|
354 |
+
|
355 |
+
|
356 |
+
def truncate_cluster_name(cluster_desc: str, max_length: int = 50) -> str:
    """Shorten a cluster description so it fits in a table column.

    Descriptions longer than ``max_length`` are cut to ``max_length - 3``
    characters and suffixed with ``"..."``; shorter ones pass through
    unchanged.
    """
    needs_cut = len(cluster_desc) > max_length
    return f"{cluster_desc[:max_length - 3]}..." if needs_cut else cluster_desc
|
361 |
+
|
362 |
+
def create_frequency_comparison_table(model_stats: Dict[str, Any],
|
363 |
+
selected_models: List[str],
|
364 |
+
cluster_level: str = "fine", # Ignored β kept for backward-compat
|
365 |
+
top_n: int = 50,
|
366 |
+
selected_model: str | None = None,
|
367 |
+
selected_quality_metric: str | None = None) -> pd.DataFrame:
|
368 |
+
"""Create a comparison table for the new FunctionalMetrics format.
|
369 |
+
|
370 |
+
The old signature is kept (cluster_level arg is ignored) so that callers
|
371 |
+
can be updated incrementally.
|
372 |
+
"""
|
373 |
+
|
374 |
+
if not selected_models:
|
375 |
+
return pd.DataFrame()
|
376 |
+
|
377 |
+
# ------------------------------------------------------------------
|
378 |
+
# 1. Collect per-model, per-cluster rows
|
379 |
+
# ------------------------------------------------------------------
|
380 |
+
all_rows: List[dict] = []
|
381 |
+
for model in selected_models:
|
382 |
+
model_clusters = get_model_clusters(model_stats, model) # type: ignore[arg-type]
|
383 |
+
if not model_clusters:
|
384 |
+
continue
|
385 |
+
|
386 |
+
# Optional filter by a single model after the fact
|
387 |
+
if selected_model and model != selected_model:
|
388 |
+
continue
|
389 |
+
|
390 |
+
for cluster_name, cdata in model_clusters.items():
|
391 |
+
# Filter out "No properties" clusters
|
392 |
+
if cluster_name == "No properties":
|
393 |
+
continue
|
394 |
+
|
395 |
+
# Basic numbers
|
396 |
+
freq_pct = cdata.get("proportion", 0.0) * 100.0
|
397 |
+
prop_ci = cdata.get("proportion_ci")
|
398 |
+
|
399 |
+
# Quality per metric dicts ------------------------------------------------
|
400 |
+
quality_dict = cdata.get("quality", {}) or {}
|
401 |
+
quality_ci_dict = cdata.get("quality_ci", {}) or {}
|
402 |
+
|
403 |
+
# Significance flags
|
404 |
+
sal_sig = bool(cdata.get("proportion_delta_significant", False))
|
405 |
+
quality_sig_flags = cdata.get("quality_delta_significant", {}) or {}
|
406 |
+
|
407 |
+
all_rows.append({
|
408 |
+
"cluster": cluster_name,
|
409 |
+
"model": model,
|
410 |
+
"frequency": freq_pct,
|
411 |
+
"proportion_ci": prop_ci,
|
412 |
+
"quality": quality_dict,
|
413 |
+
"quality_ci": quality_ci_dict,
|
414 |
+
"score_significant": sal_sig,
|
415 |
+
"quality_significant_any": any(quality_sig_flags.values()),
|
416 |
+
"quality_significant_metric": quality_sig_flags.get(selected_quality_metric) if selected_quality_metric else None,
|
417 |
+
})
|
418 |
+
|
419 |
+
if not all_rows:
|
420 |
+
return pd.DataFrame()
|
421 |
+
|
422 |
+
df_all = pd.DataFrame(all_rows)
|
423 |
+
|
424 |
+
# Aggregate frequency across models ----------------------------------
|
425 |
+
freq_sum = df_all.groupby("cluster")["frequency"].sum().sort_values(ascending=False)
|
426 |
+
top_clusters = freq_sum.head(top_n).index.tolist()
|
427 |
+
|
428 |
+
df_top = df_all[df_all["cluster"].isin(top_clusters)].copy()
|
429 |
+
|
430 |
+
table_rows: List[dict] = []
|
431 |
+
for clu in top_clusters:
|
432 |
+
subset = df_top[df_top["cluster"] == clu]
|
433 |
+
avg_freq = subset["frequency"].mean()
|
434 |
+
|
435 |
+
# Aggregate CI (mean of bounds)
|
436 |
+
ci_lowers = [ci.get("lower") for ci in subset["proportion_ci"] if isinstance(ci, dict)]
|
437 |
+
ci_uppers = [ci.get("upper") for ci in subset["proportion_ci"] if isinstance(ci, dict)]
|
438 |
+
freq_ci = {
|
439 |
+
"lower": float(np.mean(ci_lowers)) if ci_lowers else None,
|
440 |
+
"upper": float(np.mean(ci_uppers)) if ci_uppers else None,
|
441 |
+
} if ci_lowers and ci_uppers else None
|
442 |
+
|
443 |
+
# Quality aggregation -----------------------------------------------------
|
444 |
+
q_vals: List[float] = []
|
445 |
+
q_ci_l: List[float] = []
|
446 |
+
q_ci_u: List[float] = []
|
447 |
+
quality_sig_any = False
|
448 |
+
for _, row in subset.iterrows():
|
449 |
+
q_dict = row["quality"]
|
450 |
+
if selected_quality_metric:
|
451 |
+
if selected_quality_metric in q_dict:
|
452 |
+
q_vals.append(q_dict[selected_quality_metric])
|
453 |
+
ci_metric = row["quality_ci"].get(selected_quality_metric) if isinstance(row["quality_ci"], dict) else None
|
454 |
+
if ci_metric:
|
455 |
+
q_ci_l.append(ci_metric.get("lower"))
|
456 |
+
q_ci_u.append(ci_metric.get("upper"))
|
457 |
+
quality_sig_any = quality_sig_any or bool(row["quality_significant_metric"])
|
458 |
+
else:
|
459 |
+
q_vals.extend(q_dict.values())
|
460 |
+
for ci in row["quality_ci"].values():
|
461 |
+
if isinstance(ci, dict):
|
462 |
+
q_ci_l.append(ci.get("lower"))
|
463 |
+
q_ci_u.append(ci.get("upper"))
|
464 |
+
quality_sig_any = quality_sig_any or row["quality_significant_any"]
|
465 |
+
|
466 |
+
quality_val = float(np.mean(q_vals)) if q_vals else None
|
467 |
+
quality_ci = {
|
468 |
+
"lower": float(np.mean(q_ci_l)),
|
469 |
+
"upper": float(np.mean(q_ci_u)),
|
470 |
+
} if q_ci_l and q_ci_u else None
|
471 |
+
|
472 |
+
score_sig = subset["score_significant"].any()
|
473 |
+
|
474 |
+
table_rows.append({
|
475 |
+
"Cluster": clu,
|
476 |
+
"Frequency (%)": f"{avg_freq:.1f}",
|
477 |
+
"Freq CI": format_confidence_interval(freq_ci),
|
478 |
+
"Quality": f"{quality_val:.3f}" if quality_val is not None else "N/A",
|
479 |
+
"Quality CI": format_confidence_interval(quality_ci) if quality_ci else "N/A",
|
480 |
+
"Score Significance": "Yes" if score_sig else "No",
|
481 |
+
"Quality Significance": "Yes" if quality_sig_any else "No",
|
482 |
+
})
|
483 |
+
|
484 |
+
return pd.DataFrame(table_rows)
|
485 |
+
|
486 |
+
|
487 |
+
def create_frequency_comparison_plots(model_stats: Dict[str, Any],
                                      selected_models: List[str],
                                      cluster_level: str = 'fine',
                                      top_n: int = 50,
                                      show_confidence_intervals: bool = False) -> Tuple[go.Figure, go.Figure]:
    """Create frequency comparison plots (matching frequencies_tab.py exactly).

    Builds two horizontal bar charts:

    1. a grouped-bar figure of per-model cluster frequencies (%), and
    2. a single-bar quality-score figure aligned to the same cluster order.

    NOTE(review): unlike ``create_frequency_comparison_table``, this function
    reads ``model_stats`` directly and assumes each model entry maps
    ``cluster_level`` ('fine'/'coarse') to a *list* of cluster dicts with
    'property_description', 'proportion', 'size', etc. — confirm callers pass
    that legacy shape.

    Parameters:
        model_stats: model name -> stats dict (shape assumed above).
        selected_models: only these models are plotted.
        cluster_level: which granularity key to read from each model entry.
        top_n: maximum number of clusters shown (ranked by summed frequency,
            then re-ordered by average quality when quality data exists).
        show_confidence_intervals: toggles visibility of error bars on both
            figures (the bars are always attached, just hidden).

    Returns:
        (frequency_figure, quality_figure). Empty annotated figures are
        returned when no data survives filtering.
    """

    print(f"\nDEBUG: Plotting function called with:")
    print(f" - Selected models: {selected_models}")
    print(f" - Cluster level: {cluster_level}")
    print(f" - Top N: {top_n}")
    print(f" - Available models in stats: {list(model_stats.keys())}")

    # Use the same data preparation logic as the table function
    # Collect all clusters across all models for the chart (exact copy from frequencies_tab.py)
    all_clusters_data = []
    for model_name, model_data in model_stats.items():
        if model_name not in selected_models:
            continue

        clusters = model_data.get(cluster_level, [])
        for cluster in clusters:
            # Filter out "No properties" clusters
            if cluster.get('property_description') == "No properties":
                continue

            # Get confidence intervals for quality scores if available
            quality_score_ci = cluster.get('quality_score_ci', {})
            has_quality_ci = bool(quality_score_ci)

            # Get distinctiveness score confidence intervals (correct structure)
            score_ci = cluster.get('score_ci', {})
            ci_lower = score_ci.get('lower') if score_ci else None
            ci_upper = score_ci.get('upper') if score_ci else None

            all_clusters_data.append({
                'property_description': cluster['property_description'],
                'model': model_name,
                'frequency': cluster.get('proportion', 0) * 100,  # Convert to percentage
                'size': cluster.get('size', 0),
                'cluster_size_global': cluster.get('cluster_size_global', 0),
                'has_ci': has_confidence_intervals(cluster),
                'ci_lower': ci_lower,
                'ci_upper': ci_upper,
                'has_quality_ci': has_quality_ci
            })

    if not all_clusters_data:
        # Return empty figures
        empty_fig = go.Figure()
        empty_fig.add_annotation(text="No data available", xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False)
        return empty_fig, empty_fig

    clusters_df = pd.DataFrame(all_clusters_data)

    # Get all unique clusters for the chart
    all_unique_clusters = clusters_df['property_description'].unique()
    total_clusters = len(all_unique_clusters)

    # Show all clusters by default
    top_n_for_chart = min(top_n, total_clusters)

    # Calculate total frequency per cluster and get top clusters
    cluster_totals = clusters_df.groupby('property_description')['frequency'].sum().sort_values(ascending=False)
    top_clusters = cluster_totals.head(top_n_for_chart).index.tolist()

    # Get quality scores for the same clusters to sort by quality
    quality_data_for_sorting = []
    for model_name, model_data in model_stats.items():
        if model_name not in selected_models:
            continue
        clusters = model_data.get(cluster_level, [])
        for cluster in clusters:
            # Filter out "No properties" clusters
            if cluster.get('property_description') == "No properties":
                continue

            if cluster['property_description'] in top_clusters:
                quality_data_for_sorting.append({
                    'property_description': cluster['property_description'],
                    'quality_score': extract_quality_score(cluster.get('quality_score', 0))
                })

    # Calculate average quality score per cluster and sort
    if quality_data_for_sorting:
        quality_df_for_sorting = pd.DataFrame(quality_data_for_sorting)
        avg_quality_per_cluster = quality_df_for_sorting.groupby('property_description')['quality_score'].mean().sort_values(ascending=True)  # Low to high
        top_clusters = avg_quality_per_cluster.index.tolist()
        # Reverse the order so low quality appears at top of chart
        top_clusters = top_clusters[::-1]

    # Filter data to only include top clusters
    chart_data = clusters_df[clusters_df['property_description'].isin(top_clusters)]

    if chart_data.empty:
        # Return empty figures
        empty_fig = go.Figure()
        empty_fig.add_annotation(text="No data available", xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False)
        return empty_fig, empty_fig

    # Get unique models for colors
    models = chart_data['model'].unique()
    # Use a color palette that avoids yellow - using Set1 which has better contrast
    colors = px.colors.qualitative.Set1[:len(models)]

    # Create horizontal bar chart for frequencies
    fig = go.Figure()

    # Add a bar for each model.  NOTE: the loop variable `model_data` below
    # shadows the dict value used in the collection loops above — here it is
    # a per-model DataFrame slice.
    for i, model in enumerate(models):
        model_data = chart_data[chart_data['model'] == model]

        # Sort by cluster order (same as top_clusters); reindex inserts NaN
        # rows for clusters this model doesn't appear in.
        model_data = model_data.set_index('property_description').reindex(top_clusters).reset_index()

        # Fill NaN values with 0 for missing clusters
        model_data['frequency'] = model_data['frequency'].fillna(0)
        model_data['has_ci'] = model_data['has_ci'].fillna(False)
        # For CI columns, replace NaN with None using where() instead of fillna(None)
        model_data['ci_lower'] = model_data['ci_lower'].where(pd.notna(model_data['ci_lower']), None)
        model_data['ci_upper'] = model_data['ci_upper'].where(pd.notna(model_data['ci_upper']), None)

        # Ensure frequency is numeric and non-negative
        model_data['frequency'] = pd.to_numeric(model_data['frequency'], errors='coerce').fillna(0)
        model_data['frequency'] = model_data['frequency'].clip(lower=0)

        # Debug: print model data for first model
        if i == 0:  # Only print for first model to avoid spam
            print(f"DEBUG: Model {model} data sample:")
            print(f" - Clusters: {len(model_data)}")
            print(f" - Frequency range: {model_data['frequency'].min():.2f} - {model_data['frequency'].max():.2f}")
            print(f" - Non-zero frequencies: {(model_data['frequency'] > 0).sum()}")
            if len(model_data) > 0:
                print(f" - Sample row: {model_data.iloc[0][['property_description', 'frequency']].to_dict()}")

        # Remove any rows where property_description is NaN (these are clusters this model doesn't appear in)
        model_data = model_data.dropna(subset=['property_description'])

        # Get confidence intervals for error bars
        ci_lower = []
        ci_upper = []
        for _, row in model_data.iterrows():
            freq_value = row.get('frequency', 0)
            if (row.get('has_ci', False) and
                pd.notna(row.get('ci_lower')) and
                pd.notna(row.get('ci_upper')) and
                freq_value > 0):  # Only calculate CIs for non-zero frequencies

                # IMPORTANT: These are distinctiveness score CIs, not frequency CIs
                # The distinctiveness score measures how much more/less frequently
                # a model exhibits this behavior compared to the median model
                # We can use this to estimate uncertainty in the frequency measurement
                distinctiveness_ci_width = row['ci_upper'] - row['ci_lower']

                # Convert to frequency uncertainty (approximate)
                # A wider distinctiveness CI suggests more uncertainty in the frequency
                # NOTE(review): 0.1 is a heuristic scaling constant, not a
                # statistically derived factor.
                freq_uncertainty = distinctiveness_ci_width * freq_value * 0.1
                ci_lower.append(max(0, freq_value - freq_uncertainty))
                ci_upper.append(freq_value + freq_uncertainty)
            else:
                ci_lower.append(None)
                ci_upper.append(None)

        # Debug: Check the data going into the plot
        print(f"DEBUG: Adding trace for model {model}:")
        print(f" - Y values (clusters): {model_data['property_description'].tolist()[:3]}...")  # First 3 clusters
        print(f" - X values (frequencies): {model_data['frequency'].tolist()[:3]}...")  # First 3 frequencies
        print(f" - Total data points: {len(model_data)}")

        fig.add_trace(go.Bar(
            y=model_data['property_description'],
            x=model_data['frequency'],
            name=model,
            orientation='h',
            marker_color=colors[i],
            # Asymmetric error bars: array = upper span, arrayminus = lower span.
            error_x=dict(
                type='data',
                array=[u - l if u is not None and l is not None else None for l, u in zip(ci_lower, ci_upper)],
                arrayminus=[f - l if f is not None and l is not None else None for f, l in zip(model_data['frequency'], ci_lower)],
                visible=show_confidence_intervals,
                thickness=1,
                width=3,
                color='rgba(0,0,0,0.3)'
            ),
            hovertemplate='<b>%{y}</b><br>' +
                          f'Model: {model}<br>' +
                          'Frequency: %{x:.1f}%<br>' +
                          'CI: %{customdata[0]}<extra></extra>',
            customdata=[[
                format_confidence_interval({
                    'lower': l,
                    'upper': u
                }) if l is not None and u is not None else "N/A"
                for l, u in zip(ci_lower, ci_upper)
            ]]
        ))

    # Update layout
    fig.update_layout(
        title=f"Model Frequencies in Top {len(top_clusters)} Clusters",
        xaxis_title="Frequency (%)",
        yaxis_title="Cluster Description",
        barmode='group',  # Group bars side by side
        height=max(600, len(top_clusters) * 25),  # Adjust height based on number of clusters
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        )
    )

    # Update y-axis to show truncated cluster names
    fig.update_yaxes(
        tickmode='array',
        ticktext=[truncate_cluster_name(desc, 60) for desc in top_clusters],
        tickvals=top_clusters
    )

    # Create quality score chart
    # Get quality scores for the same clusters (single score per cluster)
    quality_data = []
    quality_cis = []  # Add confidence intervals for quality scores

    for cluster_desc in top_clusters:
        # Get the first available quality score for this cluster.
        # The breaks below stop at the first model/cluster match, so the
        # quality chart reflects whichever model happens to come first in
        # model_stats iteration order.
        for model_name, model_data in model_stats.items():
            clusters = model_data.get(cluster_level, [])
            for cluster in clusters:
                if cluster['property_description'] == cluster_desc:
                    quality_score = extract_quality_score(cluster.get('quality_score', 0))
                    quality_data.append({
                        'property_description': cluster_desc,
                        'quality_score': quality_score
                    })

                    # Get quality score confidence intervals
                    quality_ci = cluster.get('quality_score_ci', {})
                    if isinstance(quality_ci, dict) and quality_ci:
                        # Get the first available quality CI; the for/else
                        # appends a None entry only when no dict-valued CI
                        # exists at all.
                        for score_key, ci_data in quality_ci.items():
                            if isinstance(ci_data, dict):
                                ci_lower = ci_data.get('lower')
                                ci_upper = ci_data.get('upper')
                                if ci_lower is not None and ci_upper is not None:
                                    quality_cis.append({
                                        'property_description': cluster_desc,
                                        'ci_lower': ci_lower,
                                        'ci_upper': ci_upper
                                    })
                                break
                        else:
                            quality_cis.append({
                                'property_description': cluster_desc,
                                'ci_lower': None,
                                'ci_upper': None
                            })
                    else:
                        quality_cis.append({
                            'property_description': cluster_desc,
                            'ci_lower': None,
                            'ci_upper': None
                        })
                    break
            if any(q['property_description'] == cluster_desc for q in quality_data):
                break

    if quality_data:
        quality_df = pd.DataFrame(quality_data)
        quality_cis_df = pd.DataFrame(quality_cis) if quality_cis else None

        # Create quality score chart with single bars
        fig_quality = go.Figure()

        # Prepare confidence intervals for error bars (first matching CI row
        # per cluster, None when absent).
        ci_lower = []
        ci_upper = []
        for _, row in quality_df.iterrows():
            cluster_desc = row['property_description']
            if quality_cis_df is not None:
                ci_row = quality_cis_df[quality_cis_df['property_description'] == cluster_desc]
                if not ci_row.empty:
                    ci_lower.append(ci_row.iloc[0]['ci_lower'])
                    ci_upper.append(ci_row.iloc[0]['ci_upper'])
                else:
                    ci_lower.append(None)
                    ci_upper.append(None)
            else:
                ci_lower.append(None)
                ci_upper.append(None)

        # Add a single bar for each cluster
        fig_quality.add_trace(go.Bar(
            y=[truncate_cluster_name(desc, 60) for desc in quality_df['property_description']],
            x=quality_df['quality_score'],
            orientation='h',
            marker_color='lightblue',  # Single color for all bars
            name='Quality Score',
            showlegend=False,
            error_x=dict(
                type='data',
                array=[u - l if u is not None and l is not None else None for l, u in zip(ci_lower, ci_upper)],
                arrayminus=[q - l if q is not None and l is not None else None for q, l in zip(quality_df['quality_score'], ci_lower)],
                visible=show_confidence_intervals,
                thickness=1,
                width=3,
                color='rgba(0,0,0,0.3)'
            ),
            hovertemplate='<b>%{y}</b><br>' +
                          'Quality Score: %{x:.3f}<br>' +
                          'CI: %{customdata[0]}<extra></extra>',
            customdata=[[
                format_confidence_interval({
                    'lower': l,
                    'upper': u
                }) if l is not None and u is not None else "N/A"
                for l, u in zip(ci_lower, ci_upper)
            ]]
        ))

        # Update layout
        fig_quality.update_layout(
            title=f"Quality Scores",
            xaxis_title="Quality Score",
            yaxis_title="",  # No y-axis title to save space
            height=max(600, len(top_clusters) * 25),  # Same height as main chart
            showlegend=False,
            yaxis=dict(showticklabels=False)  # Hide y-axis labels to save space
        )
    else:
        # Create empty quality figure
        fig_quality = go.Figure()
        fig_quality.add_annotation(text="No quality score data available",
                                   xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False)

    return fig, fig_quality
|
825 |
+
|
826 |
+
|
827 |
+
def search_clusters_by_text(clustered_df: pd.DataFrame,
                            search_term: str,
                            search_in: str = 'description') -> pd.DataFrame:
    """Search clusters by text in descriptions or other fields.

    Parameters
    ----------
    clustered_df:
        Frame of clustered results to search.
    search_term:
        Case-insensitive term.  NOTE: passed to ``Series.str.contains``,
        which interprets it as a regular expression.
    search_in:
        'description', 'model', 'cluster_label', or anything else to search
        across all known text columns.

    Returns
    -------
    pd.DataFrame
        At most 100 matching rows (or the first 100 rows when the term is
        empty).
    """
    if not search_term:
        return clustered_df.head(100)  # Return first 100 if no search

    search_term = search_term.lower()

    if search_in == 'description':
        mask = clustered_df['property_description'].str.lower().str.contains(search_term, na=False)
    elif search_in == 'model':
        mask = clustered_df['model'].str.lower().str.contains(search_term, na=False)
    elif search_in == 'cluster_label':
        # Use correct column names from pipeline
        fine_label_col = 'property_description_fine_cluster_label'
        coarse_label_col = 'property_description_coarse_cluster_label'
        # Build the mask on the frame's own index: a bare
        # pd.Series([False] * len(df)) carries a RangeIndex, which fails to
        # align with non-default indices during |= and final selection.
        mask = pd.Series(False, index=clustered_df.index)

        if fine_label_col in clustered_df.columns:
            mask |= clustered_df[fine_label_col].str.lower().str.contains(search_term, na=False)
        if coarse_label_col in clustered_df.columns:
            mask |= clustered_df[coarse_label_col].str.lower().str.contains(search_term, na=False)
    else:
        # Search in all text columns using correct column names
        text_cols = ['property_description', 'model',
                     'property_description_fine_cluster_label',
                     'property_description_coarse_cluster_label']
        mask = pd.Series(False, index=clustered_df.index)
        for col in text_cols:
            if col in clustered_df.columns:
                mask |= clustered_df[col].str.lower().str.contains(search_term, na=False)

    return clustered_df[mask].head(100)
|
861 |
+
|
862 |
+
|
863 |
+
def search_clusters_only(clustered_df: pd.DataFrame,
                         search_term: str,
                         cluster_level: str = 'fine') -> pd.DataFrame:
    """Filter rows whose cluster label contains *search_term* (case-insensitive).

    Unlike :func:`search_clusters_by_text`, this matches only against cluster
    labels, never against individual property descriptions. An empty search
    term returns the input unchanged; if no label column exists for the
    requested level, an empty DataFrame is returned.
    """
    if not search_term:
        return clustered_df

    needle = search_term.lower()

    # Candidate label columns for the level, preferring the pipeline-prefixed
    # naming pattern over the bare one.
    if cluster_level == 'fine':
        candidates = ('property_description_fine_cluster_label', 'fine_cluster_label')
    else:
        candidates = ('property_description_coarse_cluster_label', 'coarse_cluster_label')

    for col in candidates:
        if col in clustered_df.columns:
            hits = clustered_df[col].str.lower().str.contains(needle, na=False)
            return clustered_df[hits]

    # Neither naming pattern is present -> nothing to search over.
    return pd.DataFrame()
|
890 |
+
|
891 |
+
|
892 |
+
def create_interactive_cluster_viewer(clustered_df: pd.DataFrame,
                                    selected_models: Optional[List[str]] = None,
                                    cluster_level: str = 'fine') -> str:
    """Create interactive cluster viewer HTML similar to Streamlit version.

    Builds one collapsible ``<details>`` card per cluster showing its size,
    frequency, per-metric quality scores/deltas (read from
    ``app_state["metrics"]["cluster_scores"]``), the models present, and every
    property description in the cluster.

    Args:
        clustered_df: Clustered pipeline results. Must contain cluster id and
            label columns in either the prefixed (``property_description_*``)
            or the bare (``fine_cluster_id`` / ``coarse_cluster_id``) naming
            pattern.
        selected_models: If given, only rows whose ``model`` is in this list
            are included.
        cluster_level: ``'fine'`` or ``'coarse'``. When coarse columns are
            missing, falls back to fine clusters and adds a note to the
            output.

    Returns:
        An HTML fragment (string). Error and empty conditions are reported as
        styled HTML rather than raised, so callers can always render the
        return value directly.
    """
    if clustered_df.empty:
        return "<p>No cluster data available</p>"

    df = clustered_df.copy()

    # Debug information (stdout only; shows up in the server log)
    print(f"DEBUG: create_interactive_cluster_viewer called")
    print(f" - Input DataFrame shape: {df.shape}")
    print(f" - Selected models: {selected_models}")
    print(f" - Available models in data: {df['model'].unique().tolist() if 'model' in df.columns else 'No model column'}")

    # Filter by models if specified
    if selected_models:
        print(f" - Filtering by {len(selected_models)} selected models")
        df = df[df['model'].isin(selected_models)]
        print(f" - After filtering shape: {df.shape}")
        print(f" - Models after filtering: {df['model'].unique().tolist()}")
    else:
        print(f" - No model filtering applied")

    if df.empty:
        return f"<p>No data found for selected models: {', '.join(selected_models or [])}</p>"

    # Get cluster scores data for quality and frequency information.
    # Import is local to avoid a circular import at module load time —
    # TODO confirm; app_state lives in the sibling state module.
    from .state import app_state
    cluster_scores = app_state.get("metrics", {}).get("cluster_scores", {})

    # Use the actual column names from the pipeline output (matching Streamlit version)
    if cluster_level == 'fine':
        id_col = 'property_description_fine_cluster_id'
        label_col = 'property_description_fine_cluster_label'
        # Also check for alternative naming without prefix
        alt_id_col = 'fine_cluster_id'
        alt_label_col = 'fine_cluster_label'
    else:
        id_col = 'property_description_coarse_cluster_id'
        label_col = 'property_description_coarse_cluster_label'
        # Also check for alternative naming without prefix
        alt_id_col = 'coarse_cluster_id'
        alt_label_col = 'coarse_cluster_label'

    # Track if we fall back from coarse to fine
    fell_back_to_fine = False

    # Check if required columns exist and provide helpful debug info.
    # Try both naming patterns; the prefixed pattern wins when both exist.
    if id_col in df.columns and label_col in df.columns:
        # Use the expected naming pattern
        pass
    elif alt_id_col in df.columns and alt_label_col in df.columns:
        # Use the alternative naming pattern
        id_col = alt_id_col
        label_col = alt_label_col
    else:
        # If coarse clusters are not available, try to fall back to fine clusters
        if cluster_level == 'coarse':
            # Check if fine clusters are available
            fine_id_col = 'property_description_fine_cluster_id'
            fine_label_col = 'property_description_fine_cluster_label'
            fine_alt_id_col = 'fine_cluster_id'
            fine_alt_label_col = 'fine_cluster_label'

            if (fine_id_col in df.columns and fine_label_col in df.columns) or (fine_alt_id_col in df.columns and fine_alt_label_col in df.columns):
                # Fall back to fine clusters
                if fine_id_col in df.columns and fine_label_col in df.columns:
                    id_col = fine_id_col
                    label_col = fine_label_col
                else:
                    id_col = fine_alt_id_col
                    label_col = fine_alt_label_col
                cluster_level = 'fine'  # Update the cluster level for display
                fell_back_to_fine = True
            else:
                # No cluster columns available at all
                available_cols = list(df.columns)
                return f"""
                <div style="padding: 20px; background: #fff3cd; border: 1px solid #ffeaa7; border-radius: 8px;">
                    <h4>β Missing cluster columns in data</h4>
                    <p><strong>Expected:</strong> {id_col}, {label_col} OR {alt_id_col}, {alt_label_col}</p>
                    <p><strong>Available columns:</strong> {', '.join(available_cols)}</p>
                    <p>Please ensure your data contains clustering results from the LMM-Vibes pipeline.</p>
                </div>
                """
        else:
            # For fine clusters, show the original error
            available_cols = list(df.columns)
            return f"""
            <div style="padding: 20px; background: #fff3cd; border: 1px solid #ffeaa7; border-radius: 8px;">
                <h4>β Missing {cluster_level} cluster columns in data</h4>
                <p><strong>Expected:</strong> {id_col}, {label_col} OR {alt_id_col}, {alt_label_col}</p>
                <p><strong>Available columns:</strong> {', '.join(available_cols)}</p>
                <p>Please ensure your data contains clustering results from the LMM-Vibes pipeline.</p>
            </div>
            """

    # Group by cluster to get cluster information
    try:
        print(f" - Grouping by cluster columns: {id_col}, {label_col}")
        # One row per (id, label): property count, unique descriptions, unique models.
        cluster_groups = df.groupby([id_col, label_col]).agg({
            'property_description': ['count', lambda x: x.unique().tolist()],
            'model': lambda x: x.unique().tolist()
        }).reset_index()

        # Flatten column names
        cluster_groups.columns = [
            id_col, label_col, 'size', 'property_descriptions', 'models'
        ]

        # Sort by size (largest first)
        cluster_groups = cluster_groups.sort_values('size', ascending=False)

        # Filter out "No properties" clusters (sentinel label, not a real cluster)
        cluster_groups = cluster_groups[cluster_groups[label_col] != "No properties"]

        print(f" - Found {len(cluster_groups)} clusters")
        print(f" - Cluster sizes: {cluster_groups['size'].tolist()}")
        print(f" - Models per cluster: {[len(models) for models in cluster_groups['models']]}")

    except Exception as e:
        return f"""
        <div style="padding: 20px; background: #f8d7da; border: 1px solid #f5c6cb; border-radius: 8px;">
            <h4>β Error processing cluster data</h4>
            <p><strong>Error:</strong> {str(e)}</p>
            <p>Please check your data format and try again.</p>
        </div>
        """

    if len(cluster_groups) == 0:
        return """
        <div style="padding: 20px; background: #d1ecf1; border: 1px solid #bee5eb; border-radius: 8px;">
            <h4>βΉοΈ No clusters found</h4>
            <p>No clusters match your current filters. Try selecting different models or adjusting your search.</p>
        </div>
        """

    # Create HTML
    html = f"""
    <div style="max-width: 1600px; margin: 0 auto;">
        <h3>π Interactive Cluster Viewer ({cluster_level.title()} Level)</h3>
        <p style="color: #666; margin-bottom: 20px;">
            Click on clusters below to explore their property descriptions.
            Showing {len(cluster_groups)} clusters sorted by size.
        </p>
    """

    # Add a note if we fell back from coarse to fine clusters
    if cluster_level == 'fine' and fell_back_to_fine:
        html += """
        <div style="padding: 15px; background: #fff3cd; border: 1px solid #ffeaa7; border-radius: 8px; margin-bottom: 20px;">
            <strong>Note:</strong> Coarse clusters not available in this dataset. Showing fine clusters instead.
        </div>
        """

    for i, row in cluster_groups.iterrows():
        cluster_id = row[id_col]
        cluster_label = row[label_col]
        cluster_size = row['size']
        property_descriptions = row['property_descriptions']
        models_in_cluster = row['models']

        # Get quality and frequency information from cluster_scores
        # (keyed by cluster label; empty dict when the label has no metrics).
        cluster_metrics = cluster_scores.get(cluster_label, {})
        frequency_pct = cluster_metrics.get("proportion", 0) * 100 if cluster_metrics else 0
        quality_scores = cluster_metrics.get("quality", {})
        quality_delta = cluster_metrics.get("quality_delta", {})

        # Build per-metric header display: "metric: score (delta)"
        header_quality_display = "N/A"
        if quality_scores or quality_delta:
            metric_names = sorted(set(quality_scores.keys()) | set(quality_delta.keys()))
            parts: list[str] = []
            for metric_name in metric_names:
                score_val = quality_scores.get(metric_name)
                delta_val = quality_delta.get(metric_name)
                score_str = f"{score_val:.3f}" if isinstance(score_val, (int, float)) else "N/A"
                if isinstance(delta_val, (int, float)):
                    # Green for non-negative delta, red for negative.
                    color = "#28a745" if delta_val >= 0 else "#dc3545"
                    parts.append(f"{metric_name}: {score_str} <span style=\"color: {color};\">({delta_val:+.3f})</span>")
                else:
                    parts.append(f"{metric_name}: {score_str}")
            header_quality_display = "\n".join(parts)

        # Format quality scores for detailed view
        quality_html = ""
        if quality_scores:
            quality_parts = []
            for metric_name, score in quality_scores.items():
                color = "#28a745" if score >= 0 else "#dc3545"
                quality_parts.append(f'<span style="color:{color}; font-weight:500;">{metric_name}: {score:.3f}</span>')
            quality_html = " | ".join(quality_parts)
        else:
            quality_html = '<span style="color:#666;">No quality data</span>'

        # Format quality delta (relative to average)
        quality_delta_html = ""
        if quality_delta:
            delta_parts = []
            for metric_name, delta in quality_delta.items():
                color = "#28a745" if delta >= 0 else "#dc3545"
                sign = "+" if delta >= 0 else ""
                delta_parts.append(f'<span style="color:{color}; font-weight:500;">{metric_name}: {sign}{delta:.3f}</span>')
            quality_delta_html = " | ".join(delta_parts)
        else:
            quality_delta_html = '<span style="color:#666;">No delta data</span>'

        # Format header quality score with visual indicators
        header_quality_text = header_quality_display

        # Get light color for this cluster (matching overview style).
        # NOTE(review): `i` here is the iterrows() index label, which is not
        # sequential after sort_values, and it is shadowed by the inner
        # enumerate loop below — confirm the color helper tolerates arbitrary
        # ints before relying on stable colors.
        cluster_color = get_light_color_for_cluster(cluster_label, i)

        # Create expandable cluster card with overview-style design
        html += f"""
        <details style="margin: 15px 0; border: 1px solid #e0e0e0; border-radius: 8px; overflow: hidden; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
            <summary style="
                padding: 15px;
                background: {cluster_color};
                color: #333;
                cursor: pointer;
                font-weight: 600;
                font-size: 16px;
                user-select: none;
                list-style: none;
                display: flex;
                justify-content: space-between;
                align-items: center;
                border-bottom: 1px solid #dee2e6;
            ">
                <div style="max-width: 80%;">
                    <div style="margin-bottom: 4px;">
                        <strong style="font-size: 14px;">{cluster_label}</strong>
                    </div>
                    <span style="font-size: 12px; color: #555;">
                        {frequency_pct:.1f}% frequency ({cluster_size} properties) Β· {len(models_in_cluster)} models
                    </span>
                </div>
                <div style="font-size: 12px; font-weight: normal; white-space: nowrap; text-align: right;">
                    <div style="margin-bottom: 4px;">
                        <span style="font-weight: 500;">{header_quality_text}</span>
                    </div>
                    <div style="color: #6c757d;">
                        {frequency_pct:.1f}% frequency
                    </div>
                </div>
            </summary>

            <div style="padding: 20px; background: #f8f9fa;">
                <div style="margin-bottom: 15px;">
                    <strong>Cluster ID:</strong> {cluster_id}<br>
                    <strong>Size:</strong> {cluster_size} properties<br>
                    <strong>Models:</strong> {', '.join(models_in_cluster)}<br>
                    <strong>Frequency:</strong> {frequency_pct:.1f}% of all conversations<br>
                    <strong>Quality Scores:</strong> {quality_html}<br>
                    <strong>Quality vs Average:</strong> {quality_delta_html}
                </div>

                <h4 style="color: #333; margin: 15px 0 10px 0;">
                    Property Descriptions ({len(property_descriptions)})
                </h4>

                <div style="max-height: 300px; overflow-y: auto; background: white; border: 1px solid #ddd; border-radius: 4px; padding: 10px;">
        """

        # Display property descriptions
        for i, desc in enumerate(property_descriptions, 1):
            html += f"""
            <div style="
                padding: 8px;
                margin: 4px 0;
                background: #f8f9fa;
                border-left: 3px solid #667eea;
                border-radius: 2px;
            ">
                <strong>{i}.</strong> {desc}
            </div>
            """

        html += """
                </div>
            </div>
        </details>
        """

    html += "</div>"
    return html
|
1181 |
+
|
1182 |
+
|
1183 |
+
def get_cluster_statistics(clustered_df: pd.DataFrame,
                           selected_models: Optional[List[str]] = None) -> Dict[str, Any]:
    """Compute summary statistics about the clustering for display.

    Returns a dict with total property/model counts plus, for each cluster
    level whose id column is present (either naming pattern), the number of
    clusters and min/max/avg properties per cluster. Empty input yields ``{}``.
    """
    if clustered_df.empty:
        return {}

    df = clustered_df.copy()

    # Restrict to the requested models, if any.
    if selected_models:
        df = df[df['model'].isin(selected_models)]

    stats: Dict[str, Any] = {
        'total_properties': len(df),
        'total_models': df['model'].nunique() if 'model' in df.columns else 0,
    }

    # The pipeline has emitted two naming patterns over time; prefer the
    # prefixed one and fall back to the bare one for each level.
    for level in ('fine', 'coarse'):
        for id_candidate in (f'property_description_{level}_cluster_id', f'{level}_cluster_id'):
            if id_candidate in df.columns:
                stats[f'{level}_clusters'] = df[id_candidate].nunique()
                sizes = df.groupby(id_candidate).size()
                stats[f'min_properties_per_{level}_cluster'] = sizes.min() if not sizes.empty else 0
                stats[f'max_properties_per_{level}_cluster'] = sizes.max() if not sizes.empty else 0
                stats[f'avg_properties_per_{level}_cluster'] = sizes.mean() if not sizes.empty else 0
                break

    return stats
|
1235 |
+
|
1236 |
+
|
1237 |
+
def get_unique_values_for_dropdowns(clustered_df: pd.DataFrame) -> Dict[str, List[str]]:
    """Get unique values for dropdown menus.

    Returns:
        A dict with keys ``'prompts'``, ``'models'`` and ``'properties'``;
        each value is a sorted list of unique strings, truncated to 100
        characters (with an ellipsis) for display. Missing columns yield
        empty lists.
    """
    if clustered_df.empty:
        return {'prompts': [], 'models': [], 'properties': []}

    def _display_values(values: List[str]) -> List[str]:
        # Sort, then truncate long entries for dropdown display.
        return [v[:100] + "..." if len(v) > 100 else v for v in sorted(values)]

    # Prompts: the first matching column wins (datasets use different names).
    prompts: List[str] = []
    for col in ('prompt', 'question', 'input', 'user_prompt'):
        if col in clustered_df.columns:
            prompts = _display_values(clustered_df[col].dropna().unique().tolist())
            break

    # Models: handle both single-model and side-by-side datasets.
    models: List[str] = []
    if 'model' in clustered_df.columns:
        # Single model datasets
        models = sorted(clustered_df['model'].dropna().unique().tolist())
    elif 'model_a' in clustered_df.columns and 'model_b' in clustered_df.columns:
        # Side-by-side datasets - combine models from both columns
        models_a = clustered_df['model_a'].dropna().unique().tolist()
        models_b = clustered_df['model_b'].dropna().unique().tolist()
        models = sorted(set(models_a + models_b))

    # Properties: prefer fine cluster labels (either naming pattern), then
    # fall back to raw property descriptions.
    properties: List[str] = []
    for col in ('property_description_fine_cluster_label',
                'fine_cluster_label',
                'property_description'):
        if col in clustered_df.columns:
            values = clustered_df[col].dropna().unique().tolist()
            # "No properties" is a sentinel cluster label, not a real option.
            values = [v for v in values if v != "No properties"]
            properties = _display_values(values)
            break

    return {
        'prompts': prompts,
        'models': models,
        'properties': properties
    }
|
1296 |
+
|
1297 |
+
# ---------------------------------------------------------------------------
|
1298 |
+
# Example data extraction (restored)
|
1299 |
+
# ---------------------------------------------------------------------------
|
1300 |
+
|
1301 |
+
def get_example_data(
|
1302 |
+
clustered_df: pd.DataFrame,
|
1303 |
+
selected_prompt: str | None = None,
|
1304 |
+
selected_model: str | None = None,
|
1305 |
+
selected_property: str | None = None,
|
1306 |
+
max_examples: int = 5,
|
1307 |
+
show_unexpected_behavior: bool = False,
|
1308 |
+
randomize: bool = False,
|
1309 |
+
) -> List[Dict[str, Any]]:
|
1310 |
+
"""Return a list of example rows filtered by prompt / model / property.
|
1311 |
+
|
1312 |
+
This function was accidentally removed during a refactor; it is required by
|
1313 |
+
*examples_tab.py* and other parts of the UI.
|
1314 |
+
|
1315 |
+
Args:
|
1316 |
+
clustered_df: DataFrame containing the clustered results data
|
1317 |
+
selected_prompt: Prompt to filter by (None for all)
|
1318 |
+
selected_model: Model to filter by (None for all)
|
1319 |
+
selected_property: Property description to filter by (None for all)
|
1320 |
+
max_examples: Maximum number of examples to return
|
1321 |
+
show_unexpected_behavior: If True, filter to only show unexpected behavior
|
1322 |
+
randomize: If True, sample randomly from the filtered set instead of taking the first rows
|
1323 |
+
|
1324 |
+
Returns:
|
1325 |
+
List of example dictionaries with extracted data
|
1326 |
+
"""
|
1327 |
+
|
1328 |
+
if clustered_df.empty:
|
1329 |
+
return []
|
1330 |
+
|
1331 |
+
df = clustered_df.copy()
|
1332 |
+
|
1333 |
+
# Filter by unexpected behavior if requested
|
1334 |
+
if show_unexpected_behavior:
|
1335 |
+
if "unexpected_behavior" in df.columns:
|
1336 |
+
# Assuming True/1 means unexpected behavior
|
1337 |
+
df = df[df["unexpected_behavior"].isin([True, 1, "True", "true"])]
|
1338 |
+
else:
|
1339 |
+
# If no unexpected_behavior column, return empty (or could return all)
|
1340 |
+
return []
|
1341 |
+
|
1342 |
+
# Filter by prompt
|
1343 |
+
if selected_prompt:
|
1344 |
+
prompt_cols = ["prompt", "question", "input", "user_prompt"]
|
1345 |
+
for col in prompt_cols:
|
1346 |
+
if col in df.columns:
|
1347 |
+
df = df[df[col].str.contains(selected_prompt, case=False, na=False)]
|
1348 |
+
break
|
1349 |
+
|
1350 |
+
# Filter by model - handle both single model and side-by-side datasets
|
1351 |
+
if selected_model:
|
1352 |
+
if "model" in df.columns:
|
1353 |
+
# Single model datasets
|
1354 |
+
df = df[df["model"] == selected_model]
|
1355 |
+
elif "model_a" in df.columns and "model_b" in df.columns:
|
1356 |
+
# Side-by-side datasets - filter where either model_a or model_b matches
|
1357 |
+
df = df[(df["model_a"] == selected_model) | (df["model_b"] == selected_model)]
|
1358 |
+
|
1359 |
+
# Filter by property
|
1360 |
+
if selected_property:
|
1361 |
+
property_cols = ["property_description", "cluster", "fine_cluster_label", "property_description_fine_cluster_label"]
|
1362 |
+
for col in property_cols:
|
1363 |
+
if col in df.columns:
|
1364 |
+
df = df[df[col].str.contains(selected_property, case=False, na=False)]
|
1365 |
+
break
|
1366 |
+
|
1367 |
+
# Limit to max_examples (randomized if requested)
|
1368 |
+
if randomize:
|
1369 |
+
if len(df) > max_examples:
|
1370 |
+
df = df.sample(n=max_examples)
|
1371 |
+
else:
|
1372 |
+
df = df.sample(frac=1)
|
1373 |
+
else:
|
1374 |
+
df = df.head(max_examples)
|
1375 |
+
|
1376 |
+
examples: List[Dict[str, Any]] = []
|
1377 |
+
for _, row in df.iterrows():
|
1378 |
+
prompt_val = next(
|
1379 |
+
(row.get(col) for col in ["prompt", "question", "input", "user_prompt"] if row.get(col) is not None),
|
1380 |
+
"N/A",
|
1381 |
+
)
|
1382 |
+
|
1383 |
+
# Check if this is a side-by-side dataset
|
1384 |
+
is_side_by_side = ('model_a_response' in row and 'model_b_response' in row and
|
1385 |
+
row.get('model_a_response') is not None and row.get('model_b_response') is not None)
|
1386 |
+
|
1387 |
+
if is_side_by_side:
|
1388 |
+
# For side-by-side datasets, store both responses separately
|
1389 |
+
response_val = "SIDE_BY_SIDE" # Special marker
|
1390 |
+
model_val = f"{row.get('model_a', 'Model A')} vs {row.get('model_b', 'Model B')}"
|
1391 |
+
else:
|
1392 |
+
# For single response datasets, use the existing logic
|
1393 |
+
response_val = next(
|
1394 |
+
(
|
1395 |
+
row.get(col)
|
1396 |
+
for col in [
|
1397 |
+
"model_response",
|
1398 |
+
"model_a_response",
|
1399 |
+
"model_b_response",
|
1400 |
+
"responses",
|
1401 |
+
"response",
|
1402 |
+
"output",
|
1403 |
+
]
|
1404 |
+
if row.get(col) is not None
|
1405 |
+
),
|
1406 |
+
"N/A",
|
1407 |
+
)
|
1408 |
+
model_val = row.get("model", "N/A")
|
1409 |
+
|
1410 |
+
# Try both naming patterns for cluster data
|
1411 |
+
fine_cluster_id = row.get("property_description_fine_cluster_id", row.get("fine_cluster_id", "N/A"))
|
1412 |
+
fine_cluster_label = row.get("property_description_fine_cluster_label", row.get("fine_cluster_label", "N/A"))
|
1413 |
+
coarse_cluster_id = row.get("property_description_coarse_cluster_id", row.get("coarse_cluster_id", "N/A"))
|
1414 |
+
coarse_cluster_label = row.get("property_description_coarse_cluster_label", row.get("coarse_cluster_label", "N/A"))
|
1415 |
+
|
1416 |
+
example_dict = {
|
1417 |
+
"id": row.get("id", "N/A"),
|
1418 |
+
"model": model_val,
|
1419 |
+
"prompt": prompt_val,
|
1420 |
+
"response": response_val,
|
1421 |
+
"property_description": row.get("property_description", "N/A"),
|
1422 |
+
"score": row.get("score", "N/A"),
|
1423 |
+
"fine_cluster_id": fine_cluster_id,
|
1424 |
+
"fine_cluster_label": fine_cluster_label,
|
1425 |
+
"coarse_cluster_id": coarse_cluster_id,
|
1426 |
+
"coarse_cluster_label": coarse_cluster_label,
|
1427 |
+
"category": row.get("category", "N/A"),
|
1428 |
+
"type": row.get("type", "N/A"),
|
1429 |
+
"impact": row.get("impact", "N/A"),
|
1430 |
+
"reason": row.get("reason", "N/A"),
|
1431 |
+
"evidence": row.get("evidence", "N/A"),
|
1432 |
+
"user_preference_direction": row.get("user_preference_direction", "N/A"),
|
1433 |
+
"raw_response": row.get("raw_response", "N/A"),
|
1434 |
+
"contains_errors": row.get("contains_errors", "N/A"),
|
1435 |
+
"unexpected_behavior": row.get("unexpected_behavior", "N/A"),
|
1436 |
+
}
|
1437 |
+
|
1438 |
+
# Add side-by-side specific fields if applicable
|
1439 |
+
if is_side_by_side:
|
1440 |
+
example_dict.update({
|
1441 |
+
"is_side_by_side": True,
|
1442 |
+
"model_a": row.get("model_a", "Model A"),
|
1443 |
+
"model_b": row.get("model_b", "Model B"),
|
1444 |
+
"model_a_response": row.get("model_a_response", "N/A"),
|
1445 |
+
"model_b_response": row.get("model_b_response", "N/A"),
|
1446 |
+
"winner": row.get("winner", None),
|
1447 |
+
})
|
1448 |
+
else:
|
1449 |
+
example_dict["is_side_by_side"] = False
|
1450 |
+
|
1451 |
+
examples.append(example_dict)
|
1452 |
+
|
1453 |
+
return examples
|
1454 |
+
|
1455 |
+
|
1456 |
+
def format_examples_display(examples: List[Dict[str, Any]],
                           selected_prompt: str = None,
                           selected_model: str = None,
                           selected_property: str = None,
                           use_accordion: bool = True,
                           pretty_print_dicts: bool = True) -> str:
    """Format examples for HTML display with proper conversation rendering.

    Args:
        examples: List of example dictionaries
        selected_prompt: Currently selected prompt filter ("All Prompts" means no filter)
        selected_model: Currently selected model filter ("All Models" means no filter)
        selected_property: Currently selected property filter ("All Clusters" means no filter)
        use_accordion: If True, group system and info messages in collapsible accordions
        pretty_print_dicts: If True, pretty-print embedded dictionaries

    Returns:
        HTML string for display
    """
    # Imported lazily (function scope) to avoid circular imports at module load.
    from .conversation_display import convert_to_openai_format, display_openai_conversation_html
    from .side_by_side_display import display_side_by_side_responses

    if not examples:
        return "<p style='color: #e74c3c; padding: 20px;'>No examples found matching the current filters.</p>"

    # Create filter summary banner listing any active (non-"All ...") filters.
    filter_parts = []
    if selected_prompt and selected_prompt != "All Prompts":
        filter_parts.append(f"Prompt: {selected_prompt}")
    if selected_model and selected_model != "All Models":
        filter_parts.append(f"Model: {selected_model}")
    if selected_property and selected_property != "All Clusters":
        filter_parts.append(f"Cluster: {selected_property}")

    filter_summary = ""
    if filter_parts:
        filter_summary = f"""
        <div style="background: #e3f2fd; padding: 15px; border-radius: 8px; margin-bottom: 20px; border-left: 4px solid #2196f3;">
            <strong>π Active Filters:</strong> {" β’ ".join(filter_parts)}
        </div>
        """

    html = f"""
    <div style="font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;">
        <h3 style="color: #333; margin-bottom: 15px;">π Examples ({len(examples)} found)</h3>
        {filter_summary}
    """

    for i, example in enumerate(examples, 1):
        # Check if this is a side-by-side example
        if example.get('is_side_by_side', False):
            # Use side-by-side display for comparison datasets
            conversation_html = display_side_by_side_responses(
                model_a=example['model_a'],
                model_b=example['model_b'],
                model_a_response=example['model_a_response'],
                model_b_response=example['model_b_response'],
                use_accordion=use_accordion,
                pretty_print_dicts=pretty_print_dicts,
                score=example['score'],
                winner=example.get('winner')
            )
        else:
            # Convert response to OpenAI format for proper display (single model)
            response_data = example['response']
            if response_data != 'N/A':
                openai_conversation = convert_to_openai_format(response_data)
                conversation_html = display_openai_conversation_html(openai_conversation, use_accordion=use_accordion, pretty_print_dicts=pretty_print_dicts)
            else:
                conversation_html = "<p style='color: #dc3545; font-style: italic;'>No response data available</p>"

        # Determine cluster info
        # NOTE(review): cluster_info is built but never interpolated into the
        # card HTML below — looks like a leftover; confirm whether it should
        # appear in the details section before removing it.
        cluster_info = ""
        if example['fine_cluster_label'] != 'N/A':
            cluster_info = f"""
            <div style="margin-top: 10px; font-size: 13px; color: #666;">
                <strong>π·οΈ Cluster:</strong> {example['fine_cluster_label']} (ID: {example['fine_cluster_id']})
            </div>
            """

        # Score display for summary (only for non-side-by-side or when not shown in side-by-side)
        score_badge = ""
        if not example.get('is_side_by_side', False) and example['score'] != 'N/A':
            try:
                score_val = float(example['score'])
                score_color = '#28a745' if score_val >= 0 else '#dc3545'
                score_badge = f"""
                <span style="
                    background: {score_color};
                    color: white;
                    padding: 4px 8px;
                    border-radius: 12px;
                    font-size: 12px;
                    font-weight: bold;
                    margin-left: 10px;
                ">
                    Score: {score_val:.3f}
                </span>
                """
            except (TypeError, ValueError):
                # Score is not a number (e.g. None or free text) — no badge.
                # Was a bare ``except:`` which also swallowed SystemExit etc.
                pass

        # Create short preview of prompt for summary
        prompt_preview = example['prompt'][:80] + "..." if len(example['prompt']) > 80 else example['prompt']

        # Create expandable example card
        # First example is expanded by default
        open_attr = "open" if i == 1 else ""

        html += f"""
        <details {open_attr} style="border: 1px solid #dee2e6; border-radius: 8px; margin-bottom: 15px; background: white; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
            <summary style="
                padding: 15px;
                cursor: pointer;
                font-weight: 600;
                color: #495057;
                background: linear-gradient(90deg, #f8f9fa 0%, #e9ecef 100%);
                border-radius: 8px 8px 0 0;
                border-bottom: 1px solid #dee2e6;
                display: flex;
                align-items: center;
                justify-content: space-between;
            ">
                <span>
                    <span style="background: #6c757d; color: white; padding: 4px 8px; border-radius: 4px; font-size: 12px; margin-right: 10px;">#{i}</span>
                    {prompt_preview}
                </span>
                <span style="font-size: 12px; color: #6c757d;">
                    {example['model']}{score_badge}
                </span>
            </summary>

            <div style="padding: 20px;">
                <div style="margin-bottom: 15px; padding: 15px; background: #f8f9fa; border-radius: 6px; border-left: 4px solid #17a2b8;">

                    <div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px; font-size: 13px; color: #666;">
                        <div><strong>Model:</strong> {example['model']}</div>
                        <div><strong>ID:</strong> {example['id']}</div>
                        {f'<div><strong>Category:</strong> {example["category"]}</div>' if example["category"] not in ["N/A", "None"] else ""}
                        {f'<div><strong>Type:</strong> {example["type"]}</div>' if example["type"] not in ["N/A", "None"] else ""}
                        {f'<div><strong>Impact:</strong> {example["impact"]}</div>' if example["impact"] not in ["N/A", "None"] else ""}
                    </div>

                    <div style="margin-top: 10px;">
                        {f'<div style="margin-top: 10px;"><strong>Property:</strong> {example["property_description"]}</div>' if example["property_description"] not in ["N/A", "None"] else ""}
                        {f'<div style="margin-top: 10px;"><strong>Reason:</strong> {example["reason"]}</div>' if example["reason"] not in ["N/A", "None"] else ""}
                        {f'<div style="margin-top: 10px;"><strong>Evidence:</strong> {example["evidence"]}</div>' if example["evidence"] not in ["N/A", "None"] else ""}
                    </div>
                </div>

                <div style="margin-bottom: 15px;">
                    <h5 style="margin: 0 0 8px 0; color: #333; font-size: 14px;">π¬ {"Response Comparison" if example.get('is_side_by_side', False) else "Conversation"}</h5>
                    <div style="border-radius: 6px; font-size: 13px; line-height: 1.5;">
                        {conversation_html}
                    </div>
                </div>
            </div>
        </details>
        """

    html += "</div>"
    return html
|
1618 |
+
|
1619 |
+
# ---------------------------------------------------------------------------
|
1620 |
+
# Legacy function aliases (backward compatibility)
|
1621 |
+
# ---------------------------------------------------------------------------
|
1622 |
+
|
1623 |
+
def compute_model_rankings(*args, **kwargs):
    """Deprecated name kept for backward compatibility.

    All positional and keyword arguments are passed straight through to
    :func:`compute_model_rankings_new`, whose return value is returned
    unchanged.
    """
    return compute_model_rankings_new(*args, **kwargs)
|
1626 |
+
|
1627 |
+
|
1628 |
+
def create_model_summary_card(*args, **kwargs):
    """Deprecated name kept for backward compatibility.

    All positional and keyword arguments are passed straight through to
    :func:`create_model_summary_card_new`, whose return value is returned
    unchanged.
    """
    return create_model_summary_card_new(*args, **kwargs)
|
1631 |
+
|
1632 |
+
|
1633 |
+
def get_total_clusters_count(metrics: Dict[str, Any]) -> int:
    """Return how many clusters appear in the metrics data.

    Counts the keys of ``metrics["cluster_scores"]``, skipping the
    placeholder "No properties" entry. Returns 0 when the key is absent.
    """
    scores = metrics.get("cluster_scores", {})
    return sum(1 for name in scores if name != "No properties")
|
1639 |
+
|
1640 |
+
|
1641 |
+
def get_light_color_for_cluster(cluster_name: str, index: int) -> str:
    """Return the background colour for a cluster box.

    Every cluster currently shares the same very light dusty-blue shade;
    the ``cluster_name`` and ``index`` arguments are accepted only so the
    call sites keep a uniform interface.
    """
    return "#f0f4f8"  # very light dusty blue, identical for all clusters
|
1647 |
+
|
1648 |
+
# Public API of this module; governs what ``from ... import *`` exposes.
__all__ = [
    "get_model_clusters",
    "get_all_models",
    "get_all_clusters",
    "format_confidence_interval",
    "get_confidence_interval_width",
    "has_confidence_intervals",
    "extract_quality_score",
    "get_top_clusters_for_model",
    "compute_model_rankings_new",
    "create_model_summary_card_new",
    "format_cluster_dataframe",
    "truncate_cluster_name",
    "create_frequency_comparison_table",
    "create_frequency_comparison_plots",
    "search_clusters_by_text",
    "search_clusters_only",
    "create_interactive_cluster_viewer",
    "get_cluster_statistics",
    "get_unique_values_for_dropdowns",
    "get_example_data",
    "format_examples_display",
    # Legacy aliases kept for backward compatibility.
    "compute_model_rankings",
    "create_model_summary_card",
    "get_total_clusters_count",
]
|