Lisa Dunlap commited on
Commit
4862c84
Β·
0 Parent(s):
Files changed (41) hide show
  1. .gitattributes +39 -0
  2. .gitignore +7 -0
  3. data/aci_bench/cluster_scores.json +3 -0
  4. data/aci_bench/cluster_scores_df.jsonl +3 -0
  5. data/aci_bench/clustered_results_lightweight.jsonl +3 -0
  6. data/aci_bench/clusters.json +3 -0
  7. data/aci_bench/model_cluster_scores.json +3 -0
  8. data/aci_bench/model_cluster_scores_df.jsonl +3 -0
  9. data/aci_bench/model_scores.json +3 -0
  10. data/aci_bench/model_scores_df.jsonl +3 -0
  11. data/aci_bench/model_stats.json +3 -0
  12. data/aci_bench/parsed_properties.jsonl +3 -0
  13. data/aci_bench/parsing_error_summary.json +3 -0
  14. data/aci_bench/parsing_failures.jsonl +3 -0
  15. data/aci_bench/parsing_stats.json +3 -0
  16. data/aci_bench/summary.txt +33 -0
  17. data/aci_bench/summary_table.json +3 -0
  18. data/aci_bench/summary_table.jsonl +3 -0
  19. data/aci_bench/validated_properties.jsonl +3 -0
  20. data/aci_bench/validation_stats.json +3 -0
  21. lmmvibes/__init__.py +0 -0
  22. lmmvibes/metrics/plotting.py +616 -0
  23. lmmvibes/utils/__init__.py +1 -0
  24. lmmvibes/utils/persistent_storage.py +80 -0
  25. lmmvibes/vis_gradio/__init__.py +13 -0
  26. lmmvibes/vis_gradio/app.py +697 -0
  27. lmmvibes/vis_gradio/clusters_tab.py +199 -0
  28. lmmvibes/vis_gradio/conversation_display.py +507 -0
  29. lmmvibes/vis_gradio/data_loader.py +189 -0
  30. lmmvibes/vis_gradio/debug_tab.py +83 -0
  31. lmmvibes/vis_gradio/demo.py +73 -0
  32. lmmvibes/vis_gradio/examples_tab.py +129 -0
  33. lmmvibes/vis_gradio/frequency_tab.py +307 -0
  34. lmmvibes/vis_gradio/launcher.py +122 -0
  35. lmmvibes/vis_gradio/load_data_tab.py +147 -0
  36. lmmvibes/vis_gradio/metrics_adapter.py +46 -0
  37. lmmvibes/vis_gradio/overview_tab.py +82 -0
  38. lmmvibes/vis_gradio/plots_tab.py +284 -0
  39. lmmvibes/vis_gradio/side_by_side_display.py +202 -0
  40. lmmvibes/vis_gradio/state.py +27 -0
  41. lmmvibes/vis_gradio/utils.py +1673 -0
.gitattributes ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.jsonl filter=lfs diff=lfs merge=lfs -text
37
+ *.json filter=lfs diff=lfs merge=lfs -text
38
+ *.png filter=lfs diff=lfs merge=lfs -text
39
+ results/**/plots/*.png -filter -merge -diff -text
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+
3
+ # Ignore generated plot images
4
+ results/**/plots/*.png
5
+
6
+ # Ignore large results directories (data now tracked with LFS)
7
+ results/**
data/aci_bench/cluster_scores.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9800cfb95cb3992d39649d61d01d326f7cd57fdc1e6253cd7a21b83be007762
3
+ size 35290231
data/aci_bench/cluster_scores_df.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da1c901319ffa8aa23f4e53cfd7bf8f81bf1013c30369e589adb3383136a88cb
3
+ size 33773423
data/aci_bench/clustered_results_lightweight.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:194429736798d0857962dd4b719c23608ae29606137ecd5d0fd979cacb1deb4a
3
+ size 92743484
data/aci_bench/clusters.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a00c7a0b16723d80fd3490ef658c913b1384f8eb68c8a549e8b50251c4bdf60
3
+ size 447437
data/aci_bench/model_cluster_scores.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4af5a4765a37b003e115b808a09ec4e95ebce3e302854957893f9b563b3cdc1e
3
+ size 35639398
data/aci_bench/model_cluster_scores_df.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cabadb1369aae14d6dbe08dbc4dee6d701891fe9426fbe52588bbc477a1b5995
3
+ size 33839755
data/aci_bench/model_scores.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93968fdf5604d473d031a4731127603eb3a6f27eba041e7564e52df85dc987f5
3
+ size 35279538
data/aci_bench/model_scores_df.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d2c024085528e9afeda447a975da35099b9f323a57db7e6695e444f6021dd13
3
+ size 33766092
data/aci_bench/model_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e7b3e1831735691cb43135355719f8d822deda3b64af9baeb02eb403cfb1546
3
+ size 127543
data/aci_bench/parsed_properties.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db2a42c37fd60ecd830569cb776973e16da4acbd4ff9581d8a064239e702e66d
3
+ size 2441177
data/aci_bench/parsing_error_summary.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2915c2fa4df41abe202b65cb7f84c1824fd64bad5a993d88c9349e25352b47ff
3
+ size 27
data/aci_bench/parsing_failures.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b10b336d6d58227d03a5f83fa8e0cbbefaadeb73a497363b67e68e3a01cf742
3
+ size 3665
data/aci_bench/parsing_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00197f1b62199cf7d8265acb34f073f0938c694b7230827a67086cd901c3f32e
3
+ size 219
data/aci_bench/summary.txt ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LMM-Vibes Results Summary
2
+ ==================================================
3
+
4
+ Total conversations: 720
5
+ Total properties: 4146
6
+ Models analyzed: 1
7
+
8
+ Output files:
9
+ - raw_properties.jsonl: Raw LLM responses
10
+ - extraction_stats.json: Extraction statistics
11
+ - extraction_samples.jsonl: Sample inputs/outputs
12
+ - parsed_properties.jsonl: Parsed property objects
13
+ - parsing_stats.json: Parsing statistics
14
+ - parsing_failures.jsonl: Failed parsing attempts
15
+ - validated_properties.jsonl: Validated properties
16
+ - validation_stats.json: Validation statistics
17
+ - clustered_results.jsonl: Complete clustered data
18
+ - embeddings.parquet: Embeddings data
19
+ - clustered_results_lightweight.jsonl: Data without embeddings
20
+ - summary_table.jsonl: Clustering summary
21
+ - model_cluster_scores.json: Per model-cluster combination metrics
22
+ - cluster_scores.json: Per cluster metrics (aggregated across models)
23
+ - model_scores.json: Per model metrics (aggregated across clusters)
24
+ - full_dataset.json: Complete PropertyDataset (JSON format)
25
+ - full_dataset.parquet: Complete PropertyDataset (parquet format, or .jsonl if mixed data types)
26
+
27
+ Model Rankings (by average quality score):
28
+ 1. openai/gpt-4o: 0.833
29
+ 2. google/gemini-1.5-pro-001: 0.828
30
+ 3. openai/gpt-4o-mini: 0.828
31
+ 4. meta/llama-3.3-70b-instruct: 0.827
32
+ 5. qwen/qwen2.5-7b-instruct: 0.818
33
+ 6. microsoft/phi-3.5-mini-instruct: 0.806
data/aci_bench/summary_table.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dca91c6976f8751e65d262c12e42451e9880386ae51fe93a62e53e355ac9ba9f
3
+ size 58069
data/aci_bench/summary_table.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:098126fa13c7dd247263c87866cbacbcd583229470a34411022d5af130967d52
3
+ size 56818
data/aci_bench/validated_properties.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db2a42c37fd60ecd830569cb776973e16da4acbd4ff9581d8a064239e702e66d
3
+ size 2441177
data/aci_bench/validation_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ba5d5c25ab20c2a8bfa51202ebc7a4c59af49af68fbe385ac0aca9c2960c4ce
3
+ size 137
lmmvibes/__init__.py ADDED
File without changes
lmmvibes/metrics/plotting.py ADDED
@@ -0,0 +1,616 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Plotting functionality for functional metrics.
3
+
4
+ This module provides comprehensive visualization of metrics from functional_metrics.py,
5
+ """
6
+
7
+ import json
8
+ import pandas as pd
9
+ import numpy as np
10
+ from pathlib import Path
11
+ from typing import Dict, Any, List, Optional
12
+ import warnings
13
+
14
+ import plotly.graph_objects as go
15
+ import plotly.express as px
16
+ from plotly.subplots import make_subplots
17
+ import plotly.io as pio
18
+
19
+ # Set plotly template
20
+ pio.templates.default = "plotly_white"
21
+ warnings.filterwarnings('ignore')
22
+
23
+
24
def create_model_cluster_dataframe(model_cluster_scores: Dict[str, Any]) -> pd.DataFrame:
    """Convert model-cluster scores to a tidy dataframe.

    Flattens the nested ``{model: {cluster: metrics}}`` mapping into one row
    per (model, cluster) pair, expanding confidence intervals and per-metric
    quality scores into flat columns. The synthetic "No properties" cluster
    is excluded.
    """

    def _flatten_ci(prefix: str, ci: Dict[str, Any]) -> Dict[str, Any]:
        # Expand a CI dict into flat lower/upper/mean columns.
        return {
            f'{prefix}_ci_lower': ci.get('lower', 0),
            f'{prefix}_ci_upper': ci.get('upper', 0),
            f'{prefix}_ci_mean': ci.get('mean', 0),
        }

    records = []
    for model_name, cluster_map in model_cluster_scores.items():
        for cluster_name, stats in cluster_map.items():
            # Skip the bucket for conversations without extracted properties.
            if cluster_name == "No properties":
                continue

            record = {
                'model': model_name,
                'cluster': cluster_name,
                'size': stats.get('size', 0),
                'proportion': stats.get('proportion', 0),
                'proportion_delta': stats.get('proportion_delta', 0),
            }

            # Optional confidence intervals on proportion metrics.
            if 'proportion_ci' in stats:
                record.update(_flatten_ci('proportion', stats['proportion_ci']))
            if 'proportion_delta_ci' in stats:
                record.update(_flatten_ci('proportion_delta', stats['proportion_delta_ci']))

            record['proportion_delta_significant'] = stats.get('proportion_delta_significant', False)

            # Per-metric quality scores, deltas, significance flags and CIs.
            quality = stats.get('quality', {})
            quality_delta = stats.get('quality_delta', {})
            quality_ci = stats.get('quality_ci', {})
            quality_delta_ci = stats.get('quality_delta_ci', {})
            sig_flags = stats.get('quality_delta_significant', {})

            for name, value in quality.items():
                record[f'quality_{name}'] = value
                record[f'quality_delta_{name}'] = quality_delta.get(name, 0)
                record[f'quality_delta_{name}_significant'] = sig_flags.get(name, False)
                if name in quality_ci:
                    record.update(_flatten_ci(f'quality_{name}', quality_ci[name]))
                if name in quality_delta_ci:
                    record.update(_flatten_ci(f'quality_delta_{name}', quality_delta_ci[name]))

            records.append(record)

    return pd.DataFrame(records)
92
+
93
+
94
def create_cluster_dataframe(cluster_scores: Dict[str, Any]) -> pd.DataFrame:
    """Convert cluster scores to a tidy dataframe.

    One row per cluster (aggregated across models); confidence intervals and
    per-metric quality scores are expanded into flat columns. The synthetic
    "No properties" cluster is excluded.
    """

    def _flatten_ci(prefix: str, ci: Dict[str, Any]) -> Dict[str, Any]:
        # Expand a CI dict into flat lower/upper/mean columns.
        return {
            f'{prefix}_ci_lower': ci.get('lower', 0),
            f'{prefix}_ci_upper': ci.get('upper', 0),
            f'{prefix}_ci_mean': ci.get('mean', 0),
        }

    records = []
    for cluster_name, stats in cluster_scores.items():
        # Skip the bucket for conversations without extracted properties.
        if cluster_name == "No properties":
            continue

        record = {
            'cluster': cluster_name,
            'size': stats.get('size', 0),
            'proportion': stats.get('proportion', 0),
        }

        if 'proportion_ci' in stats:
            record.update(_flatten_ci('proportion', stats['proportion_ci']))

        # Per-metric quality scores, deltas, significance flags and CIs.
        quality = stats.get('quality', {})
        quality_delta = stats.get('quality_delta', {})
        quality_ci = stats.get('quality_ci', {})
        quality_delta_ci = stats.get('quality_delta_ci', {})
        sig_flags = stats.get('quality_delta_significant', {})

        for name, value in quality.items():
            record[f'quality_{name}'] = value
            record[f'quality_delta_{name}'] = quality_delta.get(name, 0)
            record[f'quality_delta_{name}_significant'] = sig_flags.get(name, False)
            if name in quality_ci:
                record.update(_flatten_ci(f'quality_{name}', quality_ci[name]))
            if name in quality_delta_ci:
                record.update(_flatten_ci(f'quality_delta_{name}', quality_delta_ci[name]))

        records.append(record)

    return pd.DataFrame(records)
148
+
149
+
150
def create_model_dataframe(model_scores: Dict[str, Any]) -> pd.DataFrame:
    """Convert model scores to a tidy dataframe.

    One row per model (aggregated across clusters); confidence intervals and
    per-metric quality scores are expanded into flat columns.
    """

    def _flatten_ci(prefix: str, ci: Dict[str, Any]) -> Dict[str, Any]:
        # Expand a CI dict into flat lower/upper/mean columns.
        return {
            f'{prefix}_ci_lower': ci.get('lower', 0),
            f'{prefix}_ci_upper': ci.get('upper', 0),
            f'{prefix}_ci_mean': ci.get('mean', 0),
        }

    records = []
    for model_name, stats in model_scores.items():
        record = {
            'model': model_name,
            'size': stats.get('size', 0),
            'proportion': stats.get('proportion', 0),
        }

        if 'proportion_ci' in stats:
            record.update(_flatten_ci('proportion', stats['proportion_ci']))

        # Per-metric quality scores, deltas, significance flags and CIs.
        quality = stats.get('quality', {})
        quality_delta = stats.get('quality_delta', {})
        quality_ci = stats.get('quality_ci', {})
        quality_delta_ci = stats.get('quality_delta_ci', {})
        sig_flags = stats.get('quality_delta_significant', {})

        for name, value in quality.items():
            record[f'quality_{name}'] = value
            record[f'quality_delta_{name}'] = quality_delta.get(name, 0)
            record[f'quality_delta_{name}_significant'] = sig_flags.get(name, False)
            if name in quality_ci:
                record.update(_flatten_ci(f'quality_{name}', quality_ci[name]))
            if name in quality_delta_ci:
                record.update(_flatten_ci(f'quality_delta_{name}', quality_delta_ci[name]))

        records.append(record)

    return pd.DataFrame(records)
200
+
201
+
202
def get_quality_metrics(df: pd.DataFrame) -> List[str]:
    """Extract quality metric names from dataframe columns.

    Returns the names of all ``quality_*`` columns with the prefix stripped,
    excluding the derived CI-bound and significance columns.
    """
    derived_suffixes = ('_ci_lower', '_ci_upper', '_ci_mean', '_significant')
    names = []
    for column in df.columns:
        if column.startswith('quality_') and not column.endswith(derived_suffixes):
            names.append(column.replace('quality_', ''))
    return names
206
+
207
+
208
def create_interactive_cluster_plot(cluster_df: pd.DataFrame, model_cluster_df: pd.DataFrame,
                                   metric_col: str, title: str,
                                   ci_lower_col: Optional[str] = None, ci_upper_col: Optional[str] = None,
                                   significant_col: Optional[str] = None) -> go.Figure:
    """Create an interactive cluster plot with dropdown for view mode.

    Builds a bar chart of ``metric_col`` per cluster with two switchable views:
    an aggregated view (one bar per cluster, from ``cluster_df``) and a
    per-model view (one bar series per model, from ``model_cluster_df``).

    Args:
        cluster_df: Per-cluster aggregated metrics (one row per cluster, or a
            frame indexed by cluster).
        model_cluster_df: Per (model, cluster) metrics; must have 'model' and
            'cluster' columns.
        metric_col: Column plotted on the y-axis.
        title: Figure/subplot title.
        ci_lower_col: Optional CI lower-bound column; with ``ci_upper_col``,
            rendered as asymmetric error bars.
        ci_upper_col: Optional CI upper-bound column.
        significant_col: Optional boolean column; truthy clusters get a red
            "*" annotation in the aggregated view.

    Returns:
        A plotly Figure with an updatemenu dropdown toggling the two views.

    NOTE(review): trace insertion order below (aggregated trace first, then
    one trace per model) must stay in sync with the visibility lists built
    for the dropdown buttons at the end of this function.
    """

    # Create the figure with subplots
    fig = make_subplots(
        rows=1, cols=1,
        specs=[[{"secondary_y": False}]],
        subplot_titles=[title]
    )

    # Prepare cluster_df - reset index if cluster is the index
    if 'cluster' not in cluster_df.columns and cluster_df.index.name == 'cluster':
        cluster_df = cluster_df.reset_index()

    # Sort clusters by metric value in descending order for consistent ordering
    cluster_df = cluster_df.sort_values(metric_col, ascending=False)

    # Add aggregated view (default) - using cluster_df
    if ci_lower_col and ci_upper_col and ci_lower_col in cluster_df.columns and ci_upper_col in cluster_df.columns:
        fig.add_trace(
            go.Bar(
                x=cluster_df['cluster'],
                y=cluster_df[metric_col],
                name='Aggregated (All Models)',
                # Asymmetric error bars derived from the CI bound columns.
                error_y=dict(
                    type='data',
                    array=cluster_df[ci_upper_col] - cluster_df[metric_col],
                    arrayminus=cluster_df[metric_col] - cluster_df[ci_lower_col],
                    visible=True
                ),
                visible=True
            )
        )
    else:
        fig.add_trace(
            go.Bar(
                x=cluster_df['cluster'],
                y=cluster_df[metric_col],
                name='Aggregated (All Models)',
                visible=True
            )
        )

    # Grouped by model view - using model_cluster_df (traces start hidden)
    for model in model_cluster_df['model'].unique():
        model_df = model_cluster_df[model_cluster_df['model'] == model]
        # Sort model_df to match the cluster order of the aggregated view
        model_df = model_df.set_index('cluster').reindex(cluster_df['cluster']).reset_index()
        if ci_lower_col and ci_upper_col and ci_lower_col in model_cluster_df.columns and ci_upper_col in model_cluster_df.columns:
            fig.add_trace(
                go.Bar(
                    x=model_df['cluster'],
                    y=model_df[metric_col],
                    name=f'Model: {model}',
                    error_y=dict(
                        type='data',
                        array=model_df[ci_upper_col] - model_df[metric_col],
                        arrayminus=model_df[metric_col] - model_df[ci_lower_col],
                        visible=False
                    ),
                    visible=False
                )
            )
        else:
            fig.add_trace(
                go.Bar(
                    x=model_df['cluster'],
                    y=model_df[metric_col],
                    name=f'Model: {model}',
                    visible=False
                )
            )

    # Add significance markers if available (for aggregated view)
    # Red asterisks (*) indicate clusters with statistically significant quality delta values
    # (confidence intervals that do not contain 0)
    if significant_col and significant_col in cluster_df.columns:
        for i, (cluster, is_sig) in enumerate(zip(cluster_df['cluster'], cluster_df[significant_col])):
            if is_sig:
                fig.add_annotation(
                    x=cluster,
                    y=cluster_df[cluster_df['cluster'] == cluster][metric_col].iloc[0],
                    text="*",
                    showarrow=False,
                    font=dict(size=16, color="red"),
                    yshift=10
                )

    # Update layout; the footer annotation explains the significance marker.
    fig.update_layout(
        title=title,
        xaxis_title="Cluster",
        yaxis_title=metric_col.replace('_', ' ').title(),
        barmode='group',
        height=500,
        showlegend=True,
        annotations=[
            dict(
                text="* = Statistically significant (CI does not contain 0)",
                showarrow=False,
                xref="paper", yref="paper",
                x=0.01, y=0.01,
                xanchor="left", yanchor="bottom",
                font=dict(size=10, color="red")
            )
        ] if significant_col and significant_col in cluster_df.columns else []
    )

    # Add dropdown for view selection - only 2 options
    buttons = []

    # Aggregated view button (all models combined): show trace 0, hide the rest
    visibility = [True] + [False] * len(model_cluster_df['model'].unique())
    buttons.append(
        dict(
            label="Aggregated (All Models)",
            method="update",
            args=[{"visible": visibility, "barmode": "group"}]
        )
    )

    # Grouped by model view: hide trace 0, show one trace per model
    visibility = [False] + [True] * len(model_cluster_df['model'].unique())
    buttons.append(
        dict(
            label="Grouped by Model",
            method="update",
            args=[{"visible": visibility, "barmode": "group"}]
        )
    )

    fig.update_layout(
        updatemenus=[
            dict(
                buttons=buttons,
                direction="down",
                showactive=True,
                x=0.95,
                xanchor="right",
                y=1.25,
                yanchor="top"
            )
        ]
    )

    return fig
357
+
358
+
359
def create_interactive_heatmap(df: pd.DataFrame, value_col: str, title: str,
                              pivot_index: str = 'model', pivot_columns: str = 'cluster',
                              significant_col: Optional[str] = None) -> go.Figure:
    """Create an interactive heatmap with hover information.

    Args:
        df: Tidy frame with ``pivot_index``, ``pivot_columns`` and
            ``value_col`` columns (e.g. from create_model_cluster_dataframe).
        value_col: Column providing cell values.
        title: Figure title.
        pivot_index: Column used as the pivot index (rows before transpose).
        pivot_columns: Column used as the pivot columns.
        significant_col: Optional boolean column; truthy cells get a red "*".

    Returns:
        A plotly Figure with models on the x-axis and clusters on the y-axis.
    """

    # Create pivot table (missing model/cluster combinations become NaN)
    pivot_df = df.pivot(index=pivot_index, columns=pivot_columns, values=value_col)

    # Sort by mean values for consistent ordering
    if pivot_index == 'model':
        # Sort models by their mean values across clusters
        model_means = pivot_df.mean(axis=1).sort_values(ascending=False)
        pivot_df = pivot_df.reindex(model_means.index)
    else:
        # Sort clusters by their mean values across models
        cluster_means = pivot_df.mean(axis=0).sort_values(ascending=False)
        pivot_df = pivot_df.reindex(columns=cluster_means.index)

    # Transpose the data for more intuitive visualization (models on x-axis, clusters on y-axis)
    pivot_df = pivot_df.T

    # Create heatmap: diverging scale centered at 0 for delta metrics,
    # sequential scale otherwise.
    fig = go.Figure(data=go.Heatmap(
        z=pivot_df.values,
        x=pivot_df.columns,  # Models
        y=pivot_df.index,  # Clusters
        colorscale='RdBu_r' if 'delta' in value_col else 'Viridis',
        zmid=0 if 'delta' in value_col else None,
        text=pivot_df.values.round(3),
        texttemplate="%{text}",
        textfont={"size": 10},
        hoverongaps=False
    ))

    # Add significance markers if available
    if significant_col and significant_col in df.columns:
        sig_pivot = df.pivot(index=pivot_index, columns=pivot_columns, values=significant_col)
        # Apply same sorting as the main pivot
        if pivot_index == 'model':
            sig_pivot = sig_pivot.reindex(model_means.index)
        else:
            sig_pivot = sig_pivot.reindex(columns=cluster_means.index)
        sig_pivot = sig_pivot.T  # Transpose to match the main heatmap
        for i, cluster in enumerate(pivot_df.index):
            for j, model in enumerate(pivot_df.columns):
                flag = sig_pivot.loc[cluster, model]
                # BUGFIX: missing (model, cluster) combinations pivot to NaN,
                # and a scalar NaN is truthy as a float — which previously drew
                # spurious "*" markers on absent cells. Require a non-null,
                # truthy flag before annotating.
                if pd.notna(flag) and bool(flag):
                    fig.add_annotation(
                        x=model,
                        y=cluster,
                        text="*",
                        showarrow=False,
                        font=dict(size=16, color="red"),
                        xshift=10,
                        yshift=10
                    )

    fig.update_layout(
        title=title,
        xaxis_title="Model",
        yaxis_title="Cluster",
        height=500,
        annotations=[
            dict(
                text="* = Statistically significant (CI does not contain 0)",
                showarrow=False,
                xref="paper", yref="paper",
                x=0.01, y=0.01,
                xanchor="left", yanchor="bottom",
                font=dict(size=10, color="red")
            )
        ] if significant_col and significant_col in df.columns else []
    )

    return fig
433
+
434
+
435
def create_interactive_model_plot(model_df: pd.DataFrame, model_cluster_df: pd.DataFrame,
                                 metric_col: str, title: str,
                                 ci_lower_col: Optional[str] = None, ci_upper_col: Optional[str] = None,
                                 significant_col: Optional[str] = None) -> go.Figure:
    """Create an interactive model plot with dropdown for view mode.

    Builds a bar chart of ``metric_col`` per model with two switchable views:
    an aggregated view (one bar per model, from ``model_df``) and a
    per-cluster view (one bar series per cluster, from ``model_cluster_df``).

    Args:
        model_df: Per-model aggregated metrics (one row per model, or a frame
            indexed by model).
        model_cluster_df: Per (model, cluster) metrics; must have 'model' and
            'cluster' columns.
        metric_col: Column plotted on the y-axis.
        title: Figure/subplot title.
        ci_lower_col: Optional CI lower-bound column; with ``ci_upper_col``,
            rendered as asymmetric error bars.
        ci_upper_col: Optional CI upper-bound column.
        significant_col: Optional boolean column; truthy models get a red "*"
            annotation in the aggregated view.

    Returns:
        A plotly Figure with an updatemenu dropdown toggling the two views.

    NOTE(review): trace insertion order (aggregated trace first, then one
    trace per cluster) must stay in sync with the dropdown visibility lists
    built at the end of this function.
    """

    # Create the figure with subplots
    fig = make_subplots(
        rows=1, cols=1,
        specs=[[{"secondary_y": False}]],
        subplot_titles=[title]
    )

    # Prepare model_df - reset index if model is the index
    if 'model' not in model_df.columns and model_df.index.name == 'model':
        model_df = model_df.reset_index()

    # Add aggregated view (default) - using model_df
    if ci_lower_col and ci_upper_col and ci_lower_col in model_df.columns and ci_upper_col in model_df.columns:
        fig.add_trace(
            go.Bar(
                x=model_df['model'],
                y=model_df[metric_col],
                name='Aggregated (All Clusters)',
                # Asymmetric error bars derived from the CI bound columns.
                error_y=dict(
                    type='data',
                    array=model_df[ci_upper_col] - model_df[metric_col],
                    arrayminus=model_df[metric_col] - model_df[ci_lower_col],
                    visible=True
                ),
                visible=True
            )
        )
    else:
        fig.add_trace(
            go.Bar(
                x=model_df['model'],
                y=model_df[metric_col],
                name='Aggregated (All Clusters)',
                visible=True
            )
        )

    # Grouped by cluster view - using model_cluster_df (traces start hidden)
    for cluster in model_cluster_df['cluster'].unique():
        cluster_df = model_cluster_df[model_cluster_df['cluster'] == cluster]
        if ci_lower_col and ci_upper_col and ci_lower_col in cluster_df.columns and ci_upper_col in cluster_df.columns:
            fig.add_trace(
                go.Bar(
                    x=cluster_df['model'],
                    y=cluster_df[metric_col],
                    name=f'Cluster: {cluster}',
                    error_y=dict(
                        type='data',
                        array=cluster_df[ci_upper_col] - cluster_df[metric_col],
                        arrayminus=cluster_df[metric_col] - cluster_df[ci_lower_col],
                        visible=False
                    ),
                    visible=False
                )
            )
        else:
            fig.add_trace(
                go.Bar(
                    x=cluster_df['model'],
                    y=cluster_df[metric_col],
                    name=f'Cluster: {cluster}',
                    visible=False
                )
            )

    # Add significance markers if available (for aggregated view)
    if significant_col and significant_col in model_df.columns:
        for i, (model, is_sig) in enumerate(zip(model_df['model'], model_df[significant_col])):
            if is_sig:
                fig.add_annotation(
                    x=model,
                    y=model_df[model_df['model'] == model][metric_col].iloc[0],
                    text="*",
                    showarrow=False,
                    font=dict(size=16, color="red"),
                    yshift=10
                )

    # Update layout
    fig.update_layout(
        title=title,
        xaxis_title="Model",
        yaxis_title=metric_col.replace('_', ' ').title(),
        barmode='group',
        height=500,
        showlegend=True
    )

    # Add dropdown for view selection - only 2 options
    buttons = []

    # Aggregated view button (all clusters combined): show trace 0 only
    visibility = [True] + [False] * len(model_cluster_df['cluster'].unique())
    buttons.append(
        dict(
            label="Aggregated (All Clusters)",
            method="update",
            args=[{"visible": visibility, "barmode": "group"}]
        )
    )

    # Grouped by cluster view: hide trace 0, show one trace per cluster
    visibility = [False] + [True] * len(model_cluster_df['cluster'].unique())
    buttons.append(
        dict(
            label="Grouped by Cluster",
            method="update",
            args=[{"visible": visibility, "barmode": "group"}]
        )
    )

    fig.update_layout(
        updatemenus=[
            dict(
                buttons=buttons,
                direction="down",
                showactive=True,
                x=0.95,
                xanchor="right",
                y=1.25,
                yanchor="top"
            )
        ]
    )

    return fig
567
+
568
+
569
def create_interactive_model_cluster_plot(df: pd.DataFrame, metric_col: str, title: str,
                                         ci_lower_col: Optional[str] = None, ci_upper_col: Optional[str] = None,
                                         significant_col: Optional[str] = None) -> go.Figure:
    """Create an interactive model-cluster plot with grouped bars.

    Args:
        df: Tidy frame with 'cluster', 'model' and ``metric_col`` columns.
        metric_col: Column plotted on the y-axis.
        title: Figure title.
        ci_lower_col: Optional CI lower-bound column; with ``ci_upper_col``,
            rendered as asymmetric error bars.
        ci_upper_col: Optional CI upper-bound column.
        significant_col: Optional boolean column; truthy rows get a red "*".

    Returns:
        A plotly Figure with one bar per (cluster, model), grouped by cluster.
    """
    has_ci = bool(
        ci_lower_col and ci_upper_col
        and ci_lower_col in df.columns and ci_upper_col in df.columns
    )

    if has_ci:
        # Asymmetric error bars derived from the CI bound columns.
        fig = px.bar(
            df,
            x='cluster',
            y=metric_col,
            color='model',
            error_y=df[ci_upper_col] - df[metric_col],
            error_y_minus=df[metric_col] - df[ci_lower_col],
            title=title,
            barmode='group'
        )
    else:
        fig = px.bar(
            df,
            x='cluster',
            y=metric_col,
            color='model',
            title=title,
            barmode='group'
        )

    # Overlay a red "*" on statistically significant rows.
    if significant_col and significant_col in df.columns:
        for _, record in df.iterrows():
            if record[significant_col]:
                fig.add_annotation(
                    x=record['cluster'],
                    y=record[metric_col],
                    text="*",
                    showarrow=False,
                    font=dict(size=16, color="red"),
                    yshift=10
                )

    fig.update_layout(
        height=500,
        xaxis_title="Cluster",
        yaxis_title=metric_col.replace('_', ' ').title()
    )

    return fig
616
+
lmmvibes/utils/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Utilities package for LMM-Vibes."""
lmmvibes/utils/persistent_storage.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utilities for persistent storage in Hugging Face Spaces.
3
+ """
4
+ import os
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+
9
def get_persistent_data_dir() -> Optional[Path]:
    """Get the persistent data directory if available.

    Returns:
        Path to persistent storage directory if available, None otherwise.
    """
    # /data is only present on Hugging Face Spaces with persistent storage.
    if not os.path.isdir("/data"):
        return None
    app_dir = Path("/data/app_data")
    app_dir.mkdir(exist_ok=True)
    return app_dir
20
+
21
+
22
def get_cache_dir() -> Path:
    """Get the appropriate cache directory (persistent if available, temp otherwise).

    Returns:
        Path to cache directory.
    """
    # Prefer the persistent /data volume when it exists (HF Spaces).
    if os.path.isdir("/data"):
        persistent_cache = Path("/data/.cache")
        persistent_cache.mkdir(exist_ok=True)
        return persistent_cache

    # Fallback to temp directory (note: not created here).
    import tempfile
    return Path(tempfile.gettempdir()) / "app_cache"
36
+
37
+
38
def save_uploaded_file(uploaded_file, filename: str) -> Optional[Path]:
    """Save an uploaded file to persistent storage.

    Args:
        uploaded_file: Gradio uploaded file object
        filename: Name to save the file as

    Returns:
        Path to saved file if successful, None otherwise.
    """
    destination_root = get_persistent_data_dir()
    if not (destination_root and uploaded_file):
        # No persistent volume, or nothing was uploaded.
        return None

    destination = destination_root / filename
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Copy the uploaded file (with metadata) into persistent storage.
    import shutil
    shutil.copy2(uploaded_file, destination)
    return destination
58
+
59
+
60
def is_persistent_storage_available() -> bool:
    """Check if persistent storage is available.

    Returns:
        True if persistent storage is available, False otherwise.
    """
    # Same probe as the other helpers: the /data mount marks persistence.
    return os.path.isdir("/data")
67
+
68
+
69
def get_persistent_results_dir() -> Optional[Path]:
    """Get the persistent results directory for storing pipeline results.

    Returns:
        Path to persistent results directory if available, None otherwise.
    """
    base_dir = get_persistent_data_dir()
    if base_dir is None:
        return None
    results_dir = base_dir / "results"
    results_dir.mkdir(exist_ok=True)
    return results_dir
lmmvibes/vis_gradio/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gradio-based visualization for LMM-Vibes pipeline results.
2
+
3
+ This module provides a Gradio interface for exploring model performance,
4
+ cluster analysis, and detailed examples from pipeline output.
5
+
6
+ Usage:
7
+ from lmmvibes.vis_gradio import launch_app
8
+ launch_app(results_dir="path/to/results")
9
+ """
10
+
11
+ from .app import launch_app, create_app
12
+
13
+ __all__ = ["launch_app", "create_app"]
lmmvibes/vis_gradio/app.py ADDED
@@ -0,0 +1,697 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Main Gradio application for LMM-Vibes pipeline results visualization.
3
+
4
+ This module creates a comprehensive Gradio interface for exploring model performance,
5
+ cluster analysis, and detailed examples from pipeline output.
6
+ """
7
+
8
+ import gradio as gr
9
+ import pandas as pd
10
+ import numpy as np
11
+ import plotly.graph_objects as go
12
+ from pathlib import Path
13
+ from typing import Dict, List, Any, Optional, Tuple
14
+ import os
15
+
16
+ from .data_loader import (
17
+ load_pipeline_results,
18
+ load_property_examples,
19
+ scan_for_result_subfolders,
20
+ validate_results_directory,
21
+ get_available_models
22
+ )
23
+ from .utils import (
24
+ compute_model_rankings,
25
+ create_model_summary_card,
26
+ format_cluster_dataframe,
27
+ create_frequency_comparison_table,
28
+ create_frequency_comparison_plots,
29
+ search_clusters_by_text,
30
+ get_top_clusters_for_model,
31
+ create_interactive_cluster_viewer,
32
+ get_cluster_statistics,
33
+ get_unique_values_for_dropdowns,
34
+ get_example_data,
35
+ format_examples_display,
36
+ get_total_clusters_count
37
+ )
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # NEW: centralised state + logic split into per-tab modules
41
+ # ---------------------------------------------------------------------------
42
+ from .state import app_state, BASE_RESULTS_DIR
43
+
44
+ # Tab-specific logic (moved out of this file)
45
+ from .load_data_tab import (
46
+ load_data,
47
+ get_available_experiments,
48
+ get_experiment_choices,
49
+ refresh_experiment_dropdown,
50
+ load_experiment_data,
51
+ )
52
+ from .overview_tab import create_overview
53
+ from .clusters_tab import view_clusters_interactive, view_clusters_table
54
+ from .examples_tab import (
55
+ get_dropdown_choices,
56
+ update_example_dropdowns,
57
+ view_examples,
58
+ )
59
+ # Frequency and debug remain
60
+ from .frequency_tab import create_frequency_comparison, create_frequency_plots
61
+ from .debug_tab import debug_data_structure
62
+ from .plots_tab import create_plots_tab, create_plot_with_toggle, update_quality_metric_dropdown, update_quality_metric_visibility
63
+
64
+ # app_state and BASE_RESULTS_DIR now come from vis_gradio.state
65
+
66
+
67
def update_top_n_slider_maximum():
    """Recompute the "Top N Clusters" slider bounds from the loaded metrics."""
    from .state import app_state

    metrics = app_state.get("metrics")
    if not metrics:
        # No data loaded yet: return a plain default slider.
        return gr.Slider(minimum=1, maximum=10, value=3, step=1)

    total_clusters = get_total_clusters_count(metrics)
    # Keep at least 10 positions on the slider even for tiny datasets.
    upper = max(10, total_clusters)
    return gr.Slider(
        label="Top N Clusters per Model",
        minimum=1,
        maximum=upper,
        value=min(3, upper),
        step=1,
        info=f"Number of top clusters to show per model (max: {total_clusters})"
    )
85
+
86
+
87
def create_app() -> gr.Blocks:
    """Create the main Gradio application.

    Builds the sidebar (data loading + model selection) and the analysis tabs,
    then wires every event handler. Returns the un-launched Blocks instance.
    """

    # Custom CSS for reduced margins and better sidebar layout
    custom_css = """
    .main-container {
        max-width: 100% !important;
        margin: 0 !important;
        padding: 0 !important;
    }
    .gradio-container {
        max-width: 100% !important;
        margin: 0 !important;
        padding: 0 10px !important;
    }
    .tabs {
        margin: 0 !important;
        padding: 0 !important;
    }
    .tab-nav {
        margin: 0 !important;
        padding: 0 !important;
    }
    .tab-content {
        margin: 0 !important;
        padding: 10px !important;
    }
    .sidebar {
        border-right: 1px solid #e0e0e0;
        background-color: #f8f9fa;
        padding: 15px !important;
    }
    .main-content {
        padding: 10px !important;
    }
    """

    with gr.Blocks(title="LMM-Vibes Pipeline Results Explorer", theme=gr.themes.Soft(), css=custom_css) as app:
        gr.Markdown("""
        **Comprehensive analysis of model behavioral properties and performance**

        Upload your pipeline results directory to explore model performance, cluster analysis, and detailed examples.
        """)

        with gr.Row():
            # Sidebar for data loading and model selection
            with gr.Column(scale=1, min_width=300, elem_classes=["sidebar"]):
                gr.Markdown("### Load Data")
                if BASE_RESULTS_DIR:
                    gr.Markdown(f"**Base Results Directory:** `{BASE_RESULTS_DIR}`")
                    gr.Markdown("**WARNING: this might take a while to load**")
                    gr.Markdown("Select an experiment from the dropdown below to load its results.")
                else:
                    gr.Markdown("Provide the path to your pipeline results directory containing either:")
                    gr.Markdown("• **Legacy format**: `model_stats.json` + `clustered_results.jsonl`")
                    gr.Markdown("• **Functional format**: `model_cluster_scores.json` + `cluster_scores.json` + `model_scores.json` + `clustered_results.jsonl`")
                    gr.Markdown("*The app will automatically detect which format you're using.*")

                if BASE_RESULTS_DIR:
                    experiment_dropdown = gr.Dropdown(
                        label="Select Experiment",
                        choices=get_experiment_choices(),
                        value="Select an experiment...",
                        info="Choose an experiment to load its results"
                    )
                else:
                    results_dir_input = gr.Textbox(
                        label="Results Directory Path",
                        placeholder="/path/to/your/results/directory",
                        info="Directory containing pipeline results (legacy or functional format)"
                    )

                load_btn = gr.Button("Load Data", variant="primary")

                data_status = gr.Markdown("")
                models_info = gr.Markdown("")

                # Model selection (choices are populated after data loads)
                selected_models = gr.CheckboxGroup(
                    label="Select Models for Analysis",
                    choices=[],
                    value=[],
                    info="Choose which models to include in comparisons"
                )

            # Main content area with reduced margins
            with gr.Column(scale=4, elem_classes=["main-content"]):
                with gr.Tabs():
                    # Tab 1: Overview
                    with gr.TabItem("📊 Overview"):
                        with gr.Row():
                            min_cluster_size = gr.Slider(
                                label="Minimum Cluster Size",
                                minimum=1, maximum=50, value=5, step=1,
                                info="Hide clusters with fewer than this many examples"
                            )
                            score_significant_only = gr.Checkbox(
                                label="Show Only Frequency Significant Clusters",
                                value=False,
                                info="Only show clusters where the distinctiveness score is statistically significant"
                            )
                            quality_significant_only = gr.Checkbox(
                                label="Show Only Quality Significant Clusters",
                                value=False,
                                info="Only show clusters where the quality score is statistically significant"
                            )

                        with gr.Row():
                            sort_by = gr.Dropdown(
                                label="Sort Clusters By",
                                choices=[
                                    ("Proportion Delta (Descending)", "salience_desc"),
                                    ("Proportion Delta (Ascending)", "salience_asc"),
                                    ("Quality (Ascending)", "quality_asc"),
                                    ("Quality (Descending)", "quality_desc"),
                                    ("Frequency (Descending)", "frequency_desc"),
                                    ("Frequency (Ascending)", "frequency_asc")
                                ],
                                value="quality_asc",
                                info="How to sort clusters within each model card"
                            )
                            top_n_overview = gr.Slider(
                                label="Top N Clusters per Model",
                                minimum=1, maximum=10, value=3, step=1,
                                info="Number of top clusters to show per model"
                            )

                        overview_display = gr.HTML(label="Model Overview")

                        refresh_overview_btn = gr.Button("Refresh Overview")

                    # Tab 2: View Clusters
                    with gr.TabItem("📋 View Clusters"):
                        gr.Markdown("### Interactive Cluster Viewer")
                        gr.Markdown("Explore clusters with detailed property descriptions. Click on clusters to expand and view all properties within each cluster.")

                        with gr.Row():
                            search_clusters = gr.Textbox(
                                label="Search Clusters",
                                placeholder="Search in cluster descriptions...",
                                info="Search for specific terms in cluster descriptions only"
                            )

                        clusters_display = gr.HTML(
                            label="Interactive Cluster Viewer",
                            value="<p style='color: #666; padding: 20px;'>Load data and select models to view clusters</p>"
                        )

                        refresh_clusters_btn = gr.Button("Refresh Clusters")

                    # Tab 3: View Examples
                    with gr.TabItem("📋 View Examples"):
                        with gr.Row():
                            search_examples = gr.Textbox(
                                label="Search Clusters",
                                placeholder="Search in cluster descriptions...",
                                info="Search for specific terms in cluster descriptions to filter examples"
                            )

                        with gr.Row():
                            with gr.Column(scale=1):
                                example_prompt_dropdown = gr.Dropdown(
                                    label="Select Prompt",
                                    choices=["All Prompts"],
                                    value="All Prompts",
                                    info="Choose a specific prompt or 'All Prompts'"
                                )
                            with gr.Column(scale=1):
                                example_model_dropdown = gr.Dropdown(
                                    label="Select Model",
                                    choices=["All Models"],
                                    value="All Models",
                                    info="Choose a specific model or 'All Models'"
                                )
                            with gr.Column(scale=1):
                                example_property_dropdown = gr.Dropdown(
                                    label="Select Cluster (Optional)",
                                    choices=["All Clusters"],
                                    value="All Clusters",
                                    info="Choose a specific cluster or 'All Clusters'"
                                )

                        with gr.Row():
                            max_examples_slider = gr.Slider(
                                label="Max Examples",
                                minimum=1, maximum=20, value=5, step=1,
                                info="Maximum number of examples to display"
                            )
                            use_accordion_checkbox = gr.Checkbox(
                                label="Use Accordion for System/Info Messages",
                                value=True,
                                info="Group system and info messages in collapsible sections"
                            )
                            pretty_print_checkbox = gr.Checkbox(
                                label="Pretty-print dictionaries",
                                value=True,
                                info="Format embedded dictionaries for readability"
                            )
                            show_unexpected_behavior_checkbox = gr.Checkbox(
                                label="Show Unexpected Behavior Only",
                                value=False,
                                info="Filter to show only examples with unexpected behavior"
                            )
                        view_examples_btn = gr.Button("View Examples", variant="primary")

                        examples_display = gr.HTML(
                            label="Examples",
                            value="<p style='color: #666; padding: 20px;'>Load data and select filters to view examples</p>"
                        )

                    # Tab 4: Functional metrics tables
                    with gr.TabItem("📈 Functional Metrics Tables"):
                        gr.Markdown("View the three tables created by the functional metrics pipeline:")
                        gr.Markdown("• **Model-Cluster Scores**: Per model-cluster combination metrics")
                        gr.Markdown("• **Cluster Scores**: Per cluster metrics (aggregated across all models)")
                        gr.Markdown("• **Model Scores**: Per model metrics (aggregated across all clusters)")

                        frequency_table_info = gr.Markdown("")

                        # Three separate tables for the functional metrics
                        gr.Markdown("### Model-Cluster Scores")
                        gr.Markdown("Per model-cluster combination metrics")
                        model_cluster_table = gr.Dataframe(
                            label="Model-Cluster Scores",
                            interactive=False,
                            wrap=True,
                            max_height=600,
                            elem_classes=["frequency-comparison-table"],
                            show_search="search",
                            pinned_columns=2
                        )

                        gr.Markdown("### Cluster Scores")
                        gr.Markdown("Per cluster metrics (aggregated across all models)")
                        cluster_table = gr.Dataframe(
                            label="Cluster Scores",
                            interactive=False,
                            wrap=True,
                            max_height=600,
                            elem_classes=["frequency-comparison-table"],
                            show_search="search",
                            pinned_columns=2
                        )

                        gr.Markdown("### Model Scores")
                        gr.Markdown("Per model metrics (aggregated across all clusters)")
                        model_table = gr.Dataframe(
                            label="Model Scores",
                            interactive=False,
                            wrap=True,
                            max_height=600,
                            elem_classes=["frequency-comparison-table"],
                            show_search="search"
                        )

                    # Tab 5: Plots
                    with gr.TabItem("📊 Plots"):
                        plot_display, plot_info, show_ci_checkbox, plot_type_dropdown, quality_metric_dropdown = create_plots_tab()

                    # Tab 6: Debug Data
                    with gr.TabItem("🐛 Debug Data"):
                        gr.Markdown("### Data Structure Debug")
                        gr.Markdown("If tables aren't loading correctly, use this tab to inspect your data structure and identify issues.")

                        debug_display = gr.HTML(
                            label="Debug Information",
                            value="<p style='color: #666; padding: 20px;'>Load data to see debug information</p>"
                        )

                        debug_btn = gr.Button("Show Debug Info", variant="secondary")

        # ------------------------------------------------------------------
        # Event handlers
        # ------------------------------------------------------------------
        # Shared input/output lists so every handler stays consistent. The
        # original repeated these lists at each call site, which is how the
        # refresh-clusters wiring bug below crept in.
        example_inputs = [
            example_prompt_dropdown,
            example_model_dropdown,
            example_property_dropdown,
            max_examples_slider,
            use_accordion_checkbox,
            pretty_print_checkbox,
            search_examples,
            show_unexpected_behavior_checkbox,
        ]
        overview_inputs = [selected_models, top_n_overview, score_significant_only,
                           quality_significant_only, sort_by, min_cluster_size]
        # view_clusters_interactive(selected_models, cluster_level, search_term):
        # the cluster level must be supplied explicitly (always "fine" here).
        cluster_inputs = [selected_models, gr.State("fine"), search_clusters]
        plot_inputs = [plot_type_dropdown, quality_metric_dropdown, show_ci_checkbox]
        freq_outputs = [model_cluster_table, cluster_table, model_table, frequency_table_info]

        def _wire_post_load(load_event):
            """Chain the refresh steps that run after data is (re)loaded."""
            (load_event.then(
                fn=update_example_dropdowns,
                outputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown]
            ).then(
                fn=view_examples,
                inputs=example_inputs,
                outputs=[examples_display]
            ).then(
                fn=update_top_n_slider_maximum,
                outputs=[top_n_overview]
            ).then(
                fn=create_frequency_comparison,
                inputs=[selected_models],
                outputs=freq_outputs
            ).then(
                fn=create_plot_with_toggle,
                inputs=plot_inputs,
                outputs=[plot_display, plot_info]
            ).then(
                fn=update_quality_metric_dropdown,
                outputs=[quality_metric_dropdown]
            ))

        if BASE_RESULTS_DIR:
            # Experiment dropdown drives loading (defensive locals() check kept
            # because the component only exists on this branch).
            if 'experiment_dropdown' in locals():
                _wire_post_load(experiment_dropdown.change(
                    fn=load_experiment_data,
                    inputs=[experiment_dropdown],
                    outputs=[data_status, models_info, selected_models]
                ))
        else:
            # Manual path entry drives loading.
            if 'load_btn' in locals() and 'results_dir_input' in locals():
                _wire_post_load(load_btn.click(
                    fn=load_data,
                    inputs=[results_dir_input],
                    outputs=[data_status, models_info, selected_models]
                ))

        refresh_overview_btn.click(
            fn=create_overview,
            inputs=overview_inputs,
            outputs=[overview_display]
        )

        # Bug fix: this handler previously passed only
        # [selected_models, search_clusters], so the search text was bound to
        # view_clusters_interactive's `cluster_level` parameter. Use the same
        # three-element input list as the auto-refresh handlers below.
        refresh_clusters_btn.click(
            fn=view_clusters_interactive,
            inputs=cluster_inputs,
            outputs=[clusters_display]
        )

        # View Examples: explicit button plus auto-refresh on any filter change.
        view_examples_btn.click(
            fn=view_examples,
            inputs=example_inputs,
            outputs=[examples_display]
        )
        for _control in (example_prompt_dropdown, example_model_dropdown,
                         example_property_dropdown, search_examples,
                         show_unexpected_behavior_checkbox):
            _control.change(
                fn=view_examples,
                inputs=example_inputs,
                outputs=[examples_display]
            )

        # Functional metrics tables refresh with the model selection.
        selected_models.change(fn=create_frequency_comparison, inputs=[selected_models], outputs=freq_outputs)

        debug_btn.click(
            fn=debug_data_structure,
            outputs=[debug_display]
        )

        # Plots tab: redraw on any toggle; plot type also updates which
        # quality-metric options are visible before redrawing.
        show_ci_checkbox.change(
            fn=create_plot_with_toggle,
            inputs=plot_inputs,
            outputs=[plot_display, plot_info]
        )
        quality_metric_dropdown.change(
            fn=create_plot_with_toggle,
            inputs=plot_inputs,
            outputs=[plot_display, plot_info]
        )
        plot_type_dropdown.change(
            fn=update_quality_metric_visibility,
            inputs=[plot_type_dropdown],
            outputs=[quality_metric_dropdown]
        ).then(
            fn=create_plot_with_toggle,
            inputs=plot_inputs,
            outputs=[plot_display, plot_info]
        )

        # Overview auto-refresh: any relevant control change redraws the cards.
        for _control in (selected_models, score_significant_only, quality_significant_only,
                         sort_by, top_n_overview, min_cluster_size):
            _control.change(
                fn=create_overview,
                inputs=overview_inputs,
                outputs=[overview_display]
            )

        # Cluster viewer auto-refresh on model selection or search change.
        for _control in (selected_models, search_clusters):
            _control.change(
                fn=view_clusters_interactive,
                inputs=cluster_inputs,
                outputs=[clusters_display]
            )

    return app
596
+
597
+
598
def launch_app(results_dir: Optional[str] = None,
               share: bool = False,
               server_name: str = "127.0.0.1",
               server_port: int = 7860,
               **kwargs) -> None:
    """Launch the Gradio application.

    Args:
        results_dir: Optional path to base results directory containing experiment subfolders
        share: Whether to create a public link
        server_name: Server address
        server_port: Server port
        **kwargs: Additional arguments for gr.Blocks.launch()

    Side effects: mutates the module-level BASE_RESULTS_DIR and, when exactly
    one experiment is found, pre-populates the shared app_state dict before
    blocking in app.launch().
    """
    global BASE_RESULTS_DIR
    # NOTE(review): BASE_RESULTS_DIR was imported with `from .state import ...`,
    # so this rebinds only app.py's copy of the name; code reading
    # state.BASE_RESULTS_DIR directly will not see the update — confirm intended.

    # Set the global base results directory
    if results_dir:
        BASE_RESULTS_DIR = results_dir
        print(f"📁 Base results directory set to: {results_dir}")

        # Check if it's a valid directory
        if not os.path.exists(results_dir):
            print(f"⚠️ Warning: Base results directory does not exist: {results_dir}")
            BASE_RESULTS_DIR = None
        else:
            # Scan for available experiments
            experiments = get_available_experiments(results_dir)
            print(f"🔍 Found {len(experiments)} experiments: {experiments}")

    app = create_app()

    # Auto-load data if results_dir is provided and contains a single experiment
    # (re-scans the directory; could reuse `experiments` from above)
    if results_dir and os.path.exists(results_dir):
        experiments = get_available_experiments(results_dir)
        if len(experiments) == 1:
            # Auto-load the single experiment
            experiment_path = os.path.join(results_dir, experiments[0])
            try:
                clustered_df, model_stats, model_cluster_df, results_path = load_pipeline_results(experiment_path)
                app_state['clustered_df'] = clustered_df
                app_state['model_stats'] = model_stats
                app_state['model_cluster_df'] = model_cluster_df
                app_state['results_path'] = results_path
                app_state['available_models'] = get_available_models(model_stats)
                app_state['current_results_dir'] = experiment_path
                print(f"✅ Auto-loaded data from: {experiment_path}")
            except Exception as e:
                # Best-effort: a failed auto-load leaves the app usable via the UI.
                print(f"❌ Failed to auto-load data: {e}")
        elif len(experiments) > 1:
            print(f"📋 Multiple experiments found. Please select one from the dropdown.")

    print(f"🚀 Launching Gradio app on {server_name}:{server_port}")
    print(f"Share mode: {share}")
    print(f"🔧 Additional kwargs: {kwargs}")

    try:
        app.launch(
            share=share,
            server_name=server_name,
            server_port=server_port,
            show_error=True,  # Show detailed error messages
            quiet=False,  # Show more verbose output
            **kwargs
        )
    except Exception as e:
        print(f"❌ Failed to launch on port {server_port}: {e}")
        print("🔄 Trying alternative port configuration...")

        # Try with a port range instead of port 0
        try:
            # Try ports in a reasonable range
            for alt_port in [8080, 8081, 8082, 8083, 8084, 8085, 8086, 8087, 8088, 8089]:
                try:
                    print(f"🔄 Trying port {alt_port}...")
                    app.launch(
                        share=share,
                        server_name=server_name,
                        server_port=alt_port,
                        show_error=True,
                        quiet=False,
                        **kwargs
                    )
                    break  # If successful, break out of the loop
                except Exception as port_error:
                    # Only a busy-port error moves on to the next candidate;
                    # anything else is re-raised immediately.
                    if "Cannot find empty port" in str(port_error):
                        print(f"   Port {alt_port} is busy, trying next...")
                        continue
                    else:
                        raise port_error
            else:
                # If we get here, all ports in our range were busy
                raise Exception("All attempted ports (8080-8089) are busy")

        except Exception as e2:
            print(f"❌ Failed to launch with alternative ports: {e2}")
            print("💡 Try specifying a different port manually:")
            print(f"   python -m lmmvibes.vis_gradio.launcher --port 9000")
            print(f"   python -m lmmvibes.vis_gradio.launcher --auto_port")
            raise e2
lmmvibes/vis_gradio/clusters_tab.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Helpers for the **View Clusters** tab – both the interactive HTML and
2
+ fallback dataframe view."""
3
+ from typing import List
4
+
5
+ import pandas as pd
6
+
7
+ from .state import app_state
8
+ from .utils import (
9
+ search_clusters_by_text,
10
+ search_clusters_only,
11
+ create_interactive_cluster_viewer,
12
+ get_cluster_statistics,
13
+ format_cluster_dataframe,
14
+ )
15
+
16
+ __all__ = ["view_clusters_interactive", "view_clusters_table"]
17
+
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # Interactive HTML view
21
+ # ---------------------------------------------------------------------------
22
+
23
def view_clusters_interactive(
    selected_models: List[str],
    cluster_level: str,
    search_term: str = "",
) -> str:
    """Build the interactive HTML cluster viewer for the "View Clusters" tab.

    Args:
        selected_models: Models to include in the viewer and statistics.
        cluster_level: "fine" or "coarse" clustering granularity.
        search_term: Optional substring filter over cluster descriptions.

    Returns:
        A self-contained HTML string: a stats banner, optional filter chips,
        then the expandable cluster viewer — or an error paragraph when no
        data is loaded / no clusters are available.
    """
    if app_state["clustered_df"] is None:
        return (
            "<p style='color: #e74c3c; padding: 20px;'>❌ Please load data first "
            "using the 'Load Data' tab</p>"
        )

    # Rows without a property description cannot be clustered meaningfully.
    df = app_state["clustered_df"].dropna(subset=["property_description"]).copy()

    # Apply search filter first
    if search_term and search_term.strip():
        df = search_clusters_only(df, search_term.strip(), cluster_level)

    # Build interactive viewer
    cluster_html = create_interactive_cluster_viewer(df, selected_models, cluster_level)

    # Statistics summary at the top
    stats = get_cluster_statistics(df, selected_models)
    if not stats:
        return (
            "<p style='color: #e74c3c; padding: 20px;'>❌ No cluster data available</p>"
        )

    # Get additional metrics from cluster_scores
    cluster_scores = app_state.get("metrics", {}).get("cluster_scores", {})

    # Calculate average quality scores and frequency
    total_frequency = 0  # NOTE(review): accumulated below but never used — dead code?
    quality_scores_list = []
    metric_names = set()

    for cluster_name, cluster_data in cluster_scores.items():
        total_frequency += cluster_data.get("proportion", 0) * 100
        quality_scores = cluster_data.get("quality", {})
        if quality_scores:
            # Flatten every per-metric quality value into one pool for averaging.
            quality_scores_list.extend(quality_scores.values())
            metric_names.update(quality_scores.keys())

    avg_quality = sum(quality_scores_list) / len(quality_scores_list) if quality_scores_list else 0
    metrics_suffix = f" ({', '.join(sorted(metric_names))})" if metric_names else ""

    # Gradient banner with headline statistics.
    stats_html = f"""
    <div style="
        background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
        color: white;
        padding: 20px;
        border-radius: 8px;
        margin-bottom: 20px;
        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
    ">
        <h3 style="margin: 0 0 15px 0;">Cluster Statistics</h3>
        <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
            <div>
                <div style="font-size: 24px; font-weight: bold;">{stats['total_properties']:,}</div>
                <div style="opacity: 0.9;">Total Properties</div>
            </div>
            <div>
                <div style="font-size: 24px; font-weight: bold;">{stats['total_models']}</div>
                <div style="opacity: 0.9;">Models</div>
            </div>
            <div>
                <div style="font-size: 24px; font-weight: bold;">{avg_quality:.3f}</div>
                <div style="opacity: 0.9;">Avg Quality{metrics_suffix}</div>
            </div>
    """

    # Level-specific counters; keys are only present when that level exists.
    if cluster_level == "fine" and "fine_clusters" in stats:
        stats_html += f"""
            <div>
                <div style="font-size: 24px; font-weight: bold;">{stats['fine_clusters']}</div>
                <div style="opacity: 0.9;">Fine Clusters</div>
            </div>
            <div>
                <div style="font-size: 24px; font-weight: bold;">{stats['avg_properties_per_fine_cluster']:.1f}</div>
                <div style="opacity: 0.9;">Avg Properties/Cluster</div>
            </div>
        """
    elif cluster_level == "coarse" and "coarse_clusters" in stats:
        stats_html += f"""
            <div>
                <div style="font-size: 24px; font-weight: bold;">{stats['coarse_clusters']}</div>
                <div style="opacity: 0.9;">Coarse Clusters</div>
            </div>
            <div>
                <div style="font-size: 24px; font-weight: bold;">{stats['avg_properties_per_coarse_cluster']:.1f}</div>
                <div style="opacity: 0.9;">Avg Properties/Cluster</div>
            </div>
        """

    stats_html += """
        </div>
    </div>
    """

    # Add a note if coarse clusters were requested but not available
    if cluster_level == "coarse" and "coarse_clusters" not in stats and "fine_clusters" in stats:
        stats_html += """
        <div style="
            background: #fff3cd;
            border-left: 4px solid #ffc107;
            padding: 10px 15px;
            margin-bottom: 15px;
            border-radius: 4px;
        ">
            ⚠️ <strong>Note:</strong> Coarse clusters not available in this dataset. Showing fine clusters instead.
        </div>
        """

    # Additional filter chips
    filter_info = ""
    if search_term and search_term.strip():
        filter_info += f"""
        <div style="
            background: #e3f2fd;
            border-left: 4px solid #2196f3;
            padding: 10px 15px;
            margin-bottom: 15px;
            border-radius: 4px;
        ">
            🔍 <strong>Search Filter:</strong> "{search_term}"
        </div>
        """

    if selected_models:
        filter_info += f"""
        <div style="
            background: #f3e5f5;
            border-left: 4px solid #9c27b0;
            padding: 10px 15px;
            margin-bottom: 15px;
            border-radius: 4px;
        ">
            🎯 <strong>Selected Models:</strong> {', '.join(selected_models)}
        </div>
        """

    return stats_html + filter_info + cluster_html
164
+
165
+
166
+ # ---------------------------------------------------------------------------
167
+ # Dataframe fallback view
168
+ # ---------------------------------------------------------------------------
169
+
170
def view_clusters_table(
    selected_models: List[str],
    cluster_level: str,
    search_term: str = "",
) -> pd.DataFrame:
    """Tabular fallback view of the clusters.

    Mirrors ``view_clusters_interactive`` but returns a plain DataFrame;
    every error/empty case is reported as a one-row "Message" frame.
    """
    def _message(text: str) -> pd.DataFrame:
        # Uniform shape for all informational / error results.
        return pd.DataFrame({"Message": [text]})

    source = app_state["clustered_df"]
    if source is None:
        return _message("Please load data first using the 'Load Data' tab")

    df = source.copy()
    query = search_term.strip() if search_term else ""
    if query:
        df = search_clusters_only(df, query, cluster_level)

    formatted_df = format_cluster_dataframe(df, selected_models, cluster_level)
    if not formatted_df.empty:
        return formatted_df

    # Empty result: explain the most likely cause.
    if query:
        return _message(f"No results found for search term '{search_term}'. Try a different search term.")
    if selected_models:
        available_models = df["model"].unique().tolist() if "model" in df.columns else []
        return _message(
            f"No data found for selected models: {', '.join(selected_models)}. "
            f"Available models: {', '.join(available_models)}"
        )
    return _message("No data available. Please check your data files and try reloading.")
lmmvibes/vis_gradio/conversation_display.py ADDED
@@ -0,0 +1,507 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ """Conversation display helpers for vis_gradio.
4
+
5
+ This module encapsulates everything related to:
6
+ β€’ safely parsing model responses (lists / dicts / JSON strings)
7
+ β€’ pretty-printing embedded dictionaries for readability
8
+ β€’ converting multiple conversation formats to the OpenAI chat list format
9
+ β€’ rendering that list as HTML (including accordion grouping + raw JSON viewer).
10
+
11
+ Moving this logic out of utils.py keeps the latter lean and focussed on general
12
+ analytics utilities.
13
+ """
14
+
15
+ from typing import List, Dict, Any
16
+ import ast
17
+ import json
18
+ import html
19
+ import markdown
20
+ import re
21
+
22
+ __all__: List[str] = [
23
+ "convert_to_openai_format",
24
+ "display_openai_conversation_html",
25
+ "pretty_print_embedded_dicts",
26
+ ]
27
+
28
+ # ---------------------------------------------------------------------------
29
+ # Pretty-printing helpers
30
+ # ---------------------------------------------------------------------------
31
+
32
+ def _find_balanced_spans(text: str):
33
+ """Return (start, end) spans of balanced {...} or [...] regions in *text*."""
34
+ spans, stack = [], []
35
+ for i, ch in enumerate(text):
36
+ if ch in "{[":
37
+ stack.append((ch, i))
38
+ elif ch in "]}" and stack:
39
+ opener, start = stack.pop()
40
+ if (opener, ch) in {("{", "}"), ("[", "]")} and not stack:
41
+ spans.append((start, i + 1))
42
+ return spans
43
+
44
+
45
+ def _try_parse_slice(slice_: str):
46
+ """Attempt to parse *slice_* into a Python object; return None on failure."""
47
+ try:
48
+ return ast.literal_eval(slice_)
49
+ except Exception:
50
+ try:
51
+ return json.loads(slice_)
52
+ except Exception:
53
+ return None
54
+
55
+
56
+ def _find_code_spans(text: str) -> List[tuple]:
57
+ """Return spans for markdown code regions to be preserved as-is.
58
+
59
+ Includes:
60
+ - fenced code blocks delimited by ``` ... ```
61
+ - inline code segments delimited by `...`
62
+ """
63
+ spans: List[tuple] = []
64
+
65
+ # Fenced blocks ``` ... ``` (language spec allowed after opening fence)
66
+ idx = 0
67
+ while True:
68
+ start = text.find("```", idx)
69
+ if start == -1:
70
+ break
71
+ # Find the end fence
72
+ end = text.find("```", start + 3)
73
+ if end == -1:
74
+ # Unclosed fence: treat rest of string as code
75
+ spans.append((start, len(text)))
76
+ break
77
+ spans.append((start, end + 3))
78
+ idx = end + 3
79
+
80
+ # Inline code `...`
81
+ for m in re.finditer(r"`[^`]*`", text, flags=re.DOTALL):
82
+ spans.append((m.start(), m.end()))
83
+
84
+ # Sort and merge overlapping spans
85
+ spans.sort()
86
+ merged: List[tuple] = []
87
+ for s, e in spans:
88
+ if not merged or s > merged[-1][1]:
89
+ merged.append((s, e))
90
+ else:
91
+ merged[-1] = (merged[-1][0], max(merged[-1][1], e))
92
+ return merged
93
+
94
+
95
+ def _is_inside_any_span(start: int, end: int, spans: List[tuple]) -> bool:
96
+ for s, e in spans:
97
+ if start >= s and end <= e:
98
+ return True
99
+ return False
100
+
101
+
102
def pretty_print_embedded_dicts(text: str) -> str:
    """Replace dicts or list-of-dicts with a ``<pre>`` block, except inside code.

    Dict-like regions inside markdown code spans (inline backticks or fenced
    blocks) are left untouched so code examples render verbatim.  Text outside
    the replaced regions is HTML-escaped (quotes preserved).
    """
    if not text:
        return text

    protected = _find_code_spans(text)

    pieces: List[str] = []
    cursor = 0
    for begin, finish in _find_balanced_spans(text):
        obj = _try_parse_slice(text[begin:finish])
        looks_like_data = isinstance(obj, dict) or (
            isinstance(obj, list) and bool(obj) and all(isinstance(item, dict) for item in obj)
        )
        if not looks_like_data or _is_inside_any_span(begin, finish, protected):
            # Not a renderable dict region, or it lives inside code: skip.
            continue
        pieces.append(html.escape(text[cursor:begin], quote=False))
        rendered = json.dumps(obj, indent=2, ensure_ascii=False)
        pieces.append(
            f"<pre style='background:#f8f9fa;padding:10px;border-radius:4px;overflow-x:auto;'>{rendered}</pre>"
        )
        cursor = finish
    pieces.append(html.escape(text[cursor:], quote=False))
    return "".join(pieces)
129
+
130
+ # ---------------------------------------------------------------------------
131
+ # Format conversion
132
+ # ---------------------------------------------------------------------------
133
+
134
def convert_to_openai_format(response_data: Any):
    """Normalise *response_data* into an OpenAI-style chat message list.

    Lists pass through untouched.  Strings are parsed first as Python
    literals (tolerates single quotes), then as JSON; a string that does not
    decode to a list becomes a single assistant message.  Any other type is
    stringified into an assistant message.
    """
    if isinstance(response_data, list):
        return response_data

    if isinstance(response_data, str):
        for parse, errors in (
            (ast.literal_eval, (ValueError, SyntaxError)),
            (json.loads, (json.JSONDecodeError,)),
        ):
            try:
                candidate = parse(response_data)
            except errors:
                continue
            if isinstance(candidate, list):
                return candidate
        # Fallback plain-text assistant message
        return [{"role": "assistant", "content": response_data}]

    # Fallback for any other type
    return [{"role": "assistant", "content": str(response_data)}]
157
+
158
+ # ---------------------------------------------------------------------------
159
+ # HTML rendering
160
+ # ---------------------------------------------------------------------------
161
+
162
def _markdown(text: str, *, pretty_print_dicts: bool = True) -> str:
    """Render *text* as markdown HTML for embedding in the Gradio UI.

    When *pretty_print_dicts* is true, embedded dict / list-of-dict literals
    are first replaced with formatted ``<pre>`` blocks via
    :func:`pretty_print_embedded_dicts`; otherwise the raw text is
    HTML-escaped (quotes preserved) before markdown conversion.
    """
    processed = pretty_print_embedded_dicts(text) if pretty_print_dicts else html.escape(text, quote=False)

    # Configure extensions for proper code block handling
    extensions = ["fenced_code"]
    extension_configs = {}

    # Syntax highlighting is opt-in: codehilite is only enabled when the
    # optional Pygments package is importable.
    try:
        import pygments
        extensions.append("codehilite")
        extension_configs['codehilite'] = {
            'css_class': 'highlight',
            'use_pygments': True,
            'guess_lang': True,
            'linenums': False
        }
    except ImportError:
        pass

    # Convert newlines to <br> only outside of code blocks
    # Process fenced code blocks first, then handle line breaks
    result = markdown.markdown(processed, extensions=extensions, extension_configs=extension_configs)

    # Add line breaks for non-code content (simple approach)
    # This replaces single newlines with <br> but preserves code blocks
    import re  # NOTE: shadows the module-level import; kept for byte-compat

    # Split by code blocks to avoid affecting them.  re.split with a single
    # capturing group alternates non-code / code parts, so even indices are
    # always outside <pre>/<code>.
    code_block_pattern = r'(<pre[^>]*>.*?</pre>|<code[^>]*>.*?</code>)'
    parts = re.split(code_block_pattern, result, flags=re.DOTALL)

    for i in range(0, len(parts), 2):  # Process non-code parts only
        if i < len(parts):
            # Replace single newlines with <br>, but not double newlines (paragraphs)
            parts[i] = re.sub(r'(?<!\n)\n(?!\n)', '<br>\n', parts[i])

    return ''.join(parts)
200
+
201
+
202
def display_openai_conversation_html(conversation_data: List[Dict[str, Any]], *, use_accordion: bool = True, pretty_print_dicts: bool = True) -> str:
    """Convert an OpenAI-style conversation list into styled HTML for Gradio.

    Args:
        conversation_data: List of message dicts with ``role`` / ``content``
            keys.  Non-dict entries are silently skipped.  A ``content`` dict
            containing a ``"text"`` key is unwrapped to that text.
        use_accordion: When True, system and info messages are grouped into
            collapsible ``<details>`` sections; all other roles render inline.
        pretty_print_dicts: When True, dict/list-of-dict content renders as an
            indented JSON ``<pre>`` block; otherwise as a one-line ``<code>``.

    Returns:
        A self-contained HTML fragment: a ``<style>`` block followed by a
        collapsed raw-JSON viewer and the formatted messages.
    """

    if not conversation_data:
        return "<p>No conversation data available</p>"

    # Collapsed raw JSON section for debugging
    raw_json = json.dumps(conversation_data, indent=2, ensure_ascii=False)
    html_out = f"""
    <details style="margin: 8px 0;">
        <summary style="cursor: pointer; font-weight: 600;">
            Click to see raw response ({len(conversation_data)})
        </summary>
        <div style="padding: 8px 15px;">
            <pre style="white-space: pre-wrap; word-wrap: break-word; background: #f8f9fa; padding: 10px; border-radius: 4px; overflow-x: auto;">{raw_json}</pre>
        </div>
    </details>
    """

    # Accent colour per role; unknown roles fall back to grey (see below).
    role_colors = {
        "system": "#ff6b6b",
        "info": "#4ecdc4",
        "assistant": "#45b7d1",
        "tool": "#96ceb4",
        "user": "#feca57",
    }

    def _format_msg(role: str, content: Any) -> str:
        # Render one message as a colour-accented card.
        if isinstance(content, dict) or (isinstance(content, list) and content and all(isinstance(d, dict) for d in content)):
            if pretty_print_dicts:
                content_html = (
                    f"<pre style='background: #f8f9fa; padding: 10px; border-radius: 4px; overflow-x: auto;'>{json.dumps(content, indent=2, ensure_ascii=False)}</pre>"
                )
            else:
                content_html = f"<code>{html.escape(json.dumps(content, ensure_ascii=False))}</code>"
        elif isinstance(content, str):
            content_html = _markdown(content, pretty_print_dicts=pretty_print_dicts)
        elif content is None:
            content_html = "<em>(No content)</em>"
        else:
            content_html = str(content)
        color = role_colors.get(role.lower(), "#95a5a6")
        return (
            f"<div style='border-left: 4px solid {color}; margin: 8px 0; background-color: #ffffff; padding: 12px; border-radius: 0 8px 8px 0;'>"
            f"<div style='font-weight: 600; color: {color}; margin-bottom: 8px; text-transform: capitalize; font-size: 14px;'>{role}</div>"
            f"<div style='color: #333; line-height: 1.6; font-family: \"Segoe UI\", Tahoma, Geneva, Verdana, sans-serif;'>{content_html}</div>"
            "</div>"
        )

    if use_accordion:
        # Bucket messages: system and info get collapsed, the rest stay inline.
        system_msgs, info_msgs, other_msgs = [], [], []
        for m in conversation_data:
            if not isinstance(m, dict):
                continue
            role = m.get("role", "unknown").lower()
            content = m.get("content", "")
            if isinstance(content, dict) and "text" in content:
                content = content["text"]
            if role == "system":
                system_msgs.append((role, content))
            elif role == "info":
                info_msgs.append((role, content))
            else:
                other_msgs.append((role, content))

        def _accordion(title: str, items: List):
            # Wrap *items* in a <details> block; empty buckets emit nothing.
            if not items:
                return ""
            inner = "".join(_format_msg(r, c) for r, c in items)
            return (
                f"<details style='margin: 8px 0;'>"
                f"<summary style='cursor: pointer; font-weight: 600;'>"
                f"{html.escape(title)} ({len(items)})"  # e.g. "Click to see system messages (3)"
                f"</summary>"
                f"<div style='padding: 8px 15px;'>{inner}</div>"
                "</details>"
            )

        html_out += _accordion("Click to see system messages", system_msgs)
        html_out += _accordion("Click to see info messages", info_msgs)
        for r, c in other_msgs:
            html_out += _format_msg(r, c)
    else:
        # No accordion: just render everything
        for m in conversation_data:
            if not isinstance(m, dict):
                continue
            role = m.get("role", "unknown").lower()
            content = m.get("content", "")
            if isinstance(content, dict) and "text" in content:
                content = content["text"]
            html_out += _format_msg(role, content)

    # CSS for proper code block styling and summary hover effects
    css_styles = """
    <style>
    :root {
        /* Code block color palette - GitHub Light inspired */
        --code-bg: #f6f8fa;
        --code-text: #24292f;
        --code-comment: #6a737d;
        --code-keyword: #d73a49;
        --code-string: #032f62;
        --code-number: #005cc5;
        --code-operator: #24292f;
        --code-function: #6f42c1;
        --code-border: #d0d7de;

        /* Inline code colors - same light theme */
        --inline-code-bg: #f3f4f6;
        --inline-code-text: #24292f;
        --inline-code-border: #d1d5db;

        /* Code block structure */
        --code-border-radius: 8px;
        --code-padding: 16px;
        --code-font-size: 14px;
        --code-line-height: 1.5;
        --code-font-family: 'JetBrains Mono', 'Fira Code', 'Cascadia Code', 'SF Mono', Consolas, 'Liberation Mono', Menlo, Courier, monospace;
    }

    /* Base code styling */
    pre, code {
        font-family: var(--code-font-family) !important;
        font-size: var(--code-font-size) !important;
        line-height: var(--code-line-height) !important;
        font-variant-ligatures: normal !important;
        -webkit-font-smoothing: antialiased !important;
        -moz-osx-font-smoothing: grayscale !important;
    }

    /* Fenced code blocks - light theme */
    .highlight, .codehilite, pre.highlight, pre.codehilite,
    .language-python, .language-text, .language-bash {
        background: var(--code-bg) !important;
        color: var(--code-text) !important;
        border: 1px solid var(--code-border) !important;
        border-radius: var(--code-border-radius) !important;
        padding: var(--code-padding) !important;
        margin: 12px 0 !important;
        overflow-x: auto !important;
        box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05) !important;
        position: relative !important;
        white-space: pre !important;
        display: block !important;
    }

    .highlight pre, .codehilite pre {
        background: transparent !important;
        color: inherit !important;
        margin: 0 !important;
        padding: 0 !important;
        border: none !important;
        border-radius: 0 !important;
        overflow: visible !important;
        white-space: pre !important;
        display: block !important;
    }

    /* Ensure code blocks preserve formatting */
    .highlight code, .codehilite code {
        white-space: pre !important;
        display: block !important;
        padding: 0 !important;
        margin: 0 !important;
        background: transparent !important;
        border: none !important;
        font-size: inherit !important;
        line-height: inherit !important;
    }

    /* Add language label for fenced blocks */
    .highlight::before, .codehilite::before {
        content: 'python';
        position: absolute;
        top: 8px;
        right: 12px;
        background: rgba(0, 0, 0, 0.05);
        color: #586069;
        padding: 2px 8px;
        border-radius: 4px;
        font-size: 11px;
        font-weight: 500;
        text-transform: uppercase;
        letter-spacing: 0.5px;
    }

    /* Syntax highlighting for Python - Light theme */
    .highlight .k, .codehilite .k, /* keywords */
    .highlight .kn, .codehilite .kn, /* keyword.namespace */
    .highlight .kp, .codehilite .kp, /* keyword.pseudo */
    .highlight .kr, .codehilite .kr, /* keyword.reserved */
    .highlight .kt, .codehilite .kt /* keyword.type */
    {
        color: var(--code-keyword) !important;
        font-weight: 600 !important;
    }

    .highlight .s, .codehilite .s, /* strings */
    .highlight .s1, .codehilite .s1, /* string.single */
    .highlight .s2, .codehilite .s2, /* string.double */
    .highlight .se, .codehilite .se /* string.escape */
    {
        color: var(--code-string) !important;
    }

    .highlight .c, .codehilite .c, /* comments */
    .highlight .c1, .codehilite .c1, /* comment.single */
    .highlight .cm, .codehilite .cm /* comment.multiline */
    {
        color: var(--code-comment) !important;
        font-style: italic !important;
    }

    .highlight .m, .codehilite .m, /* numbers */
    .highlight .mi, .codehilite .mi, /* number.integer */
    .highlight .mf, .codehilite .mf, /* number.float */
    .highlight .mo, .codehilite .mo /* number.octal */
    {
        color: var(--code-number) !important;
        font-weight: 600 !important;
    }

    .highlight .nf, .codehilite .nf, /* function names */
    .highlight .fm, .codehilite .fm /* function.magic */
    {
        color: var(--code-function) !important;
        font-weight: 600 !important;
    }

    .highlight .o, .codehilite .o, /* operators */
    .highlight .ow, .codehilite .ow /* operator.word */
    {
        color: var(--code-operator) !important;
    }

    /* Inline code - light theme */
    p code, li code, div code, span code,
    h1 code, h2 code, h3 code, h4 code, h5 code, h6 code {
        background: var(--inline-code-bg) !important;
        color: var(--inline-code-text) !important;
        border: 1px solid var(--inline-code-border) !important;
        padding: 2px 6px !important;
        border-radius: 4px !important;
        font-size: 0.9em !important;
        font-weight: 600 !important;
        white-space: nowrap !important;
        box-shadow: none !important;
        display: inline !important;
    }

    /* Code blocks inside paragraphs should not be treated as inline */
    p pre, li pre, div pre {
        background: var(--code-bg) !important;
        color: var(--code-text) !important;
        border: 1px solid var(--code-border) !important;
        border-radius: var(--code-border-radius) !important;
        padding: var(--code-padding) !important;
        margin: 8px 0 !important;
        white-space: pre !important;
        overflow-x: auto !important;
        display: block !important;
    }

    /* Scrollbar styling for code blocks - light theme */
    .highlight::-webkit-scrollbar, .codehilite::-webkit-scrollbar,
    pre::-webkit-scrollbar {
        height: 8px !important;
        background: #f1f3f4 !important;
        border-radius: 4px !important;
    }

    .highlight::-webkit-scrollbar-thumb, .codehilite::-webkit-scrollbar-thumb,
    pre::-webkit-scrollbar-thumb {
        background: #c1c8cd !important;
        border-radius: 4px !important;
    }

    .highlight::-webkit-scrollbar-thumb:hover, .codehilite::-webkit-scrollbar-thumb:hover,
    pre::-webkit-scrollbar-thumb:hover {
        background: #a8b3ba !important;
    }
    """

    if use_accordion:
        # Extra rules to suppress the native disclosure marker / hover chrome
        # on the <details> accordions created above.
        css_styles += """
        /* Accordion styling */
        details > summary {
            list-style: none !important;
            cursor: pointer !important;
        }
        details > summary:hover {
            background-color: transparent !important;
            box-shadow: none !important;
            transform: none !important;
        }
        details > summary::-webkit-details-marker,
        details > summary::marker {
            display: none !important;
        }
        """

    css_styles += "</style>"
    # Prepend the stylesheet so the fragment is self-contained.
    html_out = css_styles + html_out

    return html_out
lmmvibes/vis_gradio/data_loader.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data loading functionality for the LMM-Vibes Gradio app.
3
+
4
+ This module handles loading pipeline results and converting them to formats
5
+ suitable for the Gradio interface.
6
+ """
7
+
8
+ import json
9
+ import pandas as pd
10
+ from pathlib import Path
11
+ from typing import Dict, List, Any, Tuple, Optional
12
+ import os
13
+
14
+ from .state import app_state
15
+ from lmmvibes.metrics.plotting import create_model_cluster_dataframe
16
+
17
+
18
class DataCache:
    """Tiny in-process, class-level cache so repeated loads are free.

    Values are shared across the whole process; there is no eviction policy,
    only an explicit :meth:`clear`.
    """
    _cache = {}  # shared key -> value store

    @classmethod
    def get(cls, key: str):
        """Return the cached value for *key*, or ``None`` when absent."""
        try:
            return cls._cache[key]
        except KeyError:
            return None

    @classmethod
    def set(cls, key: str, value: Any):
        """Store *value* under *key*, overwriting any previous entry."""
        cls._cache[key] = value

    @classmethod
    def clear(cls):
        """Drop every cached entry."""
        cls._cache.clear()
33
+
34
+
35
def scan_for_result_subfolders(base_dir: str) -> List[str]:
    """Names of immediate subfolders of *base_dir* that hold pipeline results.

    A subfolder qualifies only when it contains all four FunctionalMetrics
    output files.  A missing *base_dir* yields an empty list.
    """
    root = Path(base_dir)
    if not root.exists():
        return []

    # Every valid results folder must contain all of these files.
    expected = (
        "model_cluster_scores.json",
        "cluster_scores.json",
        "model_scores.json",
        "clustered_results_lightweight.jsonl",
    )

    return [
        entry.name
        for entry in root.iterdir()
        if entry.is_dir() and all((entry / name).exists() for name in expected)
    ]
56
+
57
+
58
def validate_results_directory(results_dir: str) -> Tuple[bool, str]:
    """Check that *results_dir* looks like a pipeline output folder.

    Returns ``(ok, error_message)``; the message is empty when validation
    passes, otherwise it names the problem (missing path, not a directory,
    or the list of missing required files).
    """
    root = Path(results_dir)

    if not root.exists():
        return False, f"Directory does not exist: {results_dir}"
    if not root.is_dir():
        return False, f"Path is not a directory: {results_dir}"

    # FunctionalMetrics score files, then the clustered conversations dump
    # (order preserved so the error message lists them consistently).
    expected = [
        "model_cluster_scores.json",
        "cluster_scores.json",
        "model_scores.json",
        "clustered_results_lightweight.jsonl",
    ]
    missing = [name for name in expected if not (root / name).exists()]

    if missing:
        return False, f"Missing required files: {', '.join(missing)}"
    return True, ""
88
+
89
+
90
def get_available_models(metrics: Dict[str, Any]) -> List[str]:
    """Model names present in *metrics* (keys of ``model_cluster_scores``)."""
    return list(metrics.get("model_cluster_scores", {}))
94
+
95
+
96
def get_all_models(metrics: Dict[str, Any]) -> List[str]:
    """Get all available models from metrics data.

    Thin alias for :func:`get_available_models`, kept so callers have a
    uniformly named entry point.
    """
    return get_available_models(metrics)
99
+
100
+
101
def load_pipeline_results(results_dir: str) -> Tuple[pd.DataFrame, Dict[str, Any], pd.DataFrame, Path]:
    """Load pipeline outputs (FunctionalMetrics format only).

    Results are memoised in :class:`DataCache` keyed by *results_dir*.

    Returns:
        clustered_df: DataFrame of per-conversation data loaded from clustered_results.jsonl
        metrics: Dict containing the three FunctionalMetrics score dictionaries
        model_cluster_df: DataFrame created from model_cluster_scores for plotting/analysis
        results_path: Path to the results directory

    Raises:
        FileNotFoundError: if the directory or any required file is missing.
        ValueError: if the clustered results JSONL cannot be parsed.
    """
    cache_key = f"pipeline_results_{results_dir}"
    cached = DataCache.get(cache_key)
    # Truthiness check is safe here: a cached value is always a non-empty
    # 4-tuple, never an empty container.
    if cached:
        return cached

    results_path = Path(results_dir)
    if not results_path.exists():
        raise FileNotFoundError(f"Results directory does not exist: {results_dir}")

    # ------------------------------------------------------------------
    # 1. Load FunctionalMetrics score files (must ALL be present)
    # ------------------------------------------------------------------
    required_files = [
        "model_cluster_scores.json",
        "cluster_scores.json",
        "model_scores.json",
    ]
    missing = [f for f in required_files if not (results_path / f).exists()]
    if missing:
        raise FileNotFoundError(
            f"Missing required metrics files in {results_dir}: {', '.join(missing)}"
        )

    with open(results_path / "model_cluster_scores.json") as f:
        model_cluster_scores = json.load(f)
    with open(results_path / "cluster_scores.json") as f:
        cluster_scores = json.load(f)
    with open(results_path / "model_scores.json") as f:
        model_scores = json.load(f)

    metrics = {
        "model_cluster_scores": model_cluster_scores,
        "cluster_scores": cluster_scores,
        "model_scores": model_scores,
    }

    # ------------------------------------------------------------------
    # 2. Load clustered conversation data (JSON-Lines)
    # ------------------------------------------------------------------
    clustered_path = results_path / "clustered_results_lightweight.jsonl"
    if not clustered_path.exists():
        raise FileNotFoundError(f"clustered_results_lightweight.jsonl not found in {results_dir}")

    try:
        clustered_df = pd.read_json(clustered_path, lines=True)
    except Exception as e:
        raise ValueError(f"Could not load clustered results: {e}")

    # ------------------------------------------------------------------
    # 3. Create model_cluster_df from metrics for plotting/analysis
    # ------------------------------------------------------------------
    model_cluster_df = create_model_cluster_dataframe(model_cluster_scores)

    result = (clustered_df, metrics, model_cluster_df, results_path)
    DataCache.set(cache_key, result)
    return result
165
+
166
+
167
def load_property_examples(results_path: Path, property_ids: List[str]) -> pd.DataFrame:
    """Fetch clustered-result rows whose ``id`` appears in *property_ids*.

    Results are memoised per (path, sorted ids) pair.  Raises
    FileNotFoundError when the clustered results file is absent and
    ValueError when it cannot be read or filtered.
    """
    if not property_ids:
        return pd.DataFrame()

    # Sorted tuple so the cache key is order-insensitive.
    cache_key = f"examples_{results_path}_{hash(tuple(sorted(property_ids)))}"
    cached = DataCache.get(cache_key)
    if cached is not None:
        return cached

    # Load full dataset to get prompt/response details
    data_file = results_path / "clustered_results_lightweight.jsonl"
    if not data_file.exists():
        raise FileNotFoundError("Could not load example data - clustered_results_lightweight.jsonl not found")

    try:
        all_rows = pd.read_json(data_file, lines=True)
        matches = all_rows[all_rows['id'].isin(property_ids)]
        DataCache.set(cache_key, matches)
        return matches
    except Exception as e:
        raise ValueError(f"Failed to load examples: {e}")
lmmvibes/vis_gradio/debug_tab.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Logic for the **Debug Data** tab."""
2
+ from __future__ import annotations
3
+
4
+ from .state import app_state
5
+
6
+ __all__ = ["debug_data_structure"]
7
+
8
+
9
def debug_data_structure() -> str:
    """Render an HTML debug report about the loaded clustered DataFrame.

    Shows row/column counts, whether fine/coarse cluster columns exist
    (under either naming convention), the full column list with dtypes, and
    the first few rows as an HTML table.  Returns an error snippet when no
    data has been loaded yet.
    """
    if app_state["clustered_df"] is None:
        return "<p style='color: #e74c3c;'>❌ No data loaded</p>"

    df = app_state["clustered_df"]

    n_rows = len(df)
    n_cols = len(df.columns)

    # Check for both naming patterns
    has_fine_clusters = ("property_description_fine_cluster_id" in df.columns or
                         "fine_cluster_id" in df.columns)
    has_coarse_clusters = ("property_description_coarse_cluster_id" in df.columns or
                           "coarse_cluster_id" in df.columns)

    # Cap the preview at three rows (fewer if the frame is smaller).
    sample_rows = min(3, len(df))
    sample_data = df.head(sample_rows).to_html(
        escape=False,
        classes="table table-striped",
        table_id="debug-table",
    )

    html = f"""
    <div style="max-width: 1200px; margin: 0 auto;">
        <h3>🐛 Data Structure Debug Info</h3>

        <div style="background: #f8f9fa; padding: 15px; border-radius: 8px; margin: 15px 0;">
            <h4>Basic Statistics</h4>
            <ul>
                <li><strong>Rows:</strong> {n_rows:,}</li>
                <li><strong>Columns:</strong> {n_cols}</li>
                <li><strong>Fine Clusters Available:</strong> {'✅ Yes' if has_fine_clusters else '❌ No'}</li>
                <li><strong>Coarse Clusters Available:</strong> {'✅ Yes' if has_coarse_clusters else '❌ No'}</li>
            </ul>
        </div>

        <div style="background: #f8f9fa; padding: 15px; border-radius: 8px; margin: 15px 0;">
            <h4>Available Columns</h4>
            <div style="max-height: 200px; overflow-y: auto; background: white; padding: 10px; border-radius: 4px;">
                <ul>
    """

    # One <li> per column; unique counts only make sense for object dtype.
    for col in sorted(df.columns):
        unique_values = df[col].nunique() if df[col].dtype == "object" else "N/A"
        html += f"<li><code>{col}</code> - {df[col].dtype} (unique values: {unique_values})</li>"

    # Close the column list, append the sample table and the table CSS
    # (braces doubled because this is an f-string).
    html += f"""
                </ul>
            </div>
        </div>

        <div style="background: #f8f9fa; padding: 15px; border-radius: 8px; margin: 15px 0;">
            <h4>Sample Data (First {sample_rows} rows)</h4>
            <div style="max-height: 400px; overflow: auto; background: white; padding: 10px; border-radius: 4px;">
                {sample_data}
            </div>
        </div>
    </div>

    <style>
    #debug-table {{
        font-size: 12px;
        width: 100%;
    }}
    #debug-table th, #debug-table td {{
        padding: 4px 8px;
        border: 1px solid #ddd;
    }}
    #debug-table th {{
        background: #f1f1f1;
    }}
    </style>
    """

    return html
lmmvibes/vis_gradio/demo.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Demo script showing different ways to use the LMM-Vibes Gradio visualization.
3
+
4
+ This demonstrates the Python API for launching the Gradio app.
5
+ """
6
+
7
+ import argparse
8
+ from pathlib import Path
9
+ from lmmvibes.vis_gradio import launch_app, create_app
10
+
11
+
12
def demo_basic_launch():
    """Demo: start the UI empty; users load results through the Load Data tab."""
    print("🚀 Demo: Basic launch - data can be loaded through the UI")
    launch_app()
16
+
17
+
18
def demo_preload_data(results_dir: str):
    """Demo: start the app with *results_dir* already loaded."""
    banner = f"🚀 Demo: Launch with pre-loaded data from {results_dir}"
    print(banner)
    launch_app(results_dir=results_dir)
22
+
23
+
24
def demo_custom_settings(results_dir: str = None):
    """Demo: launch with a public link, LAN binding, and a custom port."""
    print("🚀 Demo: Launch with custom settings")
    options = {
        "results_dir": results_dir,
        "share": True,             # create public shareable link
        "server_name": "0.0.0.0",  # allow access from other machines
        "server_port": 8080,       # custom port
    }
    launch_app(**options)
33
+
34
+
35
def demo_programmatic_access():
    """Demo: build the app object first, then launch it explicitly.

    Useful when the caller wants to tweak the app (e.g. its title) before
    serving it.
    """
    print("🚀 Demo: Programmatic app creation")

    # Build without launching; customisations would go here, e.g.:
    # demo_app.title = "My Custom Title"
    demo_app = create_app()

    print("Launching app...")
    demo_app.launch(share=False, server_port=7861)
48
+
49
+
50
def main():
    """CLI entry point: parse arguments and dispatch to one demo flavour."""
    parser = argparse.ArgumentParser(description="LMM-Vibes Gradio Visualization Demo")
    parser.add_argument("--results_dir", help="Path to results directory for demos")
    parser.add_argument(
        "--demo",
        choices=["basic", "preload", "custom", "programmatic"],
        default="basic",
        help="Which demo to run",
    )
    args = parser.parse_args()

    if args.demo == "basic":
        demo_basic_launch()
    elif args.demo == "preload":
        # Preloading is the only demo that hard-requires a results directory.
        if not args.results_dir:
            print("❌ Error: --results_dir required for preload demo")
            return
        demo_preload_data(args.results_dir)
    elif args.demo == "custom":
        demo_custom_settings(args.results_dir)
    else:  # "programmatic" (argparse restricts choices)
        demo_programmatic_access()
70
+
71
+
72
+ if __name__ == "__main__":
73
+ main()
lmmvibes/vis_gradio/examples_tab.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Logic for the **View Examples** tab – dropdown population + example renderer."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Any, List, Tuple
5
+
6
+ import gradio as gr
7
+
8
+ from .state import app_state
9
+ from .utils import (
10
+ get_unique_values_for_dropdowns,
11
+ get_example_data,
12
+ format_examples_display,
13
+ search_clusters_only,
14
+ )
15
+
16
+ __all__: List[str] = [
17
+ "get_dropdown_choices",
18
+ "update_example_dropdowns",
19
+ "view_examples",
20
+ "get_filter_options",
21
+ "update_filter_dropdowns",
22
+ ]
23
+
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Dropdown helpers
27
+ # ---------------------------------------------------------------------------
28
+
29
def get_dropdown_choices() -> Tuple[List[str], List[str], List[str]]:
    """Return (prompt, model, cluster) choices for the example dropdowns.

    Each list is prefixed with its "All …" catch-all entry; all three are
    empty when no data has been loaded yet.
    """
    df = app_state["clustered_df"]
    if df is None:
        return [], [], []

    unique = get_unique_values_for_dropdowns(df)
    return (
        ["All Prompts"] + unique["prompts"],
        ["All Models"] + unique["models"],
        ["All Clusters"] + unique["properties"],
    )
38
+
39
+
40
def update_example_dropdowns() -> Tuple[Any, Any, Any]:
    """Build ``gr.update`` payloads refreshing the three example dropdowns."""
    prompts, models, clusters = get_dropdown_choices()

    def _refresh(choices: List[str], default: str) -> Any:
        # Pre-select the catch-all entry whenever there are any choices.
        return gr.update(choices=choices, value=default if choices else None)

    return (
        _refresh(prompts, "All Prompts"),
        _refresh(models, "All Models"),
        _refresh(clusters, "All Clusters"),
    )
47
+
48
+
49
+ # ---------------------------------------------------------------------------
50
+ # Example viewer
51
+ # ---------------------------------------------------------------------------
52
+
53
def view_examples(
    selected_prompt: str,
    selected_model: str,
    selected_property: str,
    max_examples: int = 5,
    use_accordion: bool = True,
    pretty_print_dicts: bool = True,
    search_term: str = "",
    show_unexpected_behavior: bool = False,
) -> str:
    """Render the HTML for the View Examples tab.

    Args:
        selected_prompt / selected_model / selected_property: Dropdown
            selections; the "All …" sentinel values mean "no filter".
        max_examples: Maximum number of examples passed to the fetcher.
        use_accordion: Forwarded to the display formatter.
        pretty_print_dicts: Forwarded to the display formatter.
        search_term: Optional free-text cluster search applied before
            the dropdown filters.
        show_unexpected_behavior: Forwarded to the example fetcher.

    Returns:
        An HTML string (error banners are also returned as HTML).
    """
    if app_state["clustered_df"] is None:
        return (
            "<p style='color: #e74c3c; padding: 20px;'>❌ Please load data first "
            "using the 'Load Data' tab</p>"
        )

    # Apply search filter first if search term is provided
    df = app_state["clustered_df"]
    if search_term and isinstance(search_term, str) and search_term.strip():
        df = search_clusters_only(df, search_term.strip(), 'fine')  # Default to fine clusters
        if df.empty:
            return f"<p style='color: #e74c3c; padding: 20px;'>❌ No clusters found matching '{search_term}'</p>"

    # "All …" sentinels are translated to None (= no filter) for the fetcher.
    examples = get_example_data(
        df,
        selected_prompt if selected_prompt != "All Prompts" else None,
        selected_model if selected_model != "All Models" else None,
        selected_property if selected_property != "All Clusters" else None,
        max_examples,
        show_unexpected_behavior=show_unexpected_behavior,
        # Randomize only when nothing at all is filtered, so filtered views
        # stay deterministic.
        randomize=(
            (selected_prompt == "All Prompts") and
            (selected_model == "All Models") and
            (selected_property == "All Clusters") and
            (not search_term or not str(search_term).strip())
        ),
    )

    return format_examples_display(
        examples,
        selected_prompt,
        selected_model,
        selected_property,
        use_accordion=use_accordion,
        pretty_print_dicts=pretty_print_dicts,
    )
99
+
100
+
101
+ # ---------------------------------------------------------------------------
102
+ # Filter dropdown helpers for frequency comparison
103
+ # ---------------------------------------------------------------------------
104
+
105
def get_filter_options() -> Tuple[List[str], List[str]]:
    """Return (model choices, quality-metric choices) for the filter dropdowns.

    Falls back to just the "All …" sentinels when no stats are loaded.
    """
    stats = app_state["model_stats"]
    if not stats:
        return ["All Models"], ["All Metrics"]

    metric_names = set()
    for model_data in stats.values():
        # Walk both cluster levels; entries may carry a per-metric
        # quality_score dict whose keys are the metric names.
        for cluster in model_data.get("fine", []) + model_data.get("coarse", []):
            score = cluster.get("quality_score", {})
            if isinstance(score, dict):
                metric_names |= set(score.keys())

    return (
        ["All Models"] + list(stats.keys()),
        ["All Metrics"] + sorted(metric_names),
    )
122
+
123
+
124
def update_filter_dropdowns() -> Tuple[Any, Any]:
    """Build ``gr.update`` payloads refreshing the model/metric filters."""
    model_choices, metric_choices = get_filter_options()
    model_default = "All Models" if model_choices else None
    metric_default = "All Metrics" if metric_choices else None
    return (
        gr.update(choices=model_choices, value=model_default),
        gr.update(choices=metric_choices, value=metric_default),
    )
lmmvibes/vis_gradio/frequency_tab.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Logic for the **Frequency Comparison** tab."""
2
+ from typing import List, Tuple, Dict, Any
3
+
4
+ import pandas as pd
5
+
6
+ from .state import app_state
7
+
8
+
9
+ # ---------------------------------------------------------------------------
10
+ # NOTE: app_state currently stores metrics under the legacy key 'model_stats'.
11
+ # During later cleanup this module will switch to 'metrics'. For now we treat
12
+ # the value as already being the new FunctionalMetrics dict.
13
+ # ---------------------------------------------------------------------------
14
+
15
+ __all__ = ["create_frequency_comparison", "create_frequency_plots"]
16
+
17
+
18
def create_frequency_comparison(
    selected_models: List[str],
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str]:
    """Create frequency comparison tables for the 3 functional metrics tables.

    Returns:
        (model_cluster_df, cluster_df, model_df, info_text) where the info
        text is a Markdown row-count summary.  On missing data or an empty
        selection, all three frames carry a single "Message" column.
    """
    if not app_state["model_stats"]:
        empty_df = pd.DataFrame({"Message": ["Please load data first"]})
        return empty_df, empty_df, empty_df, ""

    if not selected_models:
        empty_df = pd.DataFrame({"Message": ["Please select at least one model"]})
        return empty_df, empty_df, empty_df, ""

    # Get the functional metrics data
    # NOTE(review): 'model_stats' is a legacy alias; it already holds the
    # FunctionalMetrics dict (see module-level note above).
    metrics_data = app_state["model_stats"]

    # Debug: Print data structure info
    print(f"DEBUG: Creating frequency comparison tables")
    print(f" - Selected models: {selected_models}")
    print(f" - Available keys in metrics_data: {list(metrics_data.keys())}")

    if "model_cluster_scores" in metrics_data:
        model_cluster_scores = metrics_data["model_cluster_scores"]
        print(f" - Model cluster scores keys: {list(model_cluster_scores.keys())}")
        for model in selected_models:
            if model in model_cluster_scores:
                clusters = model_cluster_scores[model]
                print(f" - {model}: {len(clusters)} clusters")
            else:
                print(f" - {model}: NOT FOUND in model_cluster_scores")

    if "cluster_scores" in metrics_data:
        cluster_scores = metrics_data["cluster_scores"]
        print(f" - Cluster scores: {len(cluster_scores)} clusters")

    if "model_scores" in metrics_data:
        model_scores = metrics_data["model_scores"]
        print(f" - Model scores: {list(model_scores.keys())}")

    # Create the three tables
    model_cluster_df = create_model_cluster_table(metrics_data, selected_models)
    cluster_df = create_cluster_table(metrics_data, selected_models)
    model_df = create_model_table(metrics_data, selected_models)

    print(f" - Created tables with rows: Model-Cluster={len(model_cluster_df)}, Cluster={len(cluster_df)}, Model={len(model_df)}")

    info_text = f"**Model-Cluster Scores:** {len(model_cluster_df)} rows | **Cluster Scores:** {len(cluster_df)} rows | **Model Scores:** {len(model_df)} rows"
    return model_cluster_df, cluster_df, model_df, info_text
65
+
66
+
67
def create_model_cluster_table(metrics_data: Dict[str, Any], selected_models: List[str]) -> pd.DataFrame:
    """Create table for model-cluster scores.

    One row per (model, cluster) pair for models in *selected_models*,
    with size, proportion (+delta) percentages, per-metric quality
    columns, confidence intervals and significance flags.
    """
    model_cluster_scores = metrics_data.get("model_cluster_scores", {})

    print(f"DEBUG: Creating model-cluster table")
    print(f" - Available models in model_cluster_scores: {list(model_cluster_scores.keys())}")
    print(f" - Selected models: {selected_models}")

    rows = []
    for model_name, clusters in model_cluster_scores.items():
        if model_name not in selected_models:
            print(f" - Skipping {model_name} (not in selected_models)")
            continue

        print(f" - Processing {model_name} with {len(clusters)} clusters")
        for cluster_name, metrics in clusters.items():
            # Filter out "No properties" clusters
            if cluster_name == "No properties":
                continue

            # Basic metrics
            size = metrics.get("size", 0)
            proportion = metrics.get("proportion", 0) * 100  # Convert to percentage
            proportion_delta = metrics.get("proportion_delta", 0) * 100  # Convert to percentage

            # Quality metrics - show each metric separately
            quality = metrics.get("quality", {})
            quality_delta = metrics.get("quality_delta", {})

            # Create base row
            row = {
                "Model": model_name,
                "Cluster": cluster_name,
                "Size": size,
                "Proportion (%)": f"{proportion:.1f}",
                "Proportion Delta (%)": f"{proportion_delta:.1f}",
                # "Examples": len(metrics.get("examples", []))
            }

            # Add quality metrics for each individual metric
            for metric_name, quality_val in quality.items():
                row[f"Quality_{metric_name.title()}"] = f"{quality_val:.3f}"

            for metric_name, delta_val in quality_delta.items():
                row[f"Quality_Delta_{metric_name.title()}"] = f"{delta_val:+.3f}"

            # Confidence intervals
            proportion_ci = metrics.get("proportion_ci", {})
            proportion_delta_ci = metrics.get("proportion_delta_ci", {})

            # Significance flags
            proportion_delta_significant = metrics.get("proportion_delta_significant", False)
            quality_delta_significant = metrics.get("quality_delta_significant", {})

            # Format confidence intervals
            proportion_ci_str = format_ci(proportion_ci)
            proportion_delta_ci_str = format_ci(proportion_delta_ci)

            # Add confidence intervals and significance
            row.update({
                "Proportion CI": proportion_ci_str,
                "Proportion Delta CI": proportion_delta_ci_str,
                "Proportion Delta Significant": "Yes" if proportion_delta_significant else "No",
            })

            # Add quality delta significance for each metric
            for metric_name, is_significant in quality_delta_significant.items():
                row[f"Quality_Delta_{metric_name.title()}_Significant"] = "Yes" if is_significant else "No"

            rows.append(row)

    print(f" - Created {len(rows)} rows for model-cluster table")
    return pd.DataFrame(rows)
140
+
141
+
142
def create_cluster_table(metrics_data: Dict[str, Any], selected_models: List[str]) -> pd.DataFrame:
    """Create table for cluster scores (aggregated across all models).

    NOTE(review): *selected_models* is accepted for signature parity with
    the sibling table builders but is not used here — cluster scores are
    already aggregated across models.
    """
    cluster_scores = metrics_data.get("cluster_scores", {})

    print(f"DEBUG: Creating cluster table")
    print(f" - Available clusters: {list(cluster_scores.keys())}")
    print(f" - Number of clusters: {len(cluster_scores)}")

    rows = []
    for cluster_name, metrics in cluster_scores.items():
        # Filter out "No properties" clusters
        if cluster_name == "No properties":
            continue

        # Basic metrics
        size = metrics.get("size", 0)
        proportion = metrics.get("proportion", 0) * 100  # Convert to percentage

        # Quality metrics - show each metric separately
        quality = metrics.get("quality", {})
        quality_delta = metrics.get("quality_delta", {})

        # Create base row
        row = {
            "Cluster": cluster_name,
            "Size": size,
            "Proportion (%)": f"{proportion:.1f}",
            # "Examples": len(metrics.get("examples", []))
        }

        # Add quality metrics for each individual metric
        for metric_name, quality_val in quality.items():
            row[f"Quality_{metric_name.title()}"] = f"{quality_val:.3f}"

        for metric_name, delta_val in quality_delta.items():
            row[f"Quality_Delta_{metric_name.title()}"] = f"{delta_val:+.3f}"

        # Confidence intervals
        proportion_ci = metrics.get("proportion_ci", {})
        quality_ci = metrics.get("quality_ci", {})
        quality_delta_ci = metrics.get("quality_delta_ci", {})

        # Significance flags
        quality_delta_significant = metrics.get("quality_delta_significant", {})

        # Format confidence intervals
        # NOTE(review): quality_ci_str / quality_delta_ci_str are computed
        # but never used — per-metric CIs are formatted individually below.
        proportion_ci_str = format_ci(proportion_ci)
        quality_ci_str = format_ci(quality_ci)
        quality_delta_ci_str = format_ci(quality_delta_ci)

        # Add confidence intervals and significance
        row.update({
            "Proportion CI": proportion_ci_str,
        })

        # Add quality CI and significance for each metric
        for metric_name in quality.keys():
            if metric_name in quality_ci:
                ci = quality_ci[metric_name]
                row[f"Quality_{metric_name.title()}_CI"] = format_ci(ci)

        for metric_name in quality_delta.keys():
            if metric_name in quality_delta_ci:
                ci = quality_delta_ci[metric_name]
                row[f"Quality_Delta_{metric_name.title()}_CI"] = format_ci(ci)
                row[f"Quality_Delta_{metric_name.title()}_Significant"] = "Yes" if quality_delta_significant.get(metric_name, False) else "No"

        rows.append(row)

    print(f" - Created {len(rows)} rows for cluster table")
    return pd.DataFrame(rows)
213
+
214
+
215
def create_model_table(metrics_data: Dict[str, Any], selected_models: List[str]) -> pd.DataFrame:
    """Create table for model scores (aggregated across all clusters).

    Args:
        metrics_data: FunctionalMetrics dict; only the ``model_scores``
            section is read.
        selected_models: Models to include; all others are skipped.

    Returns:
        A DataFrame with one row per selected model: size, one
        ``Quality_<Metric>`` column per quality metric, the proportion CI
        and one ``Quality_<Metric>_CI`` column per metric with a CI.
    """
    model_scores = metrics_data.get("model_scores", {})

    print(f"DEBUG: Creating model table")
    print(f" - Available models in model_scores: {list(model_scores.keys())}")
    print(f" - Selected models: {selected_models}")

    rows = []
    for model_name, metrics in model_scores.items():
        # Filter by selected models
        if model_name not in selected_models:
            print(f" - Skipping {model_name} (not in selected_models)")
            continue

        print(f" - Processing {model_name}")
        # Quality metrics - one column per individual metric
        quality = metrics.get("quality", {})

        row = {
            "Model": model_name,
            "Size": metrics.get("size", 0),
        }

        for metric_name, quality_val in quality.items():
            row[f"Quality_{metric_name.title()}"] = f"{quality_val:.3f}"

        # Confidence intervals (proportion first, then per-metric quality)
        row["Proportion CI"] = format_ci(metrics.get("proportion_ci", {}))

        quality_ci = metrics.get("quality_ci", {})
        for metric_name in quality.keys():
            if metric_name in quality_ci:
                row[f"Quality_{metric_name.title()}_CI"] = format_ci(quality_ci[metric_name])

        rows.append(row)

    print(f" - Created {len(rows)} rows for model table")
    return pd.DataFrame(rows)
286
+
287
+
288
def format_ci(ci_dict: Dict[str, Any]) -> str:
    """Render a confidence-interval dict as a short display string.

    Prefers ``[lower, upper]`` when both bounds are present, falls back
    to ``Mean: x`` when only a mean is available, otherwise ``N/A``.
    """
    if not isinstance(ci_dict, dict) or not ci_dict:
        return "N/A"

    lower, upper = ci_dict.get("lower"), ci_dict.get("upper")
    if lower is not None and upper is not None:
        return f"[{lower:.3f}, {upper:.3f}]"

    mean = ci_dict.get("mean")
    return f"Mean: {mean:.3f}" if mean is not None else "N/A"
303
+
304
+
305
def create_frequency_plots(*_args, **_kwargs):
    """Stub kept for backward compatibility – plotting was removed.

    Accepts and ignores any arguments; always yields an empty pair so
    existing callers expecting two figures keep working.
    """
    return (None, None)
lmmvibes/vis_gradio/launcher.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ CLI launcher for LMM-Vibes Gradio visualization app.
4
+
5
+ Usage:
6
+ python -m lmmvibes.vis_gradio.launcher --results_dir path/to/results
7
+
8
+ Or directly:
9
+ python lmmvibes/vis_gradio/launcher.py --results_dir path/to/results
10
+ """
11
+
12
+ import argparse
13
+ import sys
14
+ from pathlib import Path
15
+
16
def main():
    """Parse launcher CLI flags, validate inputs, and start the Gradio app."""
    parser = argparse.ArgumentParser(
        description="Launch LMM-Vibes Gradio visualization app",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Launch with auto-loaded data from a base results directory
python -m lmmvibes.vis_gradio.launcher --results_dir /path/to/results

# Launch with public sharing enabled
python -m lmmvibes.vis_gradio.launcher --results_dir /path/to/results --share

# Launch on specific port
python -m lmmvibes.vis_gradio.launcher --results_dir /path/to/results --port 8080

# Launch with automatic port selection
python -m lmmvibes.vis_gradio.launcher --results_dir /path/to/results --auto_port

# Launch without auto-loading (manual selection in app)
python -m lmmvibes.vis_gradio.launcher
"""
    )

    parser.add_argument(
        "--results_dir",
        type=str,
        help="Path to base results directory containing experiment subfolders (optional - can be loaded in the app)"
    )

    parser.add_argument(
        "--share",
        action="store_true",
        help="Create a public shareable link"
    )

    parser.add_argument(
        "--server_name",
        type=str,
        default="127.0.0.1",
        help="Server address (default: 127.0.0.1)"
    )

    parser.add_argument(
        "--port",
        type=int,
        default=7860,
        help="Server port (default: 7860). Use --auto_port to automatically find an available port."
    )

    parser.add_argument(
        "--auto_port",
        action="store_true",
        help="Automatically find an available port by trying ports 8080-8089"
    )

    parser.add_argument(
        "--debug",
        action="store_true",
        help="Enable debug mode"
    )

    args = parser.parse_args()

    # Handle auto_port option
    # NOTE(review): only the starting port is set here; the actual
    # 8080-8089 scan is presumably performed inside launch_app — confirm.
    if args.auto_port:
        # Use a high port range for auto-port mode
        args.port = 8080
        print("πŸ” Auto-port mode enabled - will try ports 8080-8089")

    # Validate results directory if provided
    if args.results_dir:
        results_path = Path(args.results_dir)
        if not results_path.exists():
            print(f"❌ Error: Results directory does not exist: {args.results_dir}")
            sys.exit(1)
        if not results_path.is_dir():
            print(f"❌ Error: Path is not a directory: {args.results_dir}")
            sys.exit(1)

    # Import and launch the app
    # The import is kept inside the try so a missing gradio dependency
    # produces the friendly hint below instead of a raw traceback.
    try:
        from .app import launch_app

        print("πŸš€ Launching LMM-Vibes Gradio Visualization App...")
        print(f"🌐 Server: http://{args.server_name}:{args.port}")
        if args.share:
            print("πŸ”— Public sharing enabled")

        launch_app(
            results_dir=args.results_dir,
            share=args.share,
            server_name=args.server_name,
            server_port=args.port,
            debug=args.debug
        )

    except ImportError as e:
        print(f"❌ Error: Failed to import required modules: {e}")
        print("πŸ’‘ Make sure you have gradio installed: pip install gradio")
        sys.exit(1)
    except Exception as e:
        print(f"❌ Error launching app: {e}")
        sys.exit(1)
+
120
+
121
+ if __name__ == "__main__":
122
+ main()
lmmvibes/vis_gradio/load_data_tab.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utilities for the "Load Data" tab – loading pipeline results and scanning for
3
+ available experiment folders.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import os
8
+ from pathlib import Path
9
+ from typing import List, Tuple
10
+
11
+ import gradio as gr
12
+
13
+ # ---------------------------------------------------------------------------
14
+ # Loading utilities updated for FunctionalMetrics
15
+ # ---------------------------------------------------------------------------
16
+
17
+ from .state import app_state, BASE_RESULTS_DIR
18
+ from .data_loader import (
19
+ load_pipeline_results,
20
+ scan_for_result_subfolders,
21
+ validate_results_directory,
22
+ )
23
+
24
+ # Metrics helpers
25
+ from .metrics_adapter import get_all_models
26
+
27
+ __all__ = [
28
+ "load_data",
29
+ "get_available_experiments",
30
+ "get_experiment_choices",
31
+ "refresh_experiment_dropdown",
32
+ "load_experiment_data",
33
+ ]
34
+
35
+
36
def load_data(results_dir: str) -> Tuple[str, str, str]:
    """Load pipeline results from *results_dir* and update the shared *app_state*.

    Returns a tuple of (summary_markdown, models_info_markdown, models_checkbox_update).
    NOTE(review): the third element is actually a ``gr.update`` object on
    success/error paths, not a plain str — confirm annotation downstream.
    """
    try:
        # 1. Validate directory structure
        is_valid, error_msg = validate_results_directory(results_dir)
        if not is_valid:
            return "", f"❌ Error: {error_msg}", ""

        # 2. Handle optional sub-folder selection (first match for now)
        subfolders = scan_for_result_subfolders(results_dir)
        final_dir = results_dir
        if subfolders and "." not in subfolders:
            final_dir = str(Path(results_dir) / subfolders[0])

        # 3. Load results into memory
        clustered_df, metrics, model_cluster_df, results_path = load_pipeline_results(final_dir)

        # 4. Stash in global state so other tabs can use it
        app_state["clustered_df"] = clustered_df
        app_state["metrics"] = metrics
        app_state["model_cluster_df"] = model_cluster_df
        # Temporary alias for legacy modules
        app_state["model_stats"] = metrics
        app_state["results_path"] = results_path
        app_state["available_models"] = get_all_models(metrics)
        app_state["current_results_dir"] = final_dir

        # 5. Compose status messages
        n_models = len(metrics.get("model_cluster_scores", {}))
        n_properties = len(clustered_df)

        summary = f"""
βœ… **Successfully loaded pipeline results!**

**Data Summary:**
- **Models:** {n_models}
- **Properties:** {n_properties:,}
- **Results Directory:** {Path(final_dir).name}
"""
        # Check for both naming patterns for fine clusters
        if ("fine_cluster_id" in clustered_df.columns or
            "property_description_fine_cluster_id" in clustered_df.columns):
            fine_id_col = ("fine_cluster_id" if "fine_cluster_id" in clustered_df.columns
                           else "property_description_fine_cluster_id")
            n_fine_clusters = clustered_df[fine_id_col].nunique()
            summary += f"\n- **Fine Clusters:** {n_fine_clusters}"

        # Check for both naming patterns for coarse clusters
        if ("coarse_cluster_id" in clustered_df.columns or
            "property_description_coarse_cluster_id" in clustered_df.columns):
            coarse_id_col = ("coarse_cluster_id" if "coarse_cluster_id" in clustered_df.columns
                             else "property_description_coarse_cluster_id")
            n_coarse_clusters = clustered_df[coarse_id_col].nunique()
            summary += f"\n- **Coarse Clusters:** {n_coarse_clusters}"

        model_choices = app_state["available_models"]
        models_info = f"Available models: {', '.join(model_choices)}"

        # Gradio update object for the CheckboxGroup
        return summary, models_info, gr.update(choices=model_choices, value=model_choices)

    except Exception as e:
        error_msg = f"❌ Error loading results: {e}"
        return "", error_msg, gr.update(choices=[], value=[])
+
104
+
105
+ def get_available_experiments(base_dir: str) -> List[str]:
106
+ """Return experiment sub-directories that contain the expected result files."""
107
+ if not base_dir or not os.path.exists(base_dir):
108
+ return []
109
+
110
+ experiments: List[str] = []
111
+ try:
112
+ for item in os.listdir(base_dir):
113
+ item_path = os.path.join(base_dir, item)
114
+ if os.path.isdir(item_path):
115
+ if (
116
+ os.path.exists(os.path.join(item_path, "model_stats.json"))
117
+ or os.path.exists(os.path.join(item_path, "clustered_results_lightweight.jsonl"))
118
+ ):
119
+ experiments.append(item)
120
+ except Exception as e:
121
+ print(f"Error scanning experiments: {e}")
122
+
123
+ return sorted(experiments)
124
+
125
+
126
def get_experiment_choices() -> List[str]:
    """Dropdown choices: a placeholder entry followed by experiment names."""
    if not BASE_RESULTS_DIR:
        return []
    return ["Select an experiment..."] + get_available_experiments(BASE_RESULTS_DIR)
132
+
133
+
134
def refresh_experiment_dropdown():
    """Gradio helper to refresh the experiment dropdown choices.

    Returns:
        A ``gr.update`` payload (a dict) resetting the dropdown to the
        placeholder entry.  The previous ``-> gr.update`` annotation named
        a function rather than a type, so it was removed.
    """
    choices = get_experiment_choices()
    return gr.update(choices=choices, value="Select an experiment...")
138
+
139
+
140
def load_experiment_data(experiment_name: str):
    """Wrapper used by Gradio events to load a *selected* experiment.

    Returns the same (summary_markdown, info_markdown, checkbox_update)
    triple as ``load_data``.  The third element is a ``gr.update`` object,
    so the previous ``Tuple[str, str, str]`` annotation was incorrect and
    has been removed.
    """
    if not BASE_RESULTS_DIR or experiment_name == "Select an experiment...":
        return "", "Please select a valid experiment", gr.update(choices=[], value=[])

    experiment_path = os.path.join(BASE_RESULTS_DIR, experiment_name)
    print(f"πŸ” Loading experiment: {experiment_name} from {experiment_path}")
    return load_data(experiment_path)
lmmvibes/vis_gradio/metrics_adapter.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Lightweight access helpers for FunctionalMetrics score dictionaries.
2
+
3
+ The Gradio UI now receives the *raw* FunctionalMetrics output as a
4
+ ```
5
+ metrics = {
6
+ "model_cluster_scores": {...},
7
+ "cluster_scores": {...},
8
+ "model_scores": {...},
9
+ }
10
+ ```
11
+ This module centralises the most common look-ups so that the rest of the
12
+ codebase does *not* need to know the exact key names. If the format
13
+ changes again we only need to update these helpers.
14
+ """
15
+ from typing import Dict, Any, List
16
+
17
+ __all__ = [
18
+ "get_model_clusters",
19
+ "get_all_models",
20
+ "get_all_clusters",
21
+ ]
22
+
23
def get_model_clusters(metrics: Dict[str, Any], model_name: str) -> Dict[str, Any]:
    """Return the per-cluster score dict for *model_name*.

    The special name ``"all"`` selects the cross-model aggregate stored
    under ``cluster_scores``; any other name is looked up under
    ``model_cluster_scores``.  Missing keys yield an empty dict.
    """
    if model_name == "all":
        return metrics.get("cluster_scores", {})
    return metrics.get("model_cluster_scores", {}).get(model_name, {})
35
+
36
+
37
def get_all_models(metrics: Dict[str, Any]) -> List[str]:
    """Model names present in *metrics*, with the aggregate pseudo-model
    ``"all"`` prepended as the first choice."""
    return ["all", *metrics.get("model_cluster_scores", {})]
42
+
43
+
44
def get_all_clusters(metrics: Dict[str, Any]) -> List[str]:
    """Cluster names aggregated across all models (insertion order kept)."""
    return [*metrics.get("cluster_scores", {})]
lmmvibes/vis_gradio/overview_tab.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Logic helpers for the **Overview** tab."""
2
+ from typing import List
3
+
4
+ from .state import app_state
5
+ from .utils import compute_model_rankings_new, create_model_summary_card_new
6
+
7
+ __all__ = ["create_overview"]
8
+
9
+
10
def create_overview(
    selected_models: List[str],
    top_n: int,
    score_significant_only: bool = False,
    quality_significant_only: bool = False,
    sort_by: str = "quality_asc",
    min_cluster_size: int = 1,
) -> str:
    """Return the HTML snippet that summarises model performance.

    Args:
        selected_models: Models whose summary cards should be rendered.
        top_n: Number of top clusters forwarded to each model card.
        score_significant_only / quality_significant_only: Filters
            forwarded to the card builder.
        sort_by: Cluster ordering key forwarded to the card builder.
        min_cluster_size: Minimum cluster size forwarded to the card builder.
    """
    if not app_state["metrics"]:
        return "Please load data first using the 'Load Data' tab."

    if not selected_models:
        return "Please select at least one model to display."

    # 1. Compute global rankings and filter to selection
    model_rankings = compute_model_rankings_new(app_state["metrics"])
    filtered_rankings = [
        (name, stats) for name, stats in model_rankings if name in selected_models
    ]

    # Sort so "all" appears first, then the rest by their rankings
    all_models = [(name, stats) for name, stats in filtered_rankings if name == "all"]
    other_models = [(name, stats) for name, stats in filtered_rankings if name != "all"]
    filtered_rankings = all_models + other_models

    if not filtered_rankings:
        return "No data available for selected models."

    # 2. Assemble HTML (static header + explanatory accordion, then one
    #    card per model appended below).
    overview_html = """
<div style="max-width: 1600px; margin: 0 auto;">
<p style="color: #666; margin-bottom: 10px;">
Top distinctive clusters where each model shows unique behavioural patterns.
Frequency shows what percentage of a model's battles resulted in that behavioural pattern.
</p>

<details style="margin-bottom:25px;">
<summary style="cursor:pointer; color:#4c6ef5; font-weight:600;">ℹ️ What do "proportion delta", "Quality Ξ”", and significance tags mean?</summary>
<div style="margin-top:12px; font-size:14px; line-height:1.5; color:#333;">
<strong>Proportion Delta</strong><br>
For each cluster we compute how often <em>this model</em> appears in that cluster compared with the average across all models.<br>
β€’ A positive value (e.g. <code>+0.15</code>) means the model hits the behaviour more often than average.<br>
β€’ A negative value (e.g. <code>-0.08</code>) means it appears less often.<br>
It is derived from the&nbsp;<code>proportion_delta</code>&nbsp;field in <code>model_cluster_scores.json</code>.<br><br>
<strong>Quality Ξ”</strong><br>
The difference between the cluster's quality score(s) for this model and the model's <em>overall</em> quality baseline, shown for each individual metric (e.g., helpfulness, accuracy).<br>
Positive values (green) indicate the model performs better than its average in that behaviour; negative values (red) indicate worse.<br>
This is derived from the <code>quality_delta</code> metric dictionary in <code>model_cluster_scores.json</code>.<br><br>
<strong>Significance Tags (FREQ/QUAL)</strong><br>
The <span style="background: #28a745; color: white; padding: 2px 6px; border-radius: 4px; font-size: 10px; font-weight: bold;">FREQ</span> and <span style="background: #007bff; color: white; padding: 2px 6px; border-radius: 4px; font-size: 10px; font-weight: bold;">QUAL</span> tags indicate <em>statistical significance</em> based on confidence intervals:<br>
β€’ <strong>FREQ</strong> (green): The proportion delta is statistically significant (confidence interval doesn't include zero)<br>
β€’ <strong>QUAL</strong> (blue): At least one quality metric delta is statistically significant<br>
These tags help identify which behavioral patterns are reliably different from the model's baseline performance.
</div>
</details>
"""

    for model_name, _ in filtered_rankings:
        card_html = create_model_summary_card_new(
            model_name,
            app_state["metrics"],
            # top_n etc.
            top_n,
            score_significant_only=score_significant_only,
            quality_significant_only=quality_significant_only,
            sort_by=sort_by,
            min_cluster_size=min_cluster_size,
        )
        overview_html += card_html

    overview_html += "</div>"
    return overview_html
+ return overview_html
lmmvibes/vis_gradio/plots_tab.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Plots tab for the LMM-Vibes Gradio app.
3
+
4
+ This module provides functionality to display the model cluster proportion and quality plots.
5
+ """
6
+
7
+ import gradio as gr
8
+ import pandas as pd
9
+ import plotly.express as px
10
+ import plotly.graph_objects as go
11
+ from typing import Tuple, List
12
+
13
+ from .state import app_state
14
+
15
+
16
def create_proportion_plot(show_ci: bool = False) -> Tuple[go.Figure, str]:
    """Create a grouped bar plot of cluster proportion by property and model.

    Args:
        show_ci: If True, draw 95% confidence-interval error bars when the
            ``proportion_ci_lower``/``proportion_ci_upper`` columns exist.

    Returns:
        Tuple of (plotly figure or None, markdown text with the P1..Pn
        property-name mapping, or an error message when no data is loaded).
    """
    if app_state.get("model_cluster_df") is None:
        return None, "No model cluster data loaded. Please load data first."

    model_cluster_df = app_state["model_cluster_df"]

    if model_cluster_df.empty:
        return None, "No model cluster data available."

    # Work on a copy so the shared state DataFrame is never mutated.
    model_cluster_df = model_cluster_df.copy()
    model_cluster_df['proportion'] = pd.to_numeric(model_cluster_df['proportion'], errors='coerce')

    # Filter out "No properties" clusters *before* building the property
    # mapping (bug fix: the mapping and axis order previously included the
    # filtered-out cluster, leaving a phantom P-label in the mapping text).
    model_cluster_df = model_cluster_df[model_cluster_df['cluster'] != "No properties"]

    # Map long cluster names to stable short labels P1..Pn for the x-axis.
    unique_properties = sorted(model_cluster_df['cluster'].unique())
    property_mapping = {prop: f"P{i+1}" for i, prop in enumerate(unique_properties)}
    model_cluster_df['property_abbr'] = model_cluster_df['cluster'].map(property_mapping)

    # Prepare asymmetric error-bar columns when CIs are requested and present.
    use_error_bars = (
        show_ci
        and 'proportion_ci_lower' in model_cluster_df.columns
        and 'proportion_ci_upper' in model_cluster_df.columns
    )
    if use_error_bars:
        model_cluster_df['y_error'] = (
            model_cluster_df['proportion_ci_upper'] - model_cluster_df['proportion']
        ).fillna(0)
        model_cluster_df['y_error_minus'] = (
            model_cluster_df['proportion'] - model_cluster_df['proportion_ci_lower']
        ).fillna(0)

    # Grouped bar plot of 'proportion' by property (x) and model (hue).
    fig = px.bar(
        model_cluster_df,
        x="property_abbr",
        y="proportion",
        color="model",
        barmode="group",
        title="Proportion by Property and Model",
        labels={"proportion": "Proportion", "property_abbr": "Property", "model": "Model"},
        error_y="y_error" if use_error_bars else None,
        error_y_minus="y_error_minus" if use_error_bars else None
    )

    # Force categorical x-order P1, P2, ... (lexicographic would put P10 before P2).
    property_order = [f"P{i+1}" for i in range(len(unique_properties))]
    fig.update_xaxes(categoryorder='array', categoryarray=property_order)
    fig.update_layout(xaxis_tickangle=45)

    # Best-effort export; the working directory may be read-only (e.g. HF Spaces).
    try:
        fig.write_html("model_cluster_proportion_plot.html")
    except OSError:
        pass

    # Human-readable legend for the abbreviated property names.
    mapping_text = "**Property Mapping:**\n\n"
    for prop, abbr in property_mapping.items():
        mapping_text += f"**{abbr}:** {prop}\n\n"

    # Add confidence interval info if enabled.
    if show_ci:
        if use_error_bars:
            mapping_text += "---\n\n**Confidence Intervals:**\n"
            mapping_text += "Error bars show 95% confidence intervals for proportion values.\n"
        else:
            mapping_text += "---\n\n**Note:** Confidence interval data not available in the loaded dataset.\n"

    return fig, mapping_text
97
+
98
+
99
def create_quality_plot(quality_metric: str = "helpfulness", show_ci: bool = False) -> Tuple[go.Figure, str]:
    """Create a grouped bar plot of one quality metric by property and model.

    Args:
        quality_metric: Metric name; read from the ``quality_<name>`` column.
        show_ci: If True, draw 95% CI error bars when the matching
            ``_ci_lower``/``_ci_upper`` columns are present.

    Returns:
        Tuple of (plotly figure or None, markdown mapping/error text).
    """
    if app_state.get("model_cluster_df") is None:
        return None, "No model cluster data loaded. Please load data first."

    model_cluster_df = app_state["model_cluster_df"]

    if model_cluster_df.empty:
        return None, "No model cluster data available."

    # Check if the requested quality metric exists in the data.
    quality_col = f"quality_{quality_metric}"
    if quality_col not in model_cluster_df.columns:
        # Get available quality metrics for a better error message.
        available_metrics = [col.replace("quality_", "") for col in model_cluster_df.columns
                             if col.startswith("quality_")
                             and not col.endswith(("_ci_lower", "_ci_upper", "_ci_mean", "_significant", "_delta"))]
        if not available_metrics:
            return None, f"No quality metrics found in the data. Available columns: {list(model_cluster_df.columns)}"
        return None, f"Quality metric '{quality_metric}' not found. Available metrics: {available_metrics}"

    # Work on a copy so the shared state DataFrame is never mutated.
    plot_df = model_cluster_df.copy()
    plot_df[quality_col] = pd.to_numeric(plot_df[quality_col], errors='coerce')

    if plot_df[quality_col].isna().all():
        return None, f"No valid quality data found for metric '{quality_metric}'. All values are missing or invalid."

    # Filter out "No properties" clusters *before* building the property
    # mapping (bug fix: the mapping and axis order previously included the
    # filtered-out cluster, leaving a phantom P-label in the mapping text).
    plot_df = plot_df[plot_df['cluster'] != "No properties"]

    # Map long cluster names to stable short labels P1..Pn (same scheme as
    # the proportion plot so the two plots stay comparable).
    unique_properties = sorted(plot_df['cluster'].unique())
    property_mapping = {prop: f"P{i+1}" for i, prop in enumerate(unique_properties)}
    plot_df['property_abbr'] = plot_df['cluster'].map(property_mapping)

    # Prepare asymmetric error-bar columns when CIs are requested and present.
    ci_lower_col = f"{quality_col}_ci_lower"
    ci_upper_col = f"{quality_col}_ci_upper"
    use_error_bars = show_ci and ci_lower_col in plot_df.columns and ci_upper_col in plot_df.columns
    if use_error_bars:
        plot_df['y_error'] = (plot_df[ci_upper_col] - plot_df[quality_col]).fillna(0)
        plot_df['y_error_minus'] = (plot_df[quality_col] - plot_df[ci_lower_col]).fillna(0)

    # Grouped bar plot of quality by property (x) and model (hue).
    fig = px.bar(
        plot_df,
        x="property_abbr",
        y=quality_col,
        color="model",
        barmode="group",
        title=f"Quality ({quality_metric.title()}) by Property and Model",
        labels={quality_col: f"Quality ({quality_metric.title()})", "property_abbr": "Property", "model": "Model"},
        error_y="y_error" if use_error_bars else None,
        error_y_minus="y_error_minus" if use_error_bars else None
    )

    # Force categorical x-order P1, P2, ... (lexicographic would put P10 before P2).
    property_order = [f"P{i+1}" for i in range(len(unique_properties))]
    fig.update_xaxes(categoryorder='array', categoryarray=property_order)
    fig.update_layout(xaxis_tickangle=45)

    # Best-effort export; the working directory may be read-only (e.g. HF Spaces).
    try:
        fig.write_html(f"model_cluster_quality_{quality_metric}_plot.html")
    except OSError:
        pass

    # Human-readable legend for the abbreviated property names.
    mapping_text = "**Property Mapping:**\n\n"
    for prop, abbr in property_mapping.items():
        mapping_text += f"**{abbr}:** {prop}\n\n"

    # Add confidence interval info if enabled.
    if show_ci:
        if use_error_bars:
            mapping_text += "---\n\n**Confidence Intervals:**\n"
            mapping_text += f"Error bars show 95% confidence intervals for {quality_metric} values.\n"
        else:
            mapping_text += "---\n\n**Note:** Confidence interval data not available for this quality metric.\n"

    return fig, mapping_text
192
+
193
+
194
def get_available_quality_metrics() -> List[str]:
    """Return the quality-metric names present in the loaded model-cluster DataFrame.

    Metric names come from ``quality_<name>`` columns, skipping the derived
    CI / significance / delta companion columns. Falls back to a default set
    when no data is loaded or no metric columns are found.
    """
    fallback = ["helpfulness", "accuracy", "harmlessness", "honesty"]

    df = app_state.get("model_cluster_df")
    if df is None:
        return fallback

    derived_suffixes = ("_ci_lower", "_ci_upper", "_ci_mean", "_significant", "_delta")
    metrics = [
        col.replace("quality_", "")
        for col in df.columns
        if col.startswith("quality_") and not col.endswith(derived_suffixes)
    ]

    return metrics if metrics else fallback
212
+
213
+
214
def update_quality_metric_dropdown() -> gr.Dropdown:
    """Rebuild the quality-metric dropdown from the metrics currently available."""
    metrics = get_available_quality_metrics()
    default_metric = metrics[0] if metrics else "helpfulness"
    return gr.Dropdown(
        label="Quality Metric",
        choices=metrics,
        value=default_metric,
        info="Select which quality metric to display"
    )
223
+
224
+
225
def update_quality_metric_visibility(plot_type: str) -> gr.Dropdown:
    """Rebuild the quality-metric dropdown, visible only for quality plots."""
    metrics = get_available_quality_metrics()
    default_metric = metrics[0] if metrics else "helpfulness"
    return gr.Dropdown(
        label="Quality Metric",
        choices=metrics,
        value=default_metric,
        info="Select which quality metric to display",
        visible=(plot_type == "quality")
    )
235
+
236
+
237
def create_plot_with_toggle(plot_type: str, quality_metric: str = "helpfulness", show_ci: bool = False) -> Tuple[go.Figure, str]:
    """Dispatch to the frequency or quality plot builder based on *plot_type*.

    Returns (figure, info-markdown); unknown plot types yield (None, message).
    """
    if plot_type == "frequency":
        return create_proportion_plot(show_ci)
    if plot_type == "quality":
        return create_quality_plot(quality_metric, show_ci)
    return None, f"Unknown plot type: {plot_type}"
245
+
246
+
247
def create_plots_tab() -> Tuple[gr.Plot, gr.Markdown, gr.Checkbox, gr.Dropdown, gr.Dropdown]:
    """Create the plots tab interface with a toggle between frequency and quality plots.

    Returns:
        The (plot display, info markdown, CI checkbox, plot-type dropdown,
        quality-metric dropdown) components so the caller can wire events.
    """
    # Fixed typo ("signifigance" -> "significance") in the user-facing hint.
    gr.Markdown("Interactive grouped bar plot showing either frequency (proportion) or quality metrics by property and model. **If the plot looks wonky, just unclick and re-click the significance checkbox to have it resize**")

    # Compute the metric list once instead of three separate calls.
    available_metrics = get_available_quality_metrics()
    default_metric = available_metrics[0] if available_metrics else "helpfulness"

    # Plot controls in a row
    with gr.Row():
        # Plot type toggle
        plot_type_dropdown = gr.Dropdown(
            label="Plot Type",
            choices=["frequency", "quality"],
            value="frequency",
            info="Choose between frequency (proportion) or quality metrics"
        )

        # Quality metric dropdown (only shown when "quality" is selected)
        quality_metric_dropdown = gr.Dropdown(
            label="Quality Metric",
            choices=available_metrics,
            value=default_metric,
            info="Select which quality metric to display",
            visible=False  # Initially hidden, shown when quality is selected
        )

        # Checkbox toggling 95% CI error bars
        show_ci_checkbox = gr.Checkbox(
            label="Show Confidence Intervals",
            value=True,
            info="Display 95% confidence intervals as error bars (if available in data)"
        )

    plot_display = gr.Plot(
        label="Model-Cluster Analysis Plot",
        value=None
    )

    plot_info = gr.Markdown("")

    return plot_display, plot_info, show_ci_checkbox, plot_type_dropdown, quality_metric_dropdown
lmmvibes/vis_gradio/side_by_side_display.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Side-by-side display component for comparing model responses.
3
+
4
+ This module provides functionality to display two model responses side by side
5
+ for comparison, specifically designed for datasets with model_a_response and
6
+ model_b_response fields.
7
+ """
8
+
9
+ from typing import Dict, Any, Optional
10
+ from .conversation_display import convert_to_openai_format, display_openai_conversation_html
11
+ import html
12
+
13
def display_side_by_side_responses(
    model_a: str,
    model_b: str,
    model_a_response: Any,
    model_b_response: Any,
    use_accordion: bool = True,
    pretty_print_dicts: bool = True,
    score: Optional[float] = None,
    winner: Optional[str] = None
) -> str:
    """
    Display two model responses side by side for comparison.

    Args:
        model_a: Name of model A (HTML-escaped before rendering)
        model_b: Name of model B (HTML-escaped before rendering)
        model_a_response: Response data from model A; the sentinel string
            'N/A' marks a missing response
        model_b_response: Response data from model B (same convention)
        use_accordion: If True, group system and info messages in collapsible accordions
        pretty_print_dicts: If True, pretty-print embedded dictionaries
        score: Optional score for the comparison; non-numeric values are
            silently ignored rather than raising
        winner: Optional winner indication ('model_a', 'model_b', or 'tie');
            any other value renders no badge

    Returns:
        HTML string for side-by-side display
    """

    # Convert responses to OpenAI chat format; 'N/A' means no data.
    conversation_a = convert_to_openai_format(model_a_response) if model_a_response != 'N/A' else None
    conversation_b = convert_to_openai_format(model_b_response) if model_b_response != 'N/A' else None

    # Render each conversation to HTML, or a red italic placeholder when absent.
    if conversation_a:
        html_a = display_openai_conversation_html(
            conversation_a,
            use_accordion=use_accordion,
            pretty_print_dicts=pretty_print_dicts
        )
    else:
        html_a = "<p style='color: #dc3545; font-style: italic;'>No response data available</p>"

    if conversation_b:
        html_b = display_openai_conversation_html(
            conversation_b,
            use_accordion=use_accordion,
            pretty_print_dicts=pretty_print_dicts
        )
    else:
        html_b = "<p style='color: #dc3545; font-style: italic;'>No response data available</p>"

    # Winner badges: green trophy on the winning column; a tie puts the same
    # grey badge on both columns.
    winner_badge_a = ""
    winner_badge_b = ""
    if winner:
        if winner == 'model_a':
            winner_badge_a = """
            <span style="
                background: #28a745;
                color: white;
                padding: 4px 8px;
                border-radius: 12px;
                font-size: 12px;
                font-weight: bold;
                margin-left: 10px;
            ">
                πŸ† Winner
            </span>
            """
        elif winner == 'model_b':
            winner_badge_b = """
            <span style="
                background: #28a745;
                color: white;
                padding: 4px 8px;
                border-radius: 12px;
                font-size: 12px;
                font-weight: bold;
                margin-left: 10px;
            ">
                πŸ† Winner
            </span>
            """
        elif winner == 'tie':
            tie_badge = """
            <span style="
                background: #6c757d;
                color: white;
                padding: 4px 8px;
                border-radius: 12px;
                font-size: 12px;
                font-weight: bold;
                margin-left: 10px;
            ">
                🀝 Tie
            </span>
            """
            winner_badge_a = tie_badge
            winner_badge_b = tie_badge

    # Score badge: green for non-negative scores, red for negative ones.
    # Unparseable scores are skipped without error.
    score_info = ""
    if score is not None and score != 'N/A':
        try:
            score_val = float(score)
            score_color = '#28a745' if score_val >= 0 else '#dc3545'
            score_info = f"""
            <div style="text-align: center; margin-bottom: 15px;">
                <span style="
                    background: {score_color};
                    color: white;
                    padding: 6px 12px;
                    border-radius: 15px;
                    font-size: 14px;
                    font-weight: bold;
                ">
                    Comparison Score: {score_val:.3f}
                </span>
            </div>
            """
        except (ValueError, TypeError):
            pass

    # Assemble the two-column flexbox layout; model names are escaped to
    # keep arbitrary model strings from breaking the markup.
    side_by_side_html = f"""
    <div style="margin-bottom: 20px;">
        {score_info}
        <div style="display: flex; gap: 20px; margin-top: 10px;">
            <!-- Model A Column -->
            <div style="flex: 1; border: 2px solid #e9ecef; border-radius: 8px; padding: 15px; background-color: #f8f9fa;">
                <h4 style="margin: 0 0 15px 0; padding-bottom: 10px; border-bottom: 2px solid #dee2e6; color: #495057; display: flex; align-items: center;">
                    <span style="background: #007bff; color: white; padding: 4px 8px; border-radius: 4px; font-size: 12px; margin-right: 10px;">A</span>
                    {html.escape(model_a)}
                    {winner_badge_a}
                </h4>
                <div style="font-size: 13px; line-height: 1.5;">
                    {html_a}
                </div>
            </div>

            <!-- Model B Column -->
            <div style="flex: 1; border: 2px solid #e9ecef; border-radius: 8px; padding: 15px; background-color: #f8f9fa;">
                <h4 style="margin: 0 0 15px 0; padding-bottom: 10px; border-bottom: 2px solid #dee2e6; color: #495057; display: flex; align-items: center;">
                    <span style="background: #fd7e14; color: white; padding: 4px 8px; border-radius: 4px; font-size: 12px; margin-right: 10px;">B</span>
                    {html.escape(model_b)}
                    {winner_badge_b}
                </h4>
                <div style="font-size: 13px; line-height: 1.5;">
                    {html_b}
                </div>
            </div>
        </div>
    </div>
    """

    return side_by_side_html
168
+
169
+
170
def is_side_by_side_dataset(example: Dict[str, Any]) -> bool:
    """
    Check if an example contains side-by-side comparison data.

    Args:
        example: Example dictionary from the dataset

    Returns:
        True if the example has both model_a_response and model_b_response
        present and non-None
    """
    required_keys = ('model_a_response', 'model_b_response')
    return all(key in example and example.get(key) is not None for key in required_keys)
183
+
184
+
185
def extract_side_by_side_data(row: Dict[str, Any]) -> Dict[str, Any]:
    """
    Extract side-by-side comparison data from a row.

    Args:
        row: Row from the dataset

    Returns:
        Dictionary with extracted side-by-side data, substituting display
        defaults for any missing keys
    """
    defaults = {
        'model_a': 'Model A',
        'model_b': 'Model B',
        'model_a_response': 'N/A',
        'model_b_response': 'N/A',
        'winner': None,
        'score': None,
    }
    return {key: row.get(key, default) for key, default in defaults.items()}
lmmvibes/vis_gradio/state.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Shared application state for the LMM-Vibes Gradio viewer.
3
+
4
+ This module centralises mutable globals so they can be imported from any other
5
+ sub-module without circular-import problems.
6
+ """
7
+ from typing import Any, Dict, Optional
8
+ import os
9
+ from pathlib import Path
10
+
11
# Global runtime state – mutable and shared across all tabs.
# Keys are populated by the data-loading tab and read by the display tabs.
app_state: Dict[str, Any] = {
    # Full clustered results DataFrame (one row per property/example).
    "clustered_df": None,
    # NEW canonical key for the FunctionalMetrics dict
    "metrics": None,
    # DEPRECATED alias kept temporarily so that untouched modules continue to work
    "model_stats": None,
    # Path of the currently loaded results file/directory.
    "results_path": None,
    # Model names discovered in the loaded data.
    "available_models": [],
    # Directory the current experiment results were loaded from.
    "current_results_dir": None,
}

# Base directory that contains experiment result folders. Can be changed at
# runtime via launch_app(results_dir=…). A value of None means "not set".
# Prefer persistent storage in Spaces at /data/data when available; otherwise
# fall back to the repo-local "data" directory. BASE_RESULTS_DIR env var wins.
_default_base = "/data/data" if Path("/data/data").exists() else "data"
BASE_RESULTS_DIR: Optional[str] = os.getenv("BASE_RESULTS_DIR", _default_base)
lmmvibes/vis_gradio/utils.py ADDED
@@ -0,0 +1,1673 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility functions for Gradio pipeline results app.
3
+
4
+ This module contains common utility functions used across different components.
5
+ """
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ import json
10
+ import markdown
11
+ import plotly.graph_objects as go
12
+ import plotly.express as px
13
+ from typing import Dict, List, Any, Optional, Tuple
14
+ import html
15
+ import ast
16
+
17
+ # Conversation rendering helpers are now in a dedicated module for clarity
18
+ from . import conversation_display as _convdisp
19
+ from .conversation_display import (
20
+ convert_to_openai_format,
21
+ display_openai_conversation_html,
22
+ pretty_print_embedded_dicts,
23
+ )
24
+
25
+ # NEW IMPLEMENTATION ---------------------------------------------------
26
+ from .metrics_adapter import get_model_clusters, get_all_models
27
+
28
+ # ---------------------------------------------------------------------------
29
+ # NEW helper utilities for FunctionalMetrics format
30
+ # ---------------------------------------------------------------------------
31
+
32
+
33
+ def format_confidence_interval(ci: dict | None, decimals: int = 3) -> str:
34
+ """Return a pretty string for a CI dict of the form {"lower": x, "upper": y}."""
35
+ if not ci or not isinstance(ci, dict):
36
+ return "N/A"
37
+ lower, upper = ci.get("lower"), ci.get("upper")
38
+ if lower is None or upper is None:
39
+ return "N/A"
40
+ return f"[{lower:.{decimals}f}, {upper:.{decimals}f}]"
41
+
42
+
43
+ def get_confidence_interval_width(ci: dict | None) -> float | None:
44
+ """Return CI width (upper-lower) if possible."""
45
+ if not ci or not isinstance(ci, dict):
46
+ return None
47
+ lower, upper = ci.get("lower"), ci.get("upper")
48
+ if lower is None or upper is None:
49
+ return None
50
+ return upper - lower
51
+
52
+
53
+ def has_confidence_intervals(record: dict | None) -> bool:
54
+ """Simple check whether any *_ci key with lower/upper exists in a metrics record."""
55
+ if not record or not isinstance(record, dict):
56
+ return False
57
+ for k, v in record.items():
58
+ if k.endswith("_ci") and isinstance(v, dict) and {"lower", "upper"}.issubset(v.keys()):
59
+ return True
60
+ return False
61
+
62
+
63
+ def extract_quality_score(quality_field: Any) -> float | None:
64
+ """Given a quality field that may be a dict of metric values or a scalar, return its mean."""
65
+ if quality_field is None:
66
+ return None
67
+ if isinstance(quality_field, (int, float)):
68
+ return float(quality_field)
69
+ if isinstance(quality_field, dict) and quality_field:
70
+ return float(np.mean(list(quality_field.values())))
71
+ return None
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # UPDATED: get_top_clusters_for_model for FunctionalMetrics format
75
+ # ---------------------------------------------------------------------------
76
+
77
+
78
def get_top_clusters_for_model(metrics: Dict[str, Any], model_name: str, top_n: int = 10) -> List[Tuple[str, Dict[str, Any]]]:
    """Return the top N clusters (by salience) for a given model.

    Args:
        metrics: The FunctionalMetrics dictionary (3-file format) loaded via data_loader.
        model_name: Name of the model to inspect.
        top_n: Number of clusters to return.

    Returns:
        List of (cluster_name, cluster_dict) tuples sorted by descending
        proportion_delta, excluding the "No properties" placeholder cluster.
    """
    clusters_dict = get_model_clusters(metrics, model_name)
    if not clusters_dict:
        return []

    # Drop the placeholder cluster and rank the rest by salience.
    ranked = sorted(
        ((name, data) for name, data in clusters_dict.items() if name != "No properties"),
        key=lambda item: item[1].get("proportion_delta", 0),
        reverse=True,
    )
    return ranked[:top_n]
100
+
101
+
102
def compute_model_rankings_new(metrics: Dict[str, Any]) -> List[tuple]:
    """Compute rankings of models based on mean salience (proportion_delta).

    Args:
        metrics: The FunctionalMetrics dict loaded by data_loader.

    Returns:
        List[Tuple[str, Dict[str, float]]]: (model_name, summary_dict) pairs
        sorted by descending average salience. Models with no real clusters
        (after dropping "No properties") are omitted.
    """
    rankings: Dict[str, Dict[str, float]] = {}
    for model in get_all_models(metrics):
        clusters = get_model_clusters(metrics, model)
        # Salience values, excluding the "No properties" placeholder cluster.
        saliences = [
            data.get("proportion_delta", 0.0)
            for name, data in clusters.items()
            if name != "No properties"
        ]
        if not saliences:
            continue
        rankings[model] = {
            "avg_salience": float(np.mean(saliences)),
            "median_salience": float(np.median(saliences)),
            "num_clusters": len(saliences),
            "top_salience": float(max(saliences)),
            "std_salience": float(np.std(saliences)),
        }
    return sorted(rankings.items(), key=lambda item: item[1]["avg_salience"], reverse=True)
127
+
128
+
129
def create_model_summary_card_new(
    model_name: str,
    metrics: Dict[str, Any],
    top_n: int = 3,
    score_significant_only: bool = False,
    quality_significant_only: bool = False,
    sort_by: str = "quality_asc",
    min_cluster_size: int = 1,
) -> str:
    """Generate a **styled** HTML summary card for a single model.

    The new implementation recreates the legacy card design the user prefers:
      - Card header with battle count
      - Each cluster displayed as a vertically-spaced block (NOT a table)
      - Frequency, distinctiveness factor and CI inline; quality score right-aligned

    Args:
        model_name: Model to summarise, or the special value ``"all"``, which
            aggregates across models.  For ``"all"`` the per-model fields
            ``proportion_delta`` / ``proportion_delta_significant`` do not
            exist, so the related filters/labels are skipped or substituted.
        metrics: FunctionalMetrics-style dict; ``model_scores`` and per-model
            cluster entries are read from it via ``get_model_clusters``.
        top_n: Maximum number of cluster blocks rendered on the card.
        score_significant_only: Keep only clusters whose frequency delta is
            statistically significant (no-op for ``model_name == "all"``).
        quality_significant_only: Keep only clusters with at least one
            significant quality-delta metric.
        sort_by: One of ``quality_asc``/``quality_desc``/``frequency_asc``/
            ``frequency_desc``/``salience_asc``/``salience_desc``; unknown
            values fall back to salience (or frequency for ``"all"``), desc.
        min_cluster_size: Clusters with fewer examples are dropped.

    Returns:
        A self-contained HTML fragment (inline styles only), or a short
        placeholder ``<div>`` when no data survives the filters.
    """

    clusters_dict = get_model_clusters(metrics, model_name)
    if not clusters_dict:
        return f"<div style='padding:20px'>No cluster data for {model_name}</div>"

    # Filter out "No properties" clusters (catch-all bucket, not informative)
    clusters_dict = {k: v for k, v in clusters_dict.items() if k != "No properties"}

    # Filter clusters ----------------------------------------------------
    all_clusters = [c for c in clusters_dict.values() if c.get("size", 0) >= min_cluster_size]

    if score_significant_only:
        if model_name == "all":
            # For "all" model, we don't have proportion_delta_significant, so skip this filter
            pass
        else:
            all_clusters = [c for c in all_clusters if c.get("proportion_delta_significant", False)]
    if quality_significant_only:
        # Any single significant quality metric qualifies the cluster.
        all_clusters = [c for c in all_clusters if any(c.get("quality_delta_significant", {}).values())]

    if not all_clusters:
        return f"<div style='padding:20px'>No clusters pass filters for {model_name}</div>"

    # Count significant properties ---------------------------------------
    # NOTE: counted over *all* size-qualified clusters, not just the ones
    # that survived the significance filters above.
    significant_frequency_count = 0
    significant_quality_count = 0

    for cluster in clusters_dict.values():
        if cluster.get("size", 0) >= min_cluster_size:
            # Count frequency significance (per-model only; "all" lacks the field)
            if model_name != "all" and cluster.get("proportion_delta_significant", False):
                significant_frequency_count += 1

            # Count quality significance (sum across all metrics)
            quality_delta_significant = cluster.get("quality_delta_significant", {})
            significant_quality_count += sum(quality_delta_significant.values())

    # Sort ---------------------------------------------------------------
    def _mean_quality(c: dict[str, Any]) -> float:
        # Average over whatever quality metrics exist; 0.0 when none.
        vals = list(c.get("quality", {}).values())
        return float(np.mean(vals)) if vals else 0.0

    # Maps sort mode -> (key function, reverse flag) for sorted() below.
    # "salience" uses proportion_delta, except for "all" where plain
    # proportion stands in (delta is undefined there).
    sort_key_map = {
        "quality_asc": (_mean_quality, False),
        "quality_desc": (_mean_quality, True),
        "frequency_desc": (lambda c: c.get("proportion", 0), True),
        "frequency_asc": (lambda c: c.get("proportion", 0), False),
        "salience_desc": (lambda c: c.get("proportion_delta", 0) if model_name != "all" else c.get("proportion", 0), True),
        "salience_asc": (lambda c: c.get("proportion_delta", 0) if model_name != "all" else c.get("proportion", 0), False),
    }

    key_fn, reverse = sort_key_map.get(sort_by, (lambda c: c.get("proportion_delta", 0) if model_name != "all" else c.get("proportion", 0), True))
    sorted_clusters = sorted(all_clusters, key=key_fn, reverse=reverse)[:top_n]

    # Determine total conversations for this model ----------------
    if model_name == "all":
        # For "all" model, sum the individual model totals to avoid double-counting
        model_scores = metrics.get("model_scores", {})
        total_battles = sum(model_data.get("size", 0) for model_data in model_scores.values())
    else:
        model_scores_entry = metrics.get("model_scores", {}).get(model_name, {})
        total_battles = model_scores_entry.get("size")
        if total_battles is None:
            # Fallback: sum cluster sizes (NOTE(review): comment in the
            # original claimed deduplication of example IDs, but this sum can
            # double-count an example appearing in several clusters).
            total_battles = sum(c.get("size", 0) for c in clusters_dict.values())

    # Card header --------------------------------------------------------
    # NOTE(review): the header always says "Top clusters by frequency" even
    # when sort_by selects quality/salience ordering -- confirm intended.
    html_parts: list[str] = [f"""
    <div style="padding: 20px; border:1px solid #e0e0e0; border-radius:8px; margin-bottom:25px;">
        <h3 style="margin-top:0; font-size: 20px;">{html.escape(model_name)}</h3>
        <p style="margin: 4px 0 8px 0; color:#555; font-size:13px;">
            {total_battles} battles Β· Top clusters by frequency
        </p>
        <p style="margin: 0 0 18px 0; color:#666; font-size:12px;">
            πŸ“Š {significant_frequency_count} significant frequency properties Β· {significant_quality_count} significant quality properties
        </p>
    """]

    # Cluster blocks -----------------------------------------------------
    for i, cluster in enumerate(sorted_clusters):
        # Recover the cluster's display name via identity lookup (the value
        # dicts are the same objects stored in clusters_dict).
        name = html.escape(next(k for k, v in clusters_dict.items() if v is cluster))
        prop = cluster.get("proportion", 0)
        freq_pct = prop * 100
        size = cluster.get("size", 0)

        # Check significance flags
        is_proportion_significant = False
        if model_name != "all":
            is_proportion_significant = cluster.get("proportion_delta_significant", False)

        quality_delta_significant = cluster.get("quality_delta_significant", {})
        is_quality_significant = any(quality_delta_significant.values())

        # Create significance indicators (small colored badges)
        significance_indicators = []
        if is_proportion_significant:
            significance_indicators.append('<span style="background: #28a745; color: white; padding: 2px 6px; border-radius: 4px; font-size: 10px; font-weight: bold;">FREQ</span>')
        if is_quality_significant:
            significance_indicators.append('<span style="background: #007bff; color: white; padding: 2px 6px; border-radius: 4px; font-size: 10px; font-weight: bold;">QUAL</span>')

        significance_html = " ".join(significance_indicators) if significance_indicators else ""

        # Distinctiveness factor heuristic
        if model_name == "all":
            # For "all" model, proportion_delta doesn't make sense, so show proportion instead
            distinct_factor = prop
            distinct_text = f"{freq_pct:.1f}% of all conversations"
        else:
            sal = cluster.get("proportion_delta", 0)
            distinct_factor = 1 + (sal / prop) if prop else 1
            distinct_text = f"proportion delta: {sal:+.3f}"

        # Confidence interval (frequency based)
        # NOTE(review): distinct_factor and ci_str are computed but never
        # rendered below -- dead locals kept for behavioural parity.
        ci = cluster.get("proportion_ci")
        ci_str = format_confidence_interval(ci) if ci else "N/A"

        # Quality delta - show each metric separately
        quality_delta = cluster.get("quality_delta", {})
        quality_delta_html = ""

        if quality_delta:
            quality_delta_parts = []
            for metric_name, delta_value in quality_delta.items():
                # Green for non-negative delta, red for negative.
                color = "#28a745" if delta_value >= 0 else "#dc3545"
                quality_delta_parts.append(f'<div style="color:{color}; font-weight:500;">{metric_name}: {delta_value:+.3f}</div>')
            quality_delta_html = "".join(quality_delta_parts)
        else:
            quality_delta_html = '<span style="color:#666;">No quality data</span>'

        # Get light color for this cluster
        cluster_color = get_light_color_for_cluster(name, i)

        html_parts.append(f"""
        <div style="border-left: 4px solid #4c6ef5; padding: 12px 16px; margin-bottom: 10px; background:{cluster_color}; border-radius: 4px;">
            <div style="display:flex; justify-content:space-between; align-items:flex-start;">
                <div style="max-width:80%;">
                    <div style="margin-bottom:4px;">
                        <strong style="font-size:14px;">{name}</strong>
                    </div>
                    <span style="font-size:12px; color:#555;">{freq_pct:.1f}% frequency ({size} out of {total_battles} total) Β· {distinct_text}</span>
                </div>
                <div style="font-size:12px; font-weight:normal; white-space:nowrap; text-align:right;">
                    {quality_delta_html}
                    {significance_html}
                </div>
            </div>
        </div>
        """)

    # Close card div -----------------------------------------------------
    html_parts.append("</div>")

    return "\n".join(html_parts)
298
+
299
+
300
def format_cluster_dataframe(clustered_df: pd.DataFrame,
                             selected_models: Optional[List[str]] = None,
                             cluster_level: str = 'fine') -> pd.DataFrame:
    """Format cluster DataFrame for display in Gradio.

    Args:
        clustered_df: Raw clustered results from the pipeline.
        selected_models: If given, keep only rows whose ``model`` value is in
            this list.  The filter is skipped (instead of raising ``KeyError``)
            when the frame has no ``model`` column.
        cluster_level: ``'fine'`` or ``'coarse'``; selects which cluster
            id/label columns to retain.

    Returns:
        A copy of the input restricted to the display columns that actually
        exist (cluster columns are included under either the prefixed or the
        bare naming pattern, whichever is present).
    """
    df = clustered_df.copy()

    # Debug information
    print(f"DEBUG: format_cluster_dataframe called")
    print(f" - Input DataFrame shape: {df.shape}")
    print(f" - Selected models: {selected_models}")
    print(f" - Available models in data: {df['model'].unique().tolist() if 'model' in df.columns else 'No model column'}")

    # Filter by models if specified.  Bug fix: guard on the column actually
    # existing -- previously a frame without a 'model' column raised KeyError
    # here even though the debug print above handles that case explicitly.
    if selected_models and 'model' in df.columns:
        print(f" - Filtering by {len(selected_models)} selected models")
        df = df[df['model'].isin(selected_models)]
        print(f" - After filtering shape: {df.shape}")
        print(f" - Models after filtering: {df['model'].unique().tolist()}")
    else:
        print(f" - No model filtering applied")

    # Column names differ between pipeline versions; support both the
    # prefixed ('property_description_*') and the bare naming patterns.
    if cluster_level == 'fine':
        id_col = 'property_description_fine_cluster_id'
        label_col = 'property_description_fine_cluster_label'
        alt_id_col = 'fine_cluster_id'
        alt_label_col = 'fine_cluster_label'
    else:
        id_col = 'property_description_coarse_cluster_id'
        label_col = 'property_description_coarse_cluster_label'
        alt_id_col = 'coarse_cluster_id'
        alt_label_col = 'coarse_cluster_label'

    # Try both naming patterns
    if id_col in df.columns and label_col in df.columns:
        cols = ['question_id', 'model', 'property_description', id_col, label_col, 'score']
    elif alt_id_col in df.columns and alt_label_col in df.columns:
        cols = ['question_id', 'model', 'property_description', alt_id_col, alt_label_col, 'score']
    else:
        # Fall back to basic columns if cluster columns are missing
        cols = ['question_id', 'model', 'property_description', 'score']

    # Keep only columns that exist in this particular frame
    available_cols = [col for col in cols if col in df.columns]
    df = df[available_cols]

    print(f" - Final DataFrame shape: {df.shape}")
    print(f" - Final columns: {df.columns.tolist()}")

    return df
354
+
355
+
356
def truncate_cluster_name(cluster_desc: str, max_length: int = 50) -> str:
    """Shorten *cluster_desc* to at most *max_length* characters.

    Strings that already fit are returned unchanged; longer ones are cut and
    suffixed with "..." so the result is exactly *max_length* characters.
    """
    too_long = len(cluster_desc) > max_length
    return f"{cluster_desc[:max_length - 3]}..." if too_long else cluster_desc
361
+
362
+ def create_frequency_comparison_table(model_stats: Dict[str, Any],
363
+ selected_models: List[str],
364
+ cluster_level: str = "fine", # Ignored – kept for backward-compat
365
+ top_n: int = 50,
366
+ selected_model: str | None = None,
367
+ selected_quality_metric: str | None = None) -> pd.DataFrame:
368
+ """Create a comparison table for the new FunctionalMetrics format.
369
+
370
+ The old signature is kept (cluster_level arg is ignored) so that callers
371
+ can be updated incrementally.
372
+ """
373
+
374
+ if not selected_models:
375
+ return pd.DataFrame()
376
+
377
+ # ------------------------------------------------------------------
378
+ # 1. Collect per-model, per-cluster rows
379
+ # ------------------------------------------------------------------
380
+ all_rows: List[dict] = []
381
+ for model in selected_models:
382
+ model_clusters = get_model_clusters(model_stats, model) # type: ignore[arg-type]
383
+ if not model_clusters:
384
+ continue
385
+
386
+ # Optional filter by a single model after the fact
387
+ if selected_model and model != selected_model:
388
+ continue
389
+
390
+ for cluster_name, cdata in model_clusters.items():
391
+ # Filter out "No properties" clusters
392
+ if cluster_name == "No properties":
393
+ continue
394
+
395
+ # Basic numbers
396
+ freq_pct = cdata.get("proportion", 0.0) * 100.0
397
+ prop_ci = cdata.get("proportion_ci")
398
+
399
+ # Quality per metric dicts ------------------------------------------------
400
+ quality_dict = cdata.get("quality", {}) or {}
401
+ quality_ci_dict = cdata.get("quality_ci", {}) or {}
402
+
403
+ # Significance flags
404
+ sal_sig = bool(cdata.get("proportion_delta_significant", False))
405
+ quality_sig_flags = cdata.get("quality_delta_significant", {}) or {}
406
+
407
+ all_rows.append({
408
+ "cluster": cluster_name,
409
+ "model": model,
410
+ "frequency": freq_pct,
411
+ "proportion_ci": prop_ci,
412
+ "quality": quality_dict,
413
+ "quality_ci": quality_ci_dict,
414
+ "score_significant": sal_sig,
415
+ "quality_significant_any": any(quality_sig_flags.values()),
416
+ "quality_significant_metric": quality_sig_flags.get(selected_quality_metric) if selected_quality_metric else None,
417
+ })
418
+
419
+ if not all_rows:
420
+ return pd.DataFrame()
421
+
422
+ df_all = pd.DataFrame(all_rows)
423
+
424
+ # Aggregate frequency across models ----------------------------------
425
+ freq_sum = df_all.groupby("cluster")["frequency"].sum().sort_values(ascending=False)
426
+ top_clusters = freq_sum.head(top_n).index.tolist()
427
+
428
+ df_top = df_all[df_all["cluster"].isin(top_clusters)].copy()
429
+
430
+ table_rows: List[dict] = []
431
+ for clu in top_clusters:
432
+ subset = df_top[df_top["cluster"] == clu]
433
+ avg_freq = subset["frequency"].mean()
434
+
435
+ # Aggregate CI (mean of bounds)
436
+ ci_lowers = [ci.get("lower") for ci in subset["proportion_ci"] if isinstance(ci, dict)]
437
+ ci_uppers = [ci.get("upper") for ci in subset["proportion_ci"] if isinstance(ci, dict)]
438
+ freq_ci = {
439
+ "lower": float(np.mean(ci_lowers)) if ci_lowers else None,
440
+ "upper": float(np.mean(ci_uppers)) if ci_uppers else None,
441
+ } if ci_lowers and ci_uppers else None
442
+
443
+ # Quality aggregation -----------------------------------------------------
444
+ q_vals: List[float] = []
445
+ q_ci_l: List[float] = []
446
+ q_ci_u: List[float] = []
447
+ quality_sig_any = False
448
+ for _, row in subset.iterrows():
449
+ q_dict = row["quality"]
450
+ if selected_quality_metric:
451
+ if selected_quality_metric in q_dict:
452
+ q_vals.append(q_dict[selected_quality_metric])
453
+ ci_metric = row["quality_ci"].get(selected_quality_metric) if isinstance(row["quality_ci"], dict) else None
454
+ if ci_metric:
455
+ q_ci_l.append(ci_metric.get("lower"))
456
+ q_ci_u.append(ci_metric.get("upper"))
457
+ quality_sig_any = quality_sig_any or bool(row["quality_significant_metric"])
458
+ else:
459
+ q_vals.extend(q_dict.values())
460
+ for ci in row["quality_ci"].values():
461
+ if isinstance(ci, dict):
462
+ q_ci_l.append(ci.get("lower"))
463
+ q_ci_u.append(ci.get("upper"))
464
+ quality_sig_any = quality_sig_any or row["quality_significant_any"]
465
+
466
+ quality_val = float(np.mean(q_vals)) if q_vals else None
467
+ quality_ci = {
468
+ "lower": float(np.mean(q_ci_l)),
469
+ "upper": float(np.mean(q_ci_u)),
470
+ } if q_ci_l and q_ci_u else None
471
+
472
+ score_sig = subset["score_significant"].any()
473
+
474
+ table_rows.append({
475
+ "Cluster": clu,
476
+ "Frequency (%)": f"{avg_freq:.1f}",
477
+ "Freq CI": format_confidence_interval(freq_ci),
478
+ "Quality": f"{quality_val:.3f}" if quality_val is not None else "N/A",
479
+ "Quality CI": format_confidence_interval(quality_ci) if quality_ci else "N/A",
480
+ "Score Significance": "Yes" if score_sig else "No",
481
+ "Quality Significance": "Yes" if quality_sig_any else "No",
482
+ })
483
+
484
+ return pd.DataFrame(table_rows)
485
+
486
+
487
def create_frequency_comparison_plots(model_stats: Dict[str, Any],
                                      selected_models: List[str],
                                      cluster_level: str = 'fine',
                                      top_n: int = 50,
                                      show_confidence_intervals: bool = False) -> Tuple[go.Figure, go.Figure]:
    """Create frequency comparison plots (matching frequencies_tab.py exactly).

    Builds two horizontal bar charts:
      1. per-model frequency (%) for the top clusters, grouped bars;
      2. a single-bar quality-score chart over the same clusters.

    Args:
        model_stats: Legacy per-model stats dict; each model maps
            ``cluster_level`` -> list of cluster dicts.  NOTE(review): this is
            the *old* stats layout (``property_description``/``score_ci``
            keys), unlike the FunctionalMetrics layout used elsewhere in this
            module -- confirm callers pass the right structure.
        selected_models: Models to plot; others are skipped.
        cluster_level: 'fine' or 'coarse' cluster list to read.
        top_n: Cap on number of clusters shown.
        show_confidence_intervals: Toggles visibility of the error bars.

    Returns:
        (frequency figure, quality figure); placeholder annotated figures
        when no data is available.
    """

    print(f"\nDEBUG: Plotting function called with:")
    print(f" - Selected models: {selected_models}")
    print(f" - Cluster level: {cluster_level}")
    print(f" - Top N: {top_n}")
    print(f" - Available models in stats: {list(model_stats.keys())}")

    # Use the same data preparation logic as the table function
    # Collect all clusters across all models for the chart (exact copy from frequencies_tab.py)
    all_clusters_data = []
    for model_name, model_data in model_stats.items():
        if model_name not in selected_models:
            continue

        clusters = model_data.get(cluster_level, [])
        for cluster in clusters:
            # Filter out "No properties" clusters
            if cluster.get('property_description') == "No properties":
                continue

            # Get confidence intervals for quality scores if available
            quality_score_ci = cluster.get('quality_score_ci', {})
            has_quality_ci = bool(quality_score_ci)

            # Get distinctiveness score confidence intervals (correct structure)
            score_ci = cluster.get('score_ci', {})
            ci_lower = score_ci.get('lower') if score_ci else None
            ci_upper = score_ci.get('upper') if score_ci else None

            all_clusters_data.append({
                'property_description': cluster['property_description'],
                'model': model_name,
                'frequency': cluster.get('proportion', 0) * 100,  # Convert to percentage
                'size': cluster.get('size', 0),
                'cluster_size_global': cluster.get('cluster_size_global', 0),
                'has_ci': has_confidence_intervals(cluster),
                'ci_lower': ci_lower,
                'ci_upper': ci_upper,
                'has_quality_ci': has_quality_ci
            })

    if not all_clusters_data:
        # Return empty figures
        empty_fig = go.Figure()
        empty_fig.add_annotation(text="No data available", xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False)
        return empty_fig, empty_fig

    clusters_df = pd.DataFrame(all_clusters_data)

    # Get all unique clusters for the chart
    all_unique_clusters = clusters_df['property_description'].unique()
    total_clusters = len(all_unique_clusters)

    # Show all clusters by default
    top_n_for_chart = min(top_n, total_clusters)

    # Calculate total frequency per cluster and get top clusters
    cluster_totals = clusters_df.groupby('property_description')['frequency'].sum().sort_values(ascending=False)
    top_clusters = cluster_totals.head(top_n_for_chart).index.tolist()

    # Get quality scores for the same clusters to sort by quality
    quality_data_for_sorting = []
    for model_name, model_data in model_stats.items():
        if model_name not in selected_models:
            continue
        clusters = model_data.get(cluster_level, [])
        for cluster in clusters:
            # Filter out "No properties" clusters
            if cluster.get('property_description') == "No properties":
                continue

            if cluster['property_description'] in top_clusters:
                quality_data_for_sorting.append({
                    'property_description': cluster['property_description'],
                    'quality_score': extract_quality_score(cluster.get('quality_score', 0))
                })

    # Calculate average quality score per cluster and sort
    if quality_data_for_sorting:
        quality_df_for_sorting = pd.DataFrame(quality_data_for_sorting)
        avg_quality_per_cluster = quality_df_for_sorting.groupby('property_description')['quality_score'].mean().sort_values(ascending=True)  # Low to high
        top_clusters = avg_quality_per_cluster.index.tolist()
        # Reverse the order so low quality appears at top of chart
        top_clusters = top_clusters[::-1]

    # Filter data to only include top clusters
    chart_data = clusters_df[clusters_df['property_description'].isin(top_clusters)]

    if chart_data.empty:
        # Return empty figures
        empty_fig = go.Figure()
        empty_fig.add_annotation(text="No data available", xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False)
        return empty_fig, empty_fig

    # Get unique models for colors
    models = chart_data['model'].unique()
    # Use a color palette that avoids yellow - using Set1 which has better contrast
    colors = px.colors.qualitative.Set1[:len(models)]

    # Create horizontal bar chart for frequencies
    fig = go.Figure()

    # Add a bar for each model
    for i, model in enumerate(models):
        # Shadows the earlier `model_data` loop variable; intentional reuse.
        model_data = chart_data[chart_data['model'] == model]

        # Sort by cluster order (same as top_clusters)
        model_data = model_data.set_index('property_description').reindex(top_clusters).reset_index()

        # Fill NaN values with 0 for missing clusters
        model_data['frequency'] = model_data['frequency'].fillna(0)
        model_data['has_ci'] = model_data['has_ci'].fillna(False)
        # For CI columns, replace NaN with None using where() instead of fillna(None)
        model_data['ci_lower'] = model_data['ci_lower'].where(pd.notna(model_data['ci_lower']), None)
        model_data['ci_upper'] = model_data['ci_upper'].where(pd.notna(model_data['ci_upper']), None)

        # Ensure frequency is numeric and non-negative
        model_data['frequency'] = pd.to_numeric(model_data['frequency'], errors='coerce').fillna(0)
        model_data['frequency'] = model_data['frequency'].clip(lower=0)

        # Debug: print model data for first model
        if i == 0:  # Only print for first model to avoid spam
            print(f"DEBUG: Model {model} data sample:")
            print(f" - Clusters: {len(model_data)}")
            print(f" - Frequency range: {model_data['frequency'].min():.2f} - {model_data['frequency'].max():.2f}")
            print(f" - Non-zero frequencies: {(model_data['frequency'] > 0).sum()}")
            if len(model_data) > 0:
                print(f" - Sample row: {model_data.iloc[0][['property_description', 'frequency']].to_dict()}")

        # Remove any rows where property_description is NaN (these are clusters this model doesn't appear in)
        model_data = model_data.dropna(subset=['property_description'])

        # Get confidence intervals for error bars
        ci_lower = []
        ci_upper = []
        for _, row in model_data.iterrows():
            freq_value = row.get('frequency', 0)
            if (row.get('has_ci', False) and
                pd.notna(row.get('ci_lower')) and
                pd.notna(row.get('ci_upper')) and
                freq_value > 0):  # Only calculate CIs for non-zero frequencies

                # IMPORTANT: These are distinctiveness score CIs, not frequency CIs
                # The distinctiveness score measures how much more/less frequently
                # a model exhibits this behavior compared to the median model
                # We can use this to estimate uncertainty in the frequency measurement
                distinctiveness_ci_width = row['ci_upper'] - row['ci_lower']

                # Convert to frequency uncertainty (approximate)
                # A wider distinctiveness CI suggests more uncertainty in the frequency
                # NOTE(review): 0.1 is an ad-hoc scaling constant, not a
                # statistically derived conversion -- treat the bars as indicative.
                freq_uncertainty = distinctiveness_ci_width * freq_value * 0.1
                ci_lower.append(max(0, freq_value - freq_uncertainty))
                ci_upper.append(freq_value + freq_uncertainty)
            else:
                ci_lower.append(None)
                ci_upper.append(None)

        # Debug: Check the data going into the plot
        print(f"DEBUG: Adding trace for model {model}:")
        print(f" - Y values (clusters): {model_data['property_description'].tolist()[:3]}...")  # First 3 clusters
        print(f" - X values (frequencies): {model_data['frequency'].tolist()[:3]}...")  # First 3 frequencies
        print(f" - Total data points: {len(model_data)}")

        fig.add_trace(go.Bar(
            y=model_data['property_description'],
            x=model_data['frequency'],
            name=model,
            orientation='h',
            marker_color=colors[i],
            error_x=dict(
                type='data',
                array=[u - l if u is not None and l is not None else None for l, u in zip(ci_lower, ci_upper)],
                arrayminus=[f - l if f is not None and l is not None else None for f, l in zip(model_data['frequency'], ci_lower)],
                visible=show_confidence_intervals,
                thickness=1,
                width=3,
                color='rgba(0,0,0,0.3)'
            ),
            hovertemplate='<b>%{y}</b><br>' +
                          f'Model: {model}<br>' +
                          'Frequency: %{x:.1f}%<br>' +
                          'CI: %{customdata[0]}<extra></extra>',
            # NOTE(review): the double nesting makes customdata a single-row
            # list (one entry for the whole trace) rather than one entry per
            # point -- confirm the hover CI renders correctly per bar.
            customdata=[[
                format_confidence_interval({
                    'lower': l,
                    'upper': u
                }) if l is not None and u is not None else "N/A"
                for l, u in zip(ci_lower, ci_upper)
            ]]
        ))

    # Update layout
    fig.update_layout(
        title=f"Model Frequencies in Top {len(top_clusters)} Clusters",
        xaxis_title="Frequency (%)",
        yaxis_title="Cluster Description",
        barmode='group',  # Group bars side by side
        height=max(600, len(top_clusters) * 25),  # Adjust height based on number of clusters
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        )
    )

    # Update y-axis to show truncated cluster names
    fig.update_yaxes(
        tickmode='array',
        ticktext=[truncate_cluster_name(desc, 60) for desc in top_clusters],
        tickvals=top_clusters
    )

    # Create quality score chart
    # Get quality scores for the same clusters (single score per cluster)
    quality_data = []
    quality_cis = []  # Add confidence intervals for quality scores

    for cluster_desc in top_clusters:
        # Get the first available quality score for this cluster
        # (first model in model_stats order that contains the cluster wins)
        for model_name, model_data in model_stats.items():
            clusters = model_data.get(cluster_level, [])
            for cluster in clusters:
                if cluster['property_description'] == cluster_desc:
                    quality_score = extract_quality_score(cluster.get('quality_score', 0))
                    quality_data.append({
                        'property_description': cluster_desc,
                        'quality_score': quality_score
                    })

                    # Get quality score confidence intervals
                    quality_ci = cluster.get('quality_score_ci', {})
                    if isinstance(quality_ci, dict) and quality_ci:
                        # Get the first available quality CI; the for/else
                        # records a None CI when no metric had usable bounds.
                        for score_key, ci_data in quality_ci.items():
                            if isinstance(ci_data, dict):
                                ci_lower = ci_data.get('lower')
                                ci_upper = ci_data.get('upper')
                                if ci_lower is not None and ci_upper is not None:
                                    quality_cis.append({
                                        'property_description': cluster_desc,
                                        'ci_lower': ci_lower,
                                        'ci_upper': ci_upper
                                    })
                                    break
                        else:
                            quality_cis.append({
                                'property_description': cluster_desc,
                                'ci_lower': None,
                                'ci_upper': None
                            })
                    else:
                        quality_cis.append({
                            'property_description': cluster_desc,
                            'ci_lower': None,
                            'ci_upper': None
                        })
                    break
            # Stop scanning models once this cluster has a recorded score.
            if any(q['property_description'] == cluster_desc for q in quality_data):
                break

    if quality_data:
        quality_df = pd.DataFrame(quality_data)
        quality_cis_df = pd.DataFrame(quality_cis) if quality_cis else None

        # Create quality score chart with single bars
        fig_quality = go.Figure()

        # Prepare confidence intervals for error bars (aligned with quality_df rows)
        ci_lower = []
        ci_upper = []
        for _, row in quality_df.iterrows():
            cluster_desc = row['property_description']
            if quality_cis_df is not None:
                ci_row = quality_cis_df[quality_cis_df['property_description'] == cluster_desc]
                if not ci_row.empty:
                    ci_lower.append(ci_row.iloc[0]['ci_lower'])
                    ci_upper.append(ci_row.iloc[0]['ci_upper'])
                else:
                    ci_lower.append(None)
                    ci_upper.append(None)
            else:
                ci_lower.append(None)
                ci_upper.append(None)

        # Add a single bar for each cluster
        fig_quality.add_trace(go.Bar(
            y=[truncate_cluster_name(desc, 60) for desc in quality_df['property_description']],
            x=quality_df['quality_score'],
            orientation='h',
            marker_color='lightblue',  # Single color for all bars
            name='Quality Score',
            showlegend=False,
            error_x=dict(
                type='data',
                array=[u - l if u is not None and l is not None else None for l, u in zip(ci_lower, ci_upper)],
                arrayminus=[q - l if q is not None and l is not None else None for q, l in zip(quality_df['quality_score'], ci_lower)],
                visible=show_confidence_intervals,
                thickness=1,
                width=3,
                color='rgba(0,0,0,0.3)'
            ),
            hovertemplate='<b>%{y}</b><br>' +
                          'Quality Score: %{x:.3f}<br>' +
                          'CI: %{customdata[0]}<extra></extra>',
            # NOTE(review): same single-row customdata nesting as above.
            customdata=[[
                format_confidence_interval({
                    'lower': l,
                    'upper': u
                }) if l is not None and u is not None else "N/A"
                for l, u in zip(ci_lower, ci_upper)
            ]]
        ))

        # Update layout
        fig_quality.update_layout(
            title=f"Quality Scores",
            xaxis_title="Quality Score",
            yaxis_title="",  # No y-axis title to save space
            height=max(600, len(top_clusters) * 25),  # Same height as main chart
            showlegend=False,
            yaxis=dict(showticklabels=False)  # Hide y-axis labels to save space
        )
    else:
        # Create empty quality figure
        fig_quality = go.Figure()
        fig_quality.add_annotation(text="No quality score data available",
                                   xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False)

    return fig, fig_quality
825
+
826
+
827
def search_clusters_by_text(clustered_df: pd.DataFrame,
                            search_term: str,
                            search_in: str = 'description') -> pd.DataFrame:
    """Search clusters by text in descriptions or other fields.

    Args:
        clustered_df: Clustered results table.
        search_term: Case-insensitive substring to match; empty returns the
            first 100 rows unfiltered.
        search_in: 'description', 'model', 'cluster_label', or any other
            value to search across all known text columns.

    Returns:
        At most 100 matching rows of ``clustered_df``.
    """
    if not search_term:
        return clustered_df.head(100)  # Return first 100 if no search

    search_term = search_term.lower()

    if search_in == 'description':
        mask = clustered_df['property_description'].str.lower().str.contains(search_term, na=False)
    elif search_in == 'model':
        mask = clustered_df['model'].str.lower().str.contains(search_term, na=False)
    elif search_in == 'cluster_label':
        # Use correct column names from pipeline
        fine_label_col = 'property_description_fine_cluster_label'
        coarse_label_col = 'property_description_coarse_cluster_label'
        # Bug fix: seed the mask on the frame's own index.  The previous
        # pd.Series([False] * len(df)) used a default 0..n-1 index, so the
        # |= below misaligned (and boolean indexing failed) whenever the
        # frame had a non-default index, e.g. after prior filtering.
        mask = pd.Series(False, index=clustered_df.index)

        if fine_label_col in clustered_df.columns:
            mask |= clustered_df[fine_label_col].str.lower().str.contains(search_term, na=False)
        if coarse_label_col in clustered_df.columns:
            mask |= clustered_df[coarse_label_col].str.lower().str.contains(search_term, na=False)
    else:
        # Search in all text columns using correct column names
        text_cols = ['property_description', 'model',
                     'property_description_fine_cluster_label',
                     'property_description_coarse_cluster_label']
        mask = pd.Series(False, index=clustered_df.index)  # Same index fix as above
        for col in text_cols:
            if col in clustered_df.columns:
                mask |= clustered_df[col].str.lower().str.contains(search_term, na=False)

    return clustered_df[mask].head(100)
861
+
862
+
863
def search_clusters_only(clustered_df: pd.DataFrame,
                         search_term: str,
                         cluster_level: str = 'fine') -> pd.DataFrame:
    """Search only over cluster labels, not individual property descriptions.

    An empty *search_term* returns the frame untouched.  Both the prefixed
    (``property_description_*_cluster_label``) and bare (``*_cluster_label``)
    column names are tried, in that order; if neither exists an empty
    DataFrame is returned.
    """
    if not search_term:
        return clustered_df

    needle = search_term.lower()

    # Candidate label columns for the requested level, preferred name first.
    if cluster_level == 'fine':
        candidates = ('property_description_fine_cluster_label', 'fine_cluster_label')
    else:
        candidates = ('property_description_coarse_cluster_label', 'coarse_cluster_label')

    for column in candidates:
        if column in clustered_df.columns:
            hits = clustered_df[column].str.lower().str.contains(needle, na=False)
            return clustered_df[hits]

    # Neither naming pattern is present in this frame.
    return pd.DataFrame()
890
+
891
+
892
def create_interactive_cluster_viewer(clustered_df: pd.DataFrame,
                                      selected_models: Optional[List[str]] = None,
                                      cluster_level: str = 'fine') -> str:
    """Create interactive cluster viewer HTML similar to Streamlit version.

    Builds a self-contained HTML fragment of expandable ``<details>`` cards,
    one per cluster, sorted by cluster size (largest first). Quality and
    frequency figures are pulled from ``app_state["metrics"]["cluster_scores"]``
    keyed by cluster label.

    Args:
        clustered_df: Pipeline output with one row per (model, property) and
            cluster id/label columns in either the prefixed
            (``property_description_*``) or unprefixed naming scheme.
        selected_models: If given, restrict rows to these values of the
            ``model`` column before grouping.
        cluster_level: ``'fine'`` or ``'coarse'``. When coarse columns are
            missing, silently falls back to fine clusters (with a banner).

    Returns:
        HTML string; on any failure a styled error/notice ``<div>`` is
        returned instead of raising.
    """
    if clustered_df.empty:
        return "<p>No cluster data available</p>"

    df = clustered_df.copy()

    # Debug information (intentionally printed to stdout, not logged)
    print(f"DEBUG: create_interactive_cluster_viewer called")
    print(f" - Input DataFrame shape: {df.shape}")
    print(f" - Selected models: {selected_models}")
    print(f" - Available models in data: {df['model'].unique().tolist() if 'model' in df.columns else 'No model column'}")

    # Filter by models if specified
    if selected_models:
        print(f" - Filtering by {len(selected_models)} selected models")
        df = df[df['model'].isin(selected_models)]
        print(f" - After filtering shape: {df.shape}")
        print(f" - Models after filtering: {df['model'].unique().tolist()}")
    else:
        print(f" - No model filtering applied")

    if df.empty:
        return f"<p>No data found for selected models: {', '.join(selected_models or [])}</p>"

    # Get cluster scores data for quality and frequency information.
    # Imported lazily to avoid a circular import at module load time.
    from .state import app_state
    cluster_scores = app_state.get("metrics", {}).get("cluster_scores", {})

    # Use the actual column names from the pipeline output (matching Streamlit version)
    if cluster_level == 'fine':
        id_col = 'property_description_fine_cluster_id'
        label_col = 'property_description_fine_cluster_label'
        # Also check for alternative naming without prefix
        alt_id_col = 'fine_cluster_id'
        alt_label_col = 'fine_cluster_label'
    else:
        id_col = 'property_description_coarse_cluster_id'
        label_col = 'property_description_coarse_cluster_label'
        # Also check for alternative naming without prefix
        alt_id_col = 'coarse_cluster_id'
        alt_label_col = 'coarse_cluster_label'

    # Track if we fall back from coarse to fine (drives the banner below)
    fell_back_to_fine = False

    # Check if required columns exist and provide helpful debug info
    # Try both naming patterns
    if id_col in df.columns and label_col in df.columns:
        # Use the expected naming pattern
        pass
    elif alt_id_col in df.columns and alt_label_col in df.columns:
        # Use the alternative naming pattern
        id_col = alt_id_col
        label_col = alt_label_col
    else:
        # If coarse clusters are not available, try to fall back to fine clusters
        if cluster_level == 'coarse':
            # Check if fine clusters are available
            fine_id_col = 'property_description_fine_cluster_id'
            fine_label_col = 'property_description_fine_cluster_label'
            fine_alt_id_col = 'fine_cluster_id'
            fine_alt_label_col = 'fine_cluster_label'

            if (fine_id_col in df.columns and fine_label_col in df.columns) or (fine_alt_id_col in df.columns and fine_alt_label_col in df.columns):
                # Fall back to fine clusters (prefer the prefixed naming)
                if fine_id_col in df.columns and fine_label_col in df.columns:
                    id_col = fine_id_col
                    label_col = fine_label_col
                else:
                    id_col = fine_alt_id_col
                    label_col = fine_alt_label_col
                cluster_level = 'fine'  # Update the cluster level for display
                fell_back_to_fine = True
            else:
                # No cluster columns available at all
                available_cols = list(df.columns)
                return f"""
                <div style="padding: 20px; background: #fff3cd; border: 1px solid #ffeaa7; border-radius: 8px;">
                    <h4>❌ Missing cluster columns in data</h4>
                    <p><strong>Expected:</strong> {id_col}, {label_col} OR {alt_id_col}, {alt_label_col}</p>
                    <p><strong>Available columns:</strong> {', '.join(available_cols)}</p>
                    <p>Please ensure your data contains clustering results from the LMM-Vibes pipeline.</p>
                </div>
                """
        else:
            # For fine clusters, show the original error
            available_cols = list(df.columns)
            return f"""
            <div style="padding: 20px; background: #fff3cd; border: 1px solid #ffeaa7; border-radius: 8px;">
                <h4>❌ Missing {cluster_level} cluster columns in data</h4>
                <p><strong>Expected:</strong> {id_col}, {label_col} OR {alt_id_col}, {alt_label_col}</p>
                <p><strong>Available columns:</strong> {', '.join(available_cols)}</p>
                <p>Please ensure your data contains clustering results from the LMM-Vibes pipeline.</p>
            </div>
            """

    # Group by cluster to get cluster information
    try:
        print(f" - Grouping by cluster columns: {id_col}, {label_col}")
        # One row per cluster: property count, the unique descriptions, and
        # the unique models that exhibit the property.
        cluster_groups = df.groupby([id_col, label_col]).agg({
            'property_description': ['count', lambda x: x.unique().tolist()],
            'model': lambda x: x.unique().tolist()
        }).reset_index()

        # Flatten column names (agg above yields a MultiIndex)
        cluster_groups.columns = [
            id_col, label_col, 'size', 'property_descriptions', 'models'
        ]

        # Sort by size (largest first)
        cluster_groups = cluster_groups.sort_values('size', ascending=False)

        # Filter out "No properties" clusters
        cluster_groups = cluster_groups[cluster_groups[label_col] != "No properties"]

        print(f" - Found {len(cluster_groups)} clusters")
        print(f" - Cluster sizes: {cluster_groups['size'].tolist()}")
        print(f" - Models per cluster: {[len(models) for models in cluster_groups['models']]}")

    except Exception as e:
        # Any grouping failure is surfaced to the user as an HTML error box.
        return f"""
        <div style="padding: 20px; background: #f8d7da; border: 1px solid #f5c6cb; border-radius: 8px;">
            <h4>❌ Error processing cluster data</h4>
            <p><strong>Error:</strong> {str(e)}</p>
            <p>Please check your data format and try again.</p>
        </div>
        """

    if len(cluster_groups) == 0:
        return """
        <div style="padding: 20px; background: #d1ecf1; border: 1px solid #bee5eb; border-radius: 8px;">
            <h4>ℹ️ No clusters found</h4>
            <p>No clusters match your current filters. Try selecting different models or adjusting your search.</p>
        </div>
        """

    # Create HTML
    html = f"""
    <div style="max-width: 1600px; margin: 0 auto;">
        <h3>πŸ” Interactive Cluster Viewer ({cluster_level.title()} Level)</h3>
        <p style="color: #666; margin-bottom: 20px;">
            Click on clusters below to explore their property descriptions.
            Showing {len(cluster_groups)} clusters sorted by size.
        </p>
    """

    # Add a note if we fell back from coarse to fine clusters
    if cluster_level == 'fine' and fell_back_to_fine:
        html += """
        <div style="padding: 15px; background: #fff3cd; border: 1px solid #ffeaa7; border-radius: 8px; margin-bottom: 20px;">
            <strong>Note:</strong> Coarse clusters not available in this dataset. Showing fine clusters instead.
        </div>
        """

    for i, row in cluster_groups.iterrows():
        cluster_id = row[id_col]
        cluster_label = row[label_col]
        cluster_size = row['size']
        property_descriptions = row['property_descriptions']
        models_in_cluster = row['models']

        # Get quality and frequency information from cluster_scores
        # (keyed by cluster label; missing entries render as 0 / "N/A").
        cluster_metrics = cluster_scores.get(cluster_label, {})
        frequency_pct = cluster_metrics.get("proportion", 0) * 100 if cluster_metrics else 0
        quality_scores = cluster_metrics.get("quality", {})
        quality_delta = cluster_metrics.get("quality_delta", {})

        # Build per-metric header display: "metric: score (delta)"
        header_quality_display = "N/A"
        if quality_scores or quality_delta:
            metric_names = sorted(set(quality_scores.keys()) | set(quality_delta.keys()))
            parts: list[str] = []
            for metric_name in metric_names:
                score_val = quality_scores.get(metric_name)
                delta_val = quality_delta.get(metric_name)
                score_str = f"{score_val:.3f}" if isinstance(score_val, (int, float)) else "N/A"
                if isinstance(delta_val, (int, float)):
                    # Green for non-negative deltas, red for negative.
                    color = "#28a745" if delta_val >= 0 else "#dc3545"
                    parts.append(f"{metric_name}: {score_str} <span style=\"color: {color};\">({delta_val:+.3f})</span>")
                else:
                    parts.append(f"{metric_name}: {score_str}")
            header_quality_display = "\n".join(parts)

        # Format quality scores for detailed view
        quality_html = ""
        if quality_scores:
            quality_parts = []
            for metric_name, score in quality_scores.items():
                color = "#28a745" if score >= 0 else "#dc3545"
                quality_parts.append(f'<span style="color:{color}; font-weight:500;">{metric_name}: {score:.3f}</span>')
            quality_html = " | ".join(quality_parts)
        else:
            quality_html = '<span style="color:#666;">No quality data</span>'

        # Format quality delta (relative to average)
        quality_delta_html = ""
        if quality_delta:
            delta_parts = []
            for metric_name, delta in quality_delta.items():
                color = "#28a745" if delta >= 0 else "#dc3545"
                sign = "+" if delta >= 0 else ""
                delta_parts.append(f'<span style="color:{color}; font-weight:500;">{metric_name}: {sign}{delta:.3f}</span>')
            quality_delta_html = " | ".join(delta_parts)
        else:
            quality_delta_html = '<span style="color:#666;">No delta data</span>'

        # Format header quality score with visual indicators
        header_quality_text = header_quality_display

        # Get light color for this cluster (matching overview style)
        cluster_color = get_light_color_for_cluster(cluster_label, i)

        # Create expandable cluster card with overview-style design
        html += f"""
        <details style="margin: 15px 0; border: 1px solid #e0e0e0; border-radius: 8px; overflow: hidden; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
            <summary style="
                padding: 15px;
                background: {cluster_color};
                color: #333;
                cursor: pointer;
                font-weight: 600;
                font-size: 16px;
                user-select: none;
                list-style: none;
                display: flex;
                justify-content: space-between;
                align-items: center;
                border-bottom: 1px solid #dee2e6;
            ">
                <div style="max-width: 80%;">
                    <div style="margin-bottom: 4px;">
                        <strong style="font-size: 14px;">{cluster_label}</strong>
                    </div>
                    <span style="font-size: 12px; color: #555;">
                        {frequency_pct:.1f}% frequency ({cluster_size} properties) Β· {len(models_in_cluster)} models
                    </span>
                </div>
                <div style="font-size: 12px; font-weight: normal; white-space: nowrap; text-align: right;">
                    <div style="margin-bottom: 4px;">
                        <span style="font-weight: 500;">{header_quality_text}</span>
                    </div>
                    <div style="color: #6c757d;">
                        {frequency_pct:.1f}% frequency
                    </div>
                </div>
            </summary>

            <div style="padding: 20px; background: #f8f9fa;">
                <div style="margin-bottom: 15px;">
                    <strong>Cluster ID:</strong> {cluster_id}<br>
                    <strong>Size:</strong> {cluster_size} properties<br>
                    <strong>Models:</strong> {', '.join(models_in_cluster)}<br>
                    <strong>Frequency:</strong> {frequency_pct:.1f}% of all conversations<br>
                    <strong>Quality Scores:</strong> {quality_html}<br>
                    <strong>Quality vs Average:</strong> {quality_delta_html}
                </div>

                <h4 style="color: #333; margin: 15px 0 10px 0;">
                    Property Descriptions ({len(property_descriptions)})
                </h4>

                <div style="max-height: 300px; overflow-y: auto; background: white; border: 1px solid #ddd; border-radius: 4px; padding: 10px;">
        """

        # Display property descriptions
        # NOTE(review): this inner loop reuses the outer loop variable `i`.
        # Harmless today because `i` is reassigned by iterrows() each outer
        # iteration before any further use, but fragile — consider renaming.
        for i, desc in enumerate(property_descriptions, 1):
            html += f"""
                <div style="
                    padding: 8px;
                    margin: 4px 0;
                    background: #f8f9fa;
                    border-left: 3px solid #667eea;
                    border-radius: 2px;
                ">
                    <strong>{i}.</strong> {desc}
                </div>
            """

        html += """
                </div>
            </div>
        </details>
        """

    html += "</div>"
    return html
1181
+
1182
+
1183
def get_cluster_statistics(clustered_df: pd.DataFrame,
                           selected_models: Optional[List[str]] = None) -> Dict[str, Any]:
    """Get cluster statistics for display.

    Computes per-level (fine/coarse) cluster counts and min/max/avg cluster
    sizes, accepting either the pipeline's prefixed column names
    (``property_description_<level>_cluster_id``) or the unprefixed fallback
    (``<level>_cluster_id``). Levels whose columns are absent are simply
    omitted from the result.

    Args:
        clustered_df: Clustered results DataFrame.
        selected_models: If given, restrict rows to these ``model`` values
            before computing statistics.

    Returns:
        Dict with ``total_properties``, ``total_models`` and, per available
        level, ``<level>_clusters`` plus ``{min,max,avg}_properties_per_
        <level>_cluster``. Empty dict for an empty input DataFrame.
    """
    if clustered_df.empty:
        return {}

    df = clustered_df.copy()

    # Filter by models if specified
    if selected_models:
        df = df[df['model'].isin(selected_models)]

    stats: Dict[str, Any] = {
        'total_properties': len(df),
        'total_models': df['model'].nunique() if 'model' in df.columns else 0,
    }

    def _add_level_stats(level: str, candidate_cols: List[str]) -> None:
        # Add stats for one cluster level using the first id column that
        # exists in the DataFrame (prefixed naming takes precedence).
        id_col = next((c for c in candidate_cols if c in df.columns), None)
        if id_col is None:
            return
        stats[f'{level}_clusters'] = df[id_col].nunique()
        cluster_sizes = df.groupby(id_col).size()
        empty = cluster_sizes.empty  # e.g. all rows filtered out above
        stats[f'min_properties_per_{level}_cluster'] = cluster_sizes.min() if not empty else 0
        stats[f'max_properties_per_{level}_cluster'] = cluster_sizes.max() if not empty else 0
        stats[f'avg_properties_per_{level}_cluster'] = cluster_sizes.mean() if not empty else 0

    _add_level_stats('fine', ['property_description_fine_cluster_id', 'fine_cluster_id'])
    _add_level_stats('coarse', ['property_description_coarse_cluster_id', 'coarse_cluster_id'])

    return stats
1235
+
1236
+
1237
def get_unique_values_for_dropdowns(clustered_df: pd.DataFrame) -> Dict[str, List[str]]:
    """Get unique values for dropdown menus.

    Collects the distinct prompts, models and cluster labels/properties from
    the clustered results, using the first matching column for each category.
    Long prompt/property entries are truncated to 100 characters with an
    appended ``...`` for display.

    Args:
        clustered_df: Clustered results DataFrame. Supports both single-model
            datasets (``model`` column) and side-by-side datasets
            (``model_a``/``model_b`` columns).

    Returns:
        Dict with keys ``prompts``, ``models`` and ``properties``, each a
        sorted list of display strings (empty lists when the source columns
        are missing or the DataFrame is empty).
    """
    if clustered_df.empty:
        return {'prompts': [], 'models': [], 'properties': []}

    def _first_existing_column(candidates: List[str]) -> Optional[str]:
        # First candidate column present in the DataFrame, or None.
        return next((c for c in candidates if c in clustered_df.columns), None)

    def _sorted_truncated(values: List[str]) -> List[str]:
        # Sort first, then truncate long entries for display (matches the
        # original per-branch behavior).
        return [v[:100] + "..." if len(v) > 100 else v for v in sorted(values)]

    # Prompts: first matching prompt-like column wins.
    prompts: List[str] = []
    prompt_col = _first_existing_column(['prompt', 'question', 'input', 'user_prompt'])
    if prompt_col is not None:
        prompts = _sorted_truncated(clustered_df[prompt_col].dropna().unique().tolist())

    # Models: handle both single model and side-by-side datasets.
    models: List[str] = []
    if 'model' in clustered_df.columns:
        # Single model datasets
        models = sorted(clustered_df['model'].dropna().unique().tolist())
    elif 'model_a' in clustered_df.columns and 'model_b' in clustered_df.columns:
        # Side-by-side datasets - combine models from both columns
        models_a = clustered_df['model_a'].dropna().unique().tolist()
        models_b = clustered_df['model_b'].dropna().unique().tolist()
        models = sorted(set(models_a + models_b))

    # Properties: prefer fine cluster labels (either naming scheme), falling
    # back to raw property descriptions; "No properties" is filtered out.
    properties: List[str] = []
    prop_col = _first_existing_column([
        'property_description_fine_cluster_label',
        'fine_cluster_label',
        'property_description',
    ])
    if prop_col is not None:
        unique_properties = clustered_df[prop_col].dropna().unique().tolist()
        unique_properties = [prop for prop in unique_properties if prop != "No properties"]
        properties = _sorted_truncated(unique_properties)

    return {
        'prompts': prompts,
        'models': models,
        'properties': properties
    }
1296
+
1297
+ # ---------------------------------------------------------------------------
1298
+ # Example data extraction (restored)
1299
+ # ---------------------------------------------------------------------------
1300
+
1301
def get_example_data(
    clustered_df: pd.DataFrame,
    selected_prompt: str | None = None,
    selected_model: str | None = None,
    selected_property: str | None = None,
    max_examples: int = 5,
    show_unexpected_behavior: bool = False,
    randomize: bool = False,
) -> List[Dict[str, Any]]:
    """Return a list of example rows filtered by prompt / model / property.

    This function was accidentally removed during a refactor; it is required by
    *examples_tab.py* and other parts of the UI.

    Filters are applied in order: unexpected-behavior, prompt, model,
    property; then the result is limited to ``max_examples`` rows.

    Args:
        clustered_df: DataFrame containing the clustered results data
        selected_prompt: Prompt to filter by (None for all)
        selected_model: Model to filter by (None for all)
        selected_property: Property description to filter by (None for all)
        max_examples: Maximum number of examples to return
        show_unexpected_behavior: If True, filter to only show unexpected behavior
        randomize: If True, sample randomly from the filtered set instead of taking the first rows
            (no random seed is set, so results vary between calls)

    Returns:
        List of example dictionaries with extracted data
    """

    if clustered_df.empty:
        return []

    df = clustered_df.copy()

    # Filter by unexpected behavior if requested
    if show_unexpected_behavior:
        if "unexpected_behavior" in df.columns:
            # Assuming True/1 means unexpected behavior
            df = df[df["unexpected_behavior"].isin([True, 1, "True", "true"])]
        else:
            # If no unexpected_behavior column, return empty (or could return all)
            return []

    # Filter by prompt — only the FIRST matching prompt-like column is used.
    # NOTE(review): str.contains treats the filter as a regex; dropdown values
    # truncated with "..." rely on the dots matching arbitrary characters, so
    # do not switch to regex=False without revisiting the dropdown values.
    if selected_prompt:
        prompt_cols = ["prompt", "question", "input", "user_prompt"]
        for col in prompt_cols:
            if col in df.columns:
                df = df[df[col].str.contains(selected_prompt, case=False, na=False)]
                break

    # Filter by model - handle both single model and side-by-side datasets
    if selected_model:
        if "model" in df.columns:
            # Single model datasets
            df = df[df["model"] == selected_model]
        elif "model_a" in df.columns and "model_b" in df.columns:
            # Side-by-side datasets - filter where either model_a or model_b matches
            df = df[(df["model_a"] == selected_model) | (df["model_b"] == selected_model)]

    # Filter by property — same regex caveat as the prompt filter above.
    if selected_property:
        property_cols = ["property_description", "cluster", "fine_cluster_label", "property_description_fine_cluster_label"]
        for col in property_cols:
            if col in df.columns:
                df = df[df[col].str.contains(selected_property, case=False, na=False)]
                break

    # Limit to max_examples (randomized if requested)
    if randomize:
        if len(df) > max_examples:
            df = df.sample(n=max_examples)
        else:
            # Fewer rows than requested: shuffle them all.
            df = df.sample(frac=1)
    else:
        df = df.head(max_examples)

    examples: List[Dict[str, Any]] = []
    for _, row in df.iterrows():
        # First non-null prompt-like value, else "N/A".
        prompt_val = next(
            (row.get(col) for col in ["prompt", "question", "input", "user_prompt"] if row.get(col) is not None),
            "N/A",
        )

        # Check if this is a side-by-side dataset (both responses present
        # and non-null on this row).
        is_side_by_side = ('model_a_response' in row and 'model_b_response' in row and
                          row.get('model_a_response') is not None and row.get('model_b_response') is not None)

        if is_side_by_side:
            # For side-by-side datasets, store both responses separately;
            # the "response" field carries a sentinel understood downstream.
            response_val = "SIDE_BY_SIDE"  # Special marker
            model_val = f"{row.get('model_a', 'Model A')} vs {row.get('model_b', 'Model B')}"
        else:
            # For single response datasets, use the existing logic:
            # first non-null among the known response column names.
            response_val = next(
                (
                    row.get(col)
                    for col in [
                        "model_response",
                        "model_a_response",
                        "model_b_response",
                        "responses",
                        "response",
                        "output",
                    ]
                    if row.get(col) is not None
                ),
                "N/A",
            )
            model_val = row.get("model", "N/A")

        # Try both naming patterns for cluster data (prefixed preferred)
        fine_cluster_id = row.get("property_description_fine_cluster_id", row.get("fine_cluster_id", "N/A"))
        fine_cluster_label = row.get("property_description_fine_cluster_label", row.get("fine_cluster_label", "N/A"))
        coarse_cluster_id = row.get("property_description_coarse_cluster_id", row.get("coarse_cluster_id", "N/A"))
        coarse_cluster_label = row.get("property_description_coarse_cluster_label", row.get("coarse_cluster_label", "N/A"))

        example_dict = {
            "id": row.get("id", "N/A"),
            "model": model_val,
            "prompt": prompt_val,
            "response": response_val,
            "property_description": row.get("property_description", "N/A"),
            "score": row.get("score", "N/A"),
            "fine_cluster_id": fine_cluster_id,
            "fine_cluster_label": fine_cluster_label,
            "coarse_cluster_id": coarse_cluster_id,
            "coarse_cluster_label": coarse_cluster_label,
            "category": row.get("category", "N/A"),
            "type": row.get("type", "N/A"),
            "impact": row.get("impact", "N/A"),
            "reason": row.get("reason", "N/A"),
            "evidence": row.get("evidence", "N/A"),
            "user_preference_direction": row.get("user_preference_direction", "N/A"),
            "raw_response": row.get("raw_response", "N/A"),
            "contains_errors": row.get("contains_errors", "N/A"),
            "unexpected_behavior": row.get("unexpected_behavior", "N/A"),
        }

        # Add side-by-side specific fields if applicable
        if is_side_by_side:
            example_dict.update({
                "is_side_by_side": True,
                "model_a": row.get("model_a", "Model A"),
                "model_b": row.get("model_b", "Model B"),
                "model_a_response": row.get("model_a_response", "N/A"),
                "model_b_response": row.get("model_b_response", "N/A"),
                "winner": row.get("winner", None),
            })
        else:
            example_dict["is_side_by_side"] = False

        examples.append(example_dict)

    return examples
1454
+
1455
+
1456
def format_examples_display(examples: List[Dict[str, Any]],
                            selected_prompt: str = None,
                            selected_model: str = None,
                            selected_property: str = None,
                            use_accordion: bool = True,
                            pretty_print_dicts: bool = True) -> str:
    """Format examples for HTML display with proper conversation rendering.

    Renders each example (as produced by :func:`get_example_data`) as an
    expandable ``<details>`` card; the first card is open by default.
    Side-by-side examples are rendered via the comparison display, single
    responses via the OpenAI-format conversation renderer.

    Args:
        examples: List of example dictionaries
        selected_prompt: Currently selected prompt filter
        selected_model: Currently selected model filter
        selected_property: Currently selected property filter
        use_accordion: If True, group system and info messages in collapsible accordions
        pretty_print_dicts: If True, pretty-print embedded dictionaries

    Returns:
        HTML string for display
    """
    # Imported lazily to avoid circular imports at module load time.
    from .conversation_display import convert_to_openai_format, display_openai_conversation_html
    from .side_by_side_display import display_side_by_side_responses

    if not examples:
        return "<p style='color: #e74c3c; padding: 20px;'>No examples found matching the current filters.</p>"

    # Create filter summary (only filters that differ from the "All ..."
    # dropdown defaults are shown).
    filter_parts = []
    if selected_prompt and selected_prompt != "All Prompts":
        filter_parts.append(f"Prompt: {selected_prompt}")
    if selected_model and selected_model != "All Models":
        filter_parts.append(f"Model: {selected_model}")
    if selected_property and selected_property != "All Clusters":
        filter_parts.append(f"Cluster: {selected_property}")

    filter_summary = ""
    if filter_parts:
        filter_summary = f"""
        <div style="background: #e3f2fd; padding: 15px; border-radius: 8px; margin-bottom: 20px; border-left: 4px solid #2196f3;">
            <strong>πŸ” Active Filters:</strong> {" β€’ ".join(filter_parts)}
        </div>
        """

    html = f"""
    <div style="font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;">
        <h3 style="color: #333; margin-bottom: 15px;">πŸ“‹ Examples ({len(examples)} found)</h3>
        {filter_summary}
    """

    for i, example in enumerate(examples, 1):
        # Check if this is a side-by-side example
        if example.get('is_side_by_side', False):
            # Use side-by-side display for comparison datasets
            conversation_html = display_side_by_side_responses(
                model_a=example['model_a'],
                model_b=example['model_b'],
                model_a_response=example['model_a_response'],
                model_b_response=example['model_b_response'],
                use_accordion=use_accordion,
                pretty_print_dicts=pretty_print_dicts,
                score=example['score'],
                winner=example.get('winner')
            )
        else:
            # Convert response to OpenAI format for proper display (single model)
            response_data = example['response']
            if response_data != 'N/A':
                openai_conversation = convert_to_openai_format(response_data)
                conversation_html = display_openai_conversation_html(openai_conversation, use_accordion=use_accordion, pretty_print_dicts=pretty_print_dicts)
            else:
                conversation_html = "<p style='color: #dc3545; font-style: italic;'>No response data available</p>"

        # Determine cluster info
        # NOTE(review): cluster_info is built here but never interpolated into
        # the card HTML below — appears to be dead code left from a refactor.
        cluster_info = ""
        if example['fine_cluster_label'] != 'N/A':
            cluster_info = f"""
            <div style="margin-top: 10px; font-size: 13px; color: #666;">
                <strong>🏷️ Cluster:</strong> {example['fine_cluster_label']} (ID: {example['fine_cluster_id']})
            </div>
            """

        # Score display for summary (only for non-side-by-side or when not shown in side-by-side)
        score_badge = ""
        if not example.get('is_side_by_side', False) and example['score'] != 'N/A':
            try:
                score_val = float(example['score'])
                score_color = '#28a745' if score_val >= 0 else '#dc3545'
                score_badge = f"""
                <span style="
                    background: {score_color};
                    color: white;
                    padding: 4px 8px;
                    border-radius: 12px;
                    font-size: 12px;
                    font-weight: bold;
                    margin-left: 10px;
                ">
                    Score: {score_val:.3f}
                </span>
                """
            except:
                # Non-numeric score: silently skip the badge.
                pass

        # Create short preview of prompt for summary
        prompt_preview = example['prompt'][:80] + "..." if len(example['prompt']) > 80 else example['prompt']

        # Create expandable example card
        # First example is expanded by default
        open_attr = "open" if i == 1 else ""

        html += f"""
        <details {open_attr} style="border: 1px solid #dee2e6; border-radius: 8px; margin-bottom: 15px; background: white; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
            <summary style="
                padding: 15px;
                cursor: pointer;
                font-weight: 600;
                color: #495057;
                background: linear-gradient(90deg, #f8f9fa 0%, #e9ecef 100%);
                border-radius: 8px 8px 0 0;
                border-bottom: 1px solid #dee2e6;
                display: flex;
                align-items: center;
                justify-content: space-between;
            ">
                <span>
                    <span style="background: #6c757d; color: white; padding: 4px 8px; border-radius: 4px; font-size: 12px; margin-right: 10px;">#{i}</span>
                    {prompt_preview}
                </span>
                <span style="font-size: 12px; color: #6c757d;">
                    {example['model']}{score_badge}
                </span>
            </summary>

            <div style="padding: 20px;">
                <div style="margin-bottom: 15px; padding: 15px; background: #f8f9fa; border-radius: 6px; border-left: 4px solid #17a2b8;">

                    <div style="display: flex; flex-wrap: wrap; gap: 15px; margin-top: 15px; font-size: 13px; color: #666;">
                        <div><strong>Model:</strong> {example['model']}</div>
                        <div><strong>ID:</strong> {example['id']}</div>
                        {f'<div><strong>Category:</strong> {example["category"]}</div>' if example["category"] not in ["N/A", "None"] else ""}
                        {f'<div><strong>Type:</strong> {example["type"]}</div>' if example["type"] not in ["N/A", "None"] else ""}
                        {f'<div><strong>Impact:</strong> {example["impact"]}</div>' if example["impact"] not in ["N/A", "None"] else ""}
                    </div>

                    <div style="margin-top: 10px;">
                        {f'<div style="margin-top: 10px;"><strong>Property:</strong> {example["property_description"]}</div>' if example["property_description"] not in ["N/A", "None"] else ""}
                        {f'<div style="margin-top: 10px;"><strong>Reason:</strong> {example["reason"]}</div>' if example["reason"] not in ["N/A", "None"] else ""}
                        {f'<div style="margin-top: 10px;"><strong>Evidence:</strong> {example["evidence"]}</div>' if example["evidence"] not in ["N/A", "None"] else ""}
                    </div>
                </div>

                <div style="margin-bottom: 15px;">
                    <h5 style="margin: 0 0 8px 0; color: #333; font-size: 14px;">πŸ’¬ {"Response Comparison" if example.get('is_side_by_side', False) else "Conversation"}</h5>
                    <div style="border-radius: 6px; font-size: 13px; line-height: 1.5;">
                        {conversation_html}
                    </div>
                </div>
            </div>
        </details>
        """

    html += "</div>"
    return html
1618
+
1619
+ # ---------------------------------------------------------------------------
1620
+ # Legacy function aliases (backward compatibility)
1621
+ # ---------------------------------------------------------------------------
1622
+
1623
def compute_model_rankings(*args, **kwargs):
    """Legacy alias: forwards all arguments to ``compute_model_rankings_new``.

    Kept for backward compatibility with callers that still import the old name.
    """
    return compute_model_rankings_new(*args, **kwargs)
1626
+
1627
+
1628
def create_model_summary_card(*args, **kwargs):
    """Legacy alias: forwards all arguments to ``create_model_summary_card_new``.

    Kept for backward compatibility with callers that still import the old name.
    """
    return create_model_summary_card_new(*args, **kwargs)
1631
+
1632
+
1633
def get_total_clusters_count(metrics: Dict[str, Any]) -> int:
    """Return how many clusters appear in the metrics data.

    The placeholder "No properties" bucket is excluded from the count;
    a missing ``cluster_scores`` entry yields 0.
    """
    scores = metrics.get("cluster_scores", {})
    # Count every cluster label except the placeholder bucket.
    return sum(1 for name in scores if name != "No properties")
1639
+
1640
+
1641
def get_light_color_for_cluster(cluster_name: str, index: int) -> str:
    """Return the background colour for a cluster card.

    Every cluster currently shares the same very light dusty-blue shade.
    Both arguments are accepted (and ignored) so that per-cluster colouring
    could be introduced later without touching any call sites.
    """
    return "#f0f4f8"  # Very light dusty blue
1647
+
1648
# Public API of this module: the surface relied upon by the Gradio tab
# modules and exported via ``from lmmvibes.vis_gradio.utils import *``.
__all__ = [
    "get_model_clusters",
    "get_all_models",
    "get_all_clusters",
    "format_confidence_interval",
    "get_confidence_interval_width",
    "has_confidence_intervals",
    "extract_quality_score",
    "get_top_clusters_for_model",
    "compute_model_rankings_new",
    "create_model_summary_card_new",
    "format_cluster_dataframe",
    "truncate_cluster_name",
    "create_frequency_comparison_table",
    "create_frequency_comparison_plots",
    "search_clusters_by_text",
    "search_clusters_only",
    "create_interactive_cluster_viewer",
    "get_cluster_statistics",
    "get_unique_values_for_dropdowns",
    "get_example_data",
    "format_examples_display",
    "compute_model_rankings",          # legacy alias
    "create_model_summary_card",       # legacy alias
    "get_total_clusters_count",
]