codelion committed · Commit ccde0a2 · verified · 1 Parent(s): 9970afb

Update app.py

Files changed (1)
  1. app.py +541 -345
app.py CHANGED
@@ -9,7 +9,6 @@ import ast
 import logging
 import numpy as np
 from sklearn.cluster import KMeans
-from sklearn.decomposition import PCA
 from sklearn.manifold import TSNE
 from scipy import stats
 from scipy.stats import entropy
@@ -65,7 +64,7 @@ def ensure_float(value):
     return None

 # Function to process and visualize log probs with multiple analyses
-def visualize_logprobs(json_input, prob_filter=-1e9):  # Changed -float('inf') to -1e9
     try:
         # Parse the input (handles both JSON and Python dictionaries)
         data = parse_input(json_input)
@@ -111,334 +110,503 @@ def visualize_logprobs(json_input, prob_filter=-1e9):
             else:
                 logger.debug("Skipping entry with logprob: %s (type: %s)", entry.get("logprob"), type(entry.get("logprob", None)))

-        # If no valid data after filtering, return error messages
-        if not logprobs:
-            return ("No finite log probabilities to visualize after filtering.", None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)

         # 1. Main Log Probability Plot (with click for tokens)
-        fig_main, ax_main = plt.subplots(figsize=(10, 5))
-        scatter = ax_main.plot(range(len(logprobs)), logprobs, marker="o", linestyle="-", color="b", label="Selected Token")[0]
-        ax_main.set_title("Log Probabilities of Generated Tokens")
-        ax_main.set_xlabel("Token Position")
-        ax_main.set_ylabel("Log Probability")
-        ax_main.grid(True)
-        ax_main.set_xticks([])  # Hide X-axis labels by default
-
-        # Add click functionality to show token
-        token_annotations = []
-        for i, (x, y) in enumerate(zip(range(len(logprobs)), logprobs)):
-            annotation = ax_main.annotate('', (x, y), xytext=(10, 10), textcoords='offset points', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8), visible=False)
-            token_annotations.append(annotation)
-
-        def on_click(event):
-            if event.inaxes == ax_main:
-                for i, (x, y) in enumerate(zip(range(len(logprobs)), logprobs)):
-                    contains, _ = scatter.contains(event)
-                    if contains and abs(event.xdata - x) < 0.5 and abs(event.ydata - y) < 0.5:
-                        token_annotations[i].set_text(tokens[i])
-                        token_annotations[i].set_visible(True)
-                        fig_main.canvas.draw_idle()
-                    else:
-                        token_annotations[i].set_visible(False)
-                        fig_main.canvas.draw_idle()
-
-        fig_main.canvas.mpl_connect('button_press_event', on_click)
-
-        # Save main plot
-        buf_main = io.BytesIO()
-        plt.savefig(buf_main, format="png", bbox_inches="tight", dpi=100)
-        buf_main.seek(0)
-        plt.close(fig_main)
-        img_main_bytes = buf_main.getvalue()
-        img_main_base64 = base64.b64encode(img_main_bytes).decode("utf-8")
-        img_main_html = f'<img src="data:image/png;base64,{img_main_base64}" style="max-width: 100%; height: auto;">'

         # 2. K-Means Clustering of Log Probabilities
-        kmeans = KMeans(n_clusters=3, random_state=42)
-        cluster_labels = kmeans.fit_predict(np.array(logprobs).reshape(-1, 1))
-        fig_cluster, ax_cluster = plt.subplots(figsize=(10, 5))
-        scatter = ax_cluster.scatter(range(len(logprobs)), logprobs, c=cluster_labels, cmap='viridis')
-        ax_cluster.set_title("K-Means Clustering of Log Probabilities")
-        ax_cluster.set_xlabel("Token Position")
-        ax_cluster.set_ylabel("Log Probability")
-        ax_cluster.grid(True)
-        plt.colorbar(scatter, ax=ax_cluster, label="Cluster")
-        buf_cluster = io.BytesIO()
-        plt.savefig(buf_cluster, format="png", bbox_inches="tight", dpi=100)
-        buf_cluster.seek(0)
-        plt.close(fig_cluster)
-        img_cluster_bytes = buf_cluster.getvalue()
-        img_cluster_base64 = base64.b64encode(img_cluster_bytes).decode("utf-8")
-        img_cluster_html = f'<img src="data:image/png;base64,{img_cluster_base64}" style="max-width: 100%; height: auto;">'

         # 3. Probability Drop Analysis
-        drops = [logprobs[i+1] - logprobs[i] if i < len(logprobs)-1 else 0 for i in range(len(logprobs))]
-        fig_drops, ax_drops = plt.subplots(figsize=(10, 5))
-        ax_drops.bar(range(len(drops)), drops, color='red', alpha=0.5)
-        ax_drops.set_title("Significant Probability Drops")
-        ax_drops.set_xlabel("Token Position")
-        ax_drops.set_ylabel("Log Probability Drop")
-        ax_drops.grid(True)
-        buf_drops = io.BytesIO()
-        plt.savefig(buf_drops, format="png", bbox_inches="tight", dpi=100)
-        buf_drops.seek(0)
-        plt.close(fig_drops)
-        img_drops_bytes = buf_drops.getvalue()
-        img_drops_base64 = base64.b64encode(img_drops_bytes).decode("utf-8")
-        img_drops_html = f'<img src="data:image/png;base64,{img_drops_base64}" style="max-width: 100%; height: auto;">'

         # 4. N-Gram Analysis (Bigrams for simplicity)
-        bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens)-1)]
-        bigram_probs = [logprobs[i] + logprobs[i+1] for i in range(len(tokens)-1)]
-        fig_ngram, ax_ngram = plt.subplots(figsize=(10, 5))
-        ax_ngram.bar(range(len(bigrams)), bigram_probs, color='green')
-        ax_ngram.set_title("N-Gram (Bigrams) Probability Sum")
-        ax_ngram.set_xlabel("Bigram Position")
-        ax_ngram.set_ylabel("Sum of Log Probabilities")
-        ax_ngram.set_xticks(range(len(bigrams)))
-        ax_ngram.set_xticklabels([f"{b[0]}->{b[1]}" for b in bigrams], rotation=45, ha="right")
-        ax_ngram.grid(True)
-        buf_ngram = io.BytesIO()
-        plt.savefig(buf_ngram, format="png", bbox_inches="tight", dpi=100)
-        buf_ngram.seek(0)
-        plt.close(fig_ngram)
-        img_ngram_bytes = buf_ngram.getvalue()
-        img_ngram_base64 = base64.b64encode(img_ngram_bytes).decode("utf-8")
-        img_ngram_html = f'<img src="data:image/png;base64,{img_ngram_base64}" style="max-width: 100%; height: auto;">'

         # 5. Markov Chain Modeling (Simple Graph)
-        G = nx.DiGraph()
-        for i in range(len(tokens)-1):
-            G.add_edge(tokens[i], tokens[i+1], weight=logprobs[i+1] - logprobs[i])
-        fig_markov, ax_markov = plt.subplots(figsize=(10, 5))
-        pos = nx.spring_layout(G)
-        nx.draw(G, pos, with_labels=True, node_color='lightblue', node_size=500, edge_color='gray', width=1, ax=ax_markov)
-        ax_markov.set_title("Markov Chain of Token Transitions")
-        buf_markov = io.BytesIO()
-        plt.savefig(buf_markov, format="png", bbox_inches="tight", dpi=100)
-        buf_markov.seek(0)
-        plt.close(fig_markov)
-        img_markov_bytes = buf_markov.getvalue()
-        img_markov_base64 = base64.b64encode(img_markov_bytes).decode("utf-8")
-        img_markov_html = f'<img src="data:image/png;base64,{img_markov_base64}" style="max-width: 100%; height: auto;">'

         # 6. Anomaly Detection (Outlier Detection with Z-Score)
-        z_scores = np.abs(stats.zscore(logprobs))
-        outliers = z_scores > 2  # Threshold for outliers
-        fig_anomaly, ax_anomaly = plt.subplots(figsize=(10, 5))
-        ax_anomaly.plot(range(len(logprobs)), logprobs, marker="o", linestyle="-", color="b")
-        ax_anomaly.plot(np.where(outliers)[0], [logprobs[i] for i in np.where(outliers)[0]], "ro", label="Outliers")
-        ax_anomaly.set_title("Log Probabilities with Outliers")
-        ax_anomaly.set_xlabel("Token Position")
-        ax_anomaly.set_ylabel("Log Probability")
-        ax_anomaly.grid(True)
-        ax_anomaly.legend()
-        ax_anomaly.set_xticks([])  # Hide X-axis labels
-        buf_anomaly = io.BytesIO()
-        plt.savefig(buf_anomaly, format="png", bbox_inches="tight", dpi=100)
-        buf_anomaly.seek(0)
-        plt.close(fig_anomaly)
-        img_anomaly_bytes = buf_anomaly.getvalue()
-        img_anomaly_base64 = base64.b64encode(img_anomaly_bytes).decode("utf-8")
-        img_anomaly_html = f'<img src="data:image/png;base64,{img_anomaly_base64}" style="max-width: 100%; height: auto;">'

         # 7. Autocorrelation
-        autocorr = correlate(logprobs, logprobs, mode='full')
-        autocorr = autocorr[len(autocorr)//2:] / len(logprobs)  # Normalize
-        fig_autocorr, ax_autocorr = plt.subplots(figsize=(10, 5))
-        ax_autocorr.plot(range(len(autocorr)), autocorr, color='purple')
-        ax_autocorr.set_title("Autocorrelation of Log Probabilities")
-        ax_autocorr.set_xlabel("Lag")
-        ax_autocorr.set_ylabel("Autocorrelation")
-        ax_autocorr.grid(True)
-        buf_autocorr = io.BytesIO()
-        plt.savefig(buf_autocorr, format="png", bbox_inches="tight", dpi=100)
-        buf_autocorr.seek(0)
-        plt.close(fig_autocorr)
-        img_autocorr_bytes = buf_autocorr.getvalue()
-        img_autocorr_base64 = base64.b64encode(img_autocorr_bytes).decode("utf-8")
-        img_autocorr_html = f'<img src="data:image/png;base64,{img_autocorr_base64}" style="max-width: 100%; height: auto;">'

         # 8. Smoothing (Moving Average)
-        window_size = 3
-        moving_avg = np.convolve(logprobs, np.ones(window_size)/window_size, mode='valid')
-        fig_smoothing, ax_smoothing = plt.subplots(figsize=(10, 5))
-        ax_smoothing.plot(range(len(logprobs)), logprobs, marker="o", linestyle="-", color="b", label="Original")
-        ax_smoothing.plot(range(window_size-1, len(logprobs)), moving_avg, color="orange", label="Moving Average")
-        ax_smoothing.set_title("Log Probabilities with Moving Average")
-        ax_smoothing.set_xlabel("Token Position")
-        ax_smoothing.set_ylabel("Log Probability")
-        ax_smoothing.grid(True)
-        ax_smoothing.legend()
-        ax_smoothing.set_xticks([])  # Hide X-axis labels
-        buf_smoothing = io.BytesIO()
-        plt.savefig(buf_smoothing, format="png", bbox_inches="tight", dpi=100)
-        buf_smoothing.seek(0)
-        plt.close(fig_smoothing)
-        img_smoothing_bytes = buf_smoothing.getvalue()
-        img_smoothing_base64 = base64.b64encode(img_smoothing_bytes).decode("utf-8")
-        img_smoothing_html = f'<img src="data:image/png;base64,{img_smoothing_base64}" style="max-width: 100%; height: auto;">'

         # 9. Uncertainty Propagation (Variance of Top Logprobs)
-        variances = []
-        for probs in top_alternatives:
-            if len(probs) > 1:
-                values = [p[1] for p in probs]
-                variances.append(np.var(values))
-            else:
-                variances.append(0)
-        fig_uncertainty, ax_uncertainty = plt.subplots(figsize=(10, 5))
-        ax_uncertainty.plot(range(len(logprobs)), logprobs, marker="o", linestyle="-", color="b", label="Log Prob")
-        ax_uncertainty.fill_between(range(len(logprobs)), [lp - v for lp, v in zip(logprobs, variances)],
-                                    [lp + v for lp, v in zip(logprobs, variances)], color='gray', alpha=0.3, label="Uncertainty")
-        ax_uncertainty.set_title("Log Probabilities with Uncertainty Propagation")
-        ax_uncertainty.set_xlabel("Token Position")
-        ax_uncertainty.set_ylabel("Log Probability")
-        ax_uncertainty.grid(True)
-        ax_uncertainty.legend()
-        ax_uncertainty.set_xticks([])  # Hide X-axis labels
-        buf_uncertainty = io.BytesIO()
-        plt.savefig(buf_uncertainty, format="png", bbox_inches="tight", dpi=100)
-        buf_uncertainty.seek(0)
-        plt.close(fig_uncertainty)
-        img_uncertainty_bytes = buf_uncertainty.getvalue()
-        img_uncertainty_base64 = base64.b64encode(img_uncertainty_bytes).decode("utf-8")
-        img_uncertainty_html = f'<img src="data:image/png;base64,{img_uncertainty_base64}" style="max-width: 100%; height: auto;">'

         # 10. Correlation Heatmap
-        corr_matrix = np.corrcoef(logprobs, rowvar=False)
-        fig_corr, ax_corr = plt.subplots(figsize=(10, 5))
-        im = ax_corr.imshow(corr_matrix, cmap='coolwarm', interpolation='nearest')
-        ax_corr.set_title("Correlation of Log Probabilities Across Positions")
-        ax_corr.set_xlabel("Token Position")
-        ax_corr.set_ylabel("Token Position")
-        plt.colorbar(im, ax=ax_corr, label="Correlation")
-        buf_corr = io.BytesIO()
-        plt.savefig(buf_corr, format="png", bbox_inches="tight", dpi=100)
-        buf_corr.seek(0)
-        plt.close(fig_corr)
-        img_corr_bytes = buf_corr.getvalue()
-        img_corr_base64 = base64.b64encode(img_corr_bytes).decode("utf-8")
-        img_corr_html = f'<img src="data:image/png;base64,{img_corr_base64}" style="max-width: 100%; height: auto;">'

         # 11. Token Type Correlation
-        type_probs = {t: [] for t in set(token_types)}
-        for t, p in zip(token_types, logprobs):
-            type_probs[t].append(p)
-        fig_type, ax_type = plt.subplots(figsize=(10, 5))
-        for t in type_probs:
-            ax_type.bar(t, np.mean(type_probs[t]), yerr=np.std(type_probs[t]), capsize=5, label=t)
-        ax_type.set_title("Average Log Probability by Token Type")
-        ax_type.set_xlabel("Token Type")
-        ax_type.set_ylabel("Average Log Probability")
-        ax_type.grid(True)
-        ax_type.legend()
-        buf_type = io.BytesIO()
-        plt.savefig(buf_type, format="png", bbox_inches="tight", dpi=100)
-        buf_type.seek(0)
-        plt.close(fig_type)
-        img_type_bytes = buf_type.getvalue()
-        img_type_base64 = base64.b64encode(img_type_bytes).decode("utf-8")
-        img_type_html = f'<img src="data:image/png;base64,{img_type_base64}" style="max-width: 100%; height: auto;">'

         # 12. Token Embedding Similarity vs. Probability (Simulated)
-        # Simulate embedding distances (e.g., cosine similarity) as random values for demonstration
-        simulated_embeddings = np.random.rand(len(tokens), 2)  # 2D embeddings
-        fig_embed, ax_embed = plt.subplots(figsize=(10, 5))
-        ax_embed.scatter(simulated_embeddings[:, 0], simulated_embeddings[:, 1], c=logprobs, cmap='viridis')
-        ax_embed.set_title("Token Embedding Similarity vs. Log Probability")
-        ax_embed.set_xlabel("Embedding Dimension 1")
-        ax_embed.set_ylabel("Embedding Dimension 2")
-        plt.colorbar(ax_embed.collections[0], ax=ax_embed, label="Log Probability")
-        buf_embed = io.BytesIO()
-        plt.savefig(buf_embed, format="png", bbox_inches="tight", dpi=100)
-        buf_embed.seek(0)
-        plt.close(fig_embed)
-        img_embed_bytes = buf_embed.getvalue()
-        img_embed_base64 = base64.b64encode(img_embed_bytes).decode("utf-8")
-        img_embed_html = f'<img src="data:image/png;base64,{img_embed_base64}" style="max-width: 100%; height: auto;">'

         # 13. Bayesian Inference (Simplified as Inferred Probabilities)
-        # Simulate inferred probabilities based on top_logprobs entropy
-        entropies = [entropy([p[1] for p in probs], base=2) for probs in top_alternatives if len(probs) > 1]
-        fig_bayesian, ax_bayesian = plt.subplots(figsize=(10, 5))
-        ax_bayesian.bar(range(len(entropies)), entropies, color='orange')
-        ax_bayesian.set_title("Bayesian Inferred Uncertainty (Entropy)")
-        ax_bayesian.set_xlabel("Token Position")
-        ax_bayesian.set_ylabel("Entropy")
-        ax_bayesian.grid(True)
-        buf_bayesian = io.BytesIO()
-        plt.savefig(buf_bayesian, format="png", bbox_inches="tight", dpi=100)
-        buf_bayesian.seek(0)
-        plt.close(fig_bayesian)
-        img_bayesian_bytes = buf_bayesian.getvalue()
-        img_bayesian_base64 = base64.b64encode(img_bayesian_bytes).decode("utf-8")
-        img_bayesian_html = f'<img src="data:image/png;base64,{img_bayesian_base64}" style="max-width: 100%; height: auto;">'

         # 14. Graph-Based Analysis
-        G = nx.DiGraph()
-        for i in range(len(tokens)-1):
-            G.add_edge(tokens[i], tokens[i+1], weight=logprobs[i+1] - logprobs[i])
-        fig_graph, ax_graph = plt.subplots(figsize=(10, 5))
-        pos = nx.spring_layout(G)
-        nx.draw(G, pos, with_labels=True, node_color='lightblue', node_size=500, edge_color='gray', width=1, ax=ax_graph)
-        ax_graph.set_title("Graph of Token Transitions")
-        buf_graph = io.BytesIO()
-        plt.savefig(buf_graph, format="png", bbox_inches="tight", dpi=100)
-        buf_graph.seek(0)
-        plt.close(fig_graph)
-        img_graph_bytes = buf_graph.getvalue()
-        img_graph_base64 = base64.b64encode(img_graph_bytes).decode("utf-8")
-        img_graph_html = f'<img src="data:image/png;base64,{img_graph_base64}" style="max-width: 100%; height: auto;">'

         # 15. Dimensionality Reduction (t-SNE)
-        features = np.array([logprobs + [p[1] for p in alts[:2]] for logprobs, alts in zip([logprobs], top_alternatives)])
-        tsne = TSNE(n_components=2, random_state=42)
-        tsne_result = tsne.fit_transform(features.T)
-        fig_tsne, ax_tsne = plt.subplots(figsize=(10, 5))
-        scatter = ax_tsne.scatter(tsne_result[:, 0], tsne_result[:, 1], c=logprobs, cmap='viridis')
-        ax_tsne.set_title("t-SNE of Log Probabilities and Top Alternatives")
-        ax_tsne.set_xlabel("t-SNE Dimension 1")
-        ax_tsne.set_ylabel("t-SNE Dimension 2")
-        plt.colorbar(scatter, ax=ax_tsne, label="Log Probability")
-        buf_tsne = io.BytesIO()
-        plt.savefig(buf_tsne, format="png", bbox_inches="tight", dpi=100)
-        buf_tsne.seek(0)
-        plt.close(fig_tsne)
-        img_tsne_bytes = buf_tsne.getvalue()
-        img_tsne_base64 = base64.b64encode(img_tsne_bytes).decode("utf-8")
-        img_tsne_html = f'<img src="data:image/png;base64,{img_tsne_base64}" style="max-width: 100%; height: auto;">'

         # 16. Interactive Heatmap
-        fig_heatmap, ax_heatmap = plt.subplots(figsize=(10, 5))
-        im = ax_heatmap.imshow([logprobs], cmap='viridis', aspect='auto')
-        ax_heatmap.set_title("Interactive Heatmap of Log Probabilities")
-        ax_heatmap.set_xlabel("Token Position")
-        ax_heatmap.set_ylabel("Probability Level")
-        plt.colorbar(im, ax=ax_heatmap, label="Log Probability")
-        buf_heatmap = io.BytesIO()
-        plt.savefig(buf_heatmap, format="png", bbox_inches="tight", dpi=100)
-        buf_heatmap.seek(0)
-        plt.close(fig_heatmap)
-        img_heatmap_bytes = buf_heatmap.getvalue()
-        img_heatmap_base64 = base64.b64encode(img_heatmap_bytes).decode("utf-8")
-        img_heatmap_html = f'<img src="data:image/png;base64,{img_heatmap_base64}" style="max-width: 100%; height: auto;">'

         # 17. Probability Distribution Plots (Box Plots for Top Logprobs)
-        all_top_probs = [p[1] for alts in top_alternatives for p in alts]
-        fig_dist, ax_dist = plt.subplots(figsize=(10, 5))
-        ax_dist.boxplot([logprobs] + [p[1] for alts in top_alternatives for p in alts[:2]], labels=["Selected"] + ["Alt1", "Alt2"])
-        ax_dist.set_title("Probability Distribution of Top Tokens")
-        ax_dist.set_xlabel("Token Type")
-        ax_dist.set_ylabel("Log Probability")
-        ax_dist.grid(True)
-        buf_dist = io.BytesIO()
-        plt.savefig(buf_dist, format="png", bbox_inches="tight", dpi=100)
-        buf_dist.seek(0)
-        plt.close(fig_dist)
-        img_dist_bytes = buf_dist.getvalue()
-        img_dist_base64 = base64.b64encode(img_dist_bytes).decode("utf-8")
-        img_dist_html = f'<img src="data:image/png;base64,{img_dist_base64}" style="max-width: 100%; height: auto;">'
         # Create DataFrame for the table
         table_data = []
@@ -512,16 +680,36 @@ def visualize_logprobs(json_input, prob_filter=-1e9):
             alt_viz_html += "</li>"
         alt_viz_html += "</ul>"

-        return (img_main_html, df, colored_text_html, alt_viz_html, img_cluster_html, img_drops_html,
-                img_ngram_html, img_markov_html, img_anomaly_html, img_autocorr_html, img_smoothing_html,
-                img_uncertainty_html, img_corr_html, img_type_html, img_embed_html, img_bayesian_html,
-                img_graph_html, img_tsne_html, img_heatmap_html, img_dist_html)

     except Exception as e:
         logger.error("Visualization failed: %s", str(e))
-        return (f"Error: {str(e)}", None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)

-# Gradio interface with dynamic filtering
 with gr.Blocks(title="Log Probability Visualizer") as app:
     gr.Markdown("# Log Probability Visualizer")
     gr.Markdown(
@@ -529,49 +717,57 @@ with gr.Blocks(title="Log Probability Visualizer") as app:
     )

     with gr.Row():
-        json_input = gr.Textbox(
-            label="JSON Input",
-            lines=10,
-            placeholder="Paste your JSON (e.g., {\"content\": [...]}) or Python dict (e.g., {'content': [...]}) here...",
-        )
-        prob_filter = gr.Slider(minimum=-1e9, maximum=0, value=-1e9, label="Log Probability Filter (≥)")  # Changed -float('inf') to -1e9
-
-    with gr.Row():
-        plot_output = gr.HTML(label="Log Probability Plot (Click for Tokens)")
-        cluster_output = gr.HTML(label="K-Means Clustering")
-        drops_output = gr.HTML(label="Probability Drops")
-
-    with gr.Row():
-        ngram_output = gr.HTML(label="N-Gram Analysis")
-        markov_output = gr.HTML(label="Markov Chain")
-
-    with gr.Row():
-        anomaly_output = gr.HTML(label="Anomaly Detection")
-        autocorr_output = gr.HTML(label="Autocorrelation")
-
-    with gr.Row():
-        smoothing_output = gr.HTML(label="Smoothing (Moving Average)")
-        uncertainty_output = gr.HTML(label="Uncertainty Propagation")
-
-    with gr.Row():
-        corr_output = gr.HTML(label="Correlation Heatmap")
-        type_output = gr.HTML(label="Token Type Correlation")
-
-    with gr.Row():
-        embed_output = gr.HTML(label="Embedding Similarity vs. Probability")
-        bayesian_output = gr.HTML(label="Bayesian Inference (Entropy)")
-
-    with gr.Row():
-        graph_output = gr.HTML(label="Graph of Token Transitions")
-        tsne_output = gr.HTML(label="t-SNE of Log Probabilities")
-
-    with gr.Row():
-        heatmap_output = gr.HTML(label="Interactive Heatmap")
-        dist_output = gr.HTML(label="Probability Distribution")
-
-    table_output = gr.Dataframe(label="Token Log Probabilities and Top Alternatives")
-    text_output = gr.HTML(label="Colored Text (Confidence Visualization)")
-    alt_viz_output = gr.HTML(label="Top 3 Token Log Probabilities")

     btn = gr.Button("Visualize")
     btn.click(
 
 import logging
 import numpy as np
 from sklearn.cluster import KMeans
 from sklearn.manifold import TSNE
 from scipy import stats
 from scipy.stats import entropy
     return None

 # Function to process and visualize log probs with multiple analyses
+def visualize_logprobs(json_input, prob_filter=-1e9):
     try:
         # Parse the input (handles both JSON and Python dictionaries)
         data = parse_input(json_input)
             else:
                 logger.debug("Skipping entry with logprob: %s (type: %s)", entry.get("logprob"), type(entry.get("logprob", None)))

+        # Check if there's valid data after filtering
+        if not logprobs or not tokens:
+            return ("No finite log probabilities or tokens to visualize after filtering.", None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
         # 1. Main Log Probability Plot (with click for tokens)
+        def create_main_plot():
+            fig_main, ax_main = plt.subplots(figsize=(10, 5))
+            if not logprobs or not tokens:
+                raise ValueError("No data for main plot")
+            scatter = ax_main.plot(range(len(logprobs)), logprobs, marker="o", linestyle="-", color="b", label="Selected Token")[0]
+            ax_main.set_title("Log Probabilities of Generated Tokens")
+            ax_main.set_xlabel("Token Position")
+            ax_main.set_ylabel("Log Probability")
+            ax_main.grid(True)
+            ax_main.set_xticks([])  # Hide X-axis labels by default
+
+            # Add click functionality to show token
+            token_annotations = []
+            for i, (x, y) in enumerate(zip(range(len(logprobs)), logprobs)):
+                annotation = ax_main.annotate('', (x, y), xytext=(10, 10), textcoords='offset points', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8), visible=False)
+                token_annotations.append(annotation)
+
+            def on_click(event):
+                if event.inaxes == ax_main:
+                    for i, (x, y) in enumerate(zip(range(len(logprobs)), logprobs)):
+                        contains, _ = scatter.contains(event)
+                        if contains and abs(event.xdata - x) < 0.5 and abs(event.ydata - y) < 0.5:
+                            token_annotations[i].set_text(tokens[i])
+                            token_annotations[i].set_visible(True)
+                            fig_main.canvas.draw_idle()
+                        else:
+                            token_annotations[i].set_visible(False)
+                            fig_main.canvas.draw_idle()
+
+            fig_main.canvas.mpl_connect('button_press_event', on_click)
+
+            buf_main = io.BytesIO()
+            plt.savefig(buf_main, format="png", bbox_inches="tight", dpi=100)
+            buf_main.seek(0)
+            plt.close(fig_main)
+            return buf_main
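Each create_*_plot helper returns a BytesIO buffer holding a rendered PNG; the caller later base64-encodes it into an inline <img> tag. That encode-and-embed step can be reproduced standalone (a minimal sketch; the figure content is illustrative):

import base64
import io

import matplotlib
matplotlib.use("Agg")  # headless backend, as in a server app
import matplotlib.pyplot as plt

def fig_to_html(fig):
    """Render a Matplotlib figure to an inline base64 <img> tag."""
    buf = io.BytesIO()
    fig.savefig(buf, format="png", bbox_inches="tight", dpi=100)
    plt.close(fig)
    data = base64.b64encode(buf.getvalue()).decode("utf-8")
    return f'<img src="data:image/png;base64,{data}" style="max-width: 100%; height: auto;">'

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot([0, 1, 2], [-0.1, -2.3, -0.4], marker="o")
html = fig_to_html(fig)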
         # 2. K-Means Clustering of Log Probabilities
+        def create_cluster_plot():
+            if not logprobs:
+                raise ValueError("No data for clustering plot")
+            kmeans = KMeans(n_clusters=3, random_state=42)
+            cluster_labels = kmeans.fit_predict(np.array(logprobs).reshape(-1, 1))
+            fig_cluster, ax_cluster = plt.subplots(figsize=(10, 5))
+            scatter = ax_cluster.scatter(range(len(logprobs)), logprobs, c=cluster_labels, cmap='viridis')
+            ax_cluster.set_title("K-Means Clustering of Log Probabilities")
+            ax_cluster.set_xlabel("Token Position")
+            ax_cluster.set_ylabel("Log Probability")
+            ax_cluster.grid(True)
+            plt.colorbar(scatter, ax=ax_cluster, label="Cluster")
+            buf_cluster = io.BytesIO()
+            plt.savefig(buf_cluster, format="png", bbox_inches="tight", dpi=100)
+            buf_cluster.seek(0)
+            plt.close(fig_cluster)
+            return buf_cluster
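K-Means here groups scalar log probabilities, so each value becomes a one-feature sample via reshape(-1, 1). A standalone sketch with illustrative values (n_init is set explicitly only to avoid version-dependent warnings):

import numpy as np
from sklearn.cluster import KMeans

logprobs = [-0.1, -0.2, -3.5, -0.3, -4.1, -0.15]  # illustrative values
X = np.array(logprobs).reshape(-1, 1)             # (n_samples, 1), as scikit-learn expects
labels = KMeans(n_clusters=3, random_state=42, n_init=10).fit_predict(X)
print(labels)                                     # cluster index per token position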
         # 3. Probability Drop Analysis
+        def create_drops_plot():
+            if not logprobs or len(logprobs) < 2:
+                raise ValueError("Insufficient data for probability drops")
+            drops = [logprobs[i+1] - logprobs[i] if i < len(logprobs)-1 else 0 for i in range(len(logprobs))]
+            fig_drops, ax_drops = plt.subplots(figsize=(10, 5))
+            ax_drops.bar(range(len(drops)), drops, color='red', alpha=0.5)
+            ax_drops.set_title("Significant Probability Drops")
+            ax_drops.set_xlabel("Token Position")
+            ax_drops.set_ylabel("Log Probability Drop")
+            ax_drops.grid(True)
+            buf_drops = io.BytesIO()
+            plt.savefig(buf_drops, format="png", bbox_inches="tight", dpi=100)
+            buf_drops.seek(0)
+            plt.close(fig_drops)
+            return buf_drops
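The drops list is the first difference of the sequence with a trailing zero pad; np.diff expresses the same computation directly (a sketch, not the committed code):

import numpy as np

logprobs = np.array([-0.1, -0.2, -3.5, -0.3])  # illustrative values
drops = np.append(np.diff(logprobs), 0.0)      # logprobs[i+1] - logprobs[i], padded to length n
print(drops)                                   # large negative entries mark confidence drops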
         # 4. N-Gram Analysis (Bigrams for simplicity)
+        def create_ngram_plot():
+            if not logprobs or len(logprobs) < 2:
+                raise ValueError("Insufficient data for N-gram analysis")
+            bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens)-1)]
+            bigram_probs = [logprobs[i] + logprobs[i+1] for i in range(len(tokens)-1)]
+            fig_ngram, ax_ngram = plt.subplots(figsize=(10, 5))
+            ax_ngram.bar(range(len(bigrams)), bigram_probs, color='green')
+            ax_ngram.set_title("N-Gram (Bigrams) Probability Sum")
+            ax_ngram.set_xlabel("Bigram Position")
+            ax_ngram.set_ylabel("Sum of Log Probabilities")
+            ax_ngram.set_xticks(range(len(bigrams)))
+            ax_ngram.set_xticklabels([f"{b[0]}->{b[1]}" for b in bigrams], rotation=45, ha="right")
+            ax_ngram.grid(True)
+            buf_ngram = io.BytesIO()
+            plt.savefig(buf_ngram, format="png", bbox_inches="tight", dpi=100)
+            buf_ngram.seek(0)
+            plt.close(fig_ngram)
+            return buf_ngram
         # 5. Markov Chain Modeling (Simple Graph)
+        def create_markov_plot():
+            if not tokens or len(tokens) < 2:
+                raise ValueError("Insufficient data for Markov chain")
+            G = nx.DiGraph()
+            for i in range(len(tokens)-1):
+                G.add_edge(tokens[i], tokens[i+1], weight=logprobs[i+1] - logprobs[i])
+            fig_markov, ax_markov = plt.subplots(figsize=(10, 5))
+            pos = nx.spring_layout(G)
+            nx.draw(G, pos, with_labels=True, node_color='lightblue', node_size=500, edge_color='gray', width=1, ax=ax_markov)
+            ax_markov.set_title("Markov Chain of Token Transitions")
+            buf_markov = io.BytesIO()
+            plt.savefig(buf_markov, format="png", bbox_inches="tight", dpi=100)
+            buf_markov.seek(0)
+            plt.close(fig_markov)
+            return buf_markov
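The graph above weights each edge with a log-probability difference. A classical Markov chain would instead estimate transition probabilities from bigram counts; a standalone sketch of that variant (token list is illustrative):

from collections import Counter, defaultdict

tokens = ["the", "cat", "sat", "on", "the", "mat"]  # illustrative
counts = Counter(zip(tokens, tokens[1:]))           # bigram transition counts

totals = defaultdict(int)
for (src, _), c in counts.items():
    totals[src] += c

# P(dst | src): counts normalized per source token
transition_probs = {(src, dst): c / totals[src] for (src, dst), c in counts.items()}
print(transition_probs[("the", "cat")])  # 0.5, since "the" precedes both "cat" and "mat"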
         # 6. Anomaly Detection (Outlier Detection with Z-Score)
+        def create_anomaly_plot():
+            if not logprobs:
+                raise ValueError("No data for anomaly detection")
+            z_scores = np.abs(stats.zscore(logprobs))
+            outliers = z_scores > 2  # Threshold for outliers
+            fig_anomaly, ax_anomaly = plt.subplots(figsize=(10, 5))
+            ax_anomaly.plot(range(len(logprobs)), logprobs, marker="o", linestyle="-", color="b")
+            ax_anomaly.plot(np.where(outliers)[0], [logprobs[i] for i in np.where(outliers)[0]], "ro", label="Outliers")
+            ax_anomaly.set_title("Log Probabilities with Outliers")
+            ax_anomaly.set_xlabel("Token Position")
+            ax_anomaly.set_ylabel("Log Probability")
+            ax_anomaly.grid(True)
+            ax_anomaly.legend()
+            ax_anomaly.set_xticks([])  # Hide X-axis labels
+            buf_anomaly = io.BytesIO()
+            plt.savefig(buf_anomaly, format="png", bbox_inches="tight", dpi=100)
+            buf_anomaly.seek(0)
+            plt.close(fig_anomaly)
+            return buf_anomaly
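The outlier test flags positions whose absolute z-score exceeds 2, i.e. values more than two standard deviations from the mean of the sequence. Standalone, with illustrative values:

import numpy as np
from scipy import stats

logprobs = np.array([-0.1, -0.2, -0.15, -4.0, -0.12, -0.18])  # illustrative
z_scores = np.abs(stats.zscore(logprobs))
outlier_positions = np.where(z_scores > 2)[0]  # same threshold as the code above
print(outlier_positions)                       # -> [3]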
         # 7. Autocorrelation
+        def create_autocorr_plot():
+            if not logprobs:
+                raise ValueError("No data for autocorrelation")
+            autocorr = correlate(logprobs, logprobs, mode='full')
+            autocorr = autocorr[len(autocorr)//2:] / len(logprobs)  # Normalize
+            fig_autocorr, ax_autocorr = plt.subplots(figsize=(10, 5))
+            ax_autocorr.plot(range(len(autocorr)), autocorr, color='purple')
+            ax_autocorr.set_title("Autocorrelation of Log Probabilities")
+            ax_autocorr.set_xlabel("Lag")
+            ax_autocorr.set_ylabel("Autocorrelation")
+            ax_autocorr.grid(True)
+            buf_autocorr = io.BytesIO()
+            plt.savefig(buf_autocorr, format="png", bbox_inches="tight", dpi=100)
+            buf_autocorr.seek(0)
+            plt.close(fig_autocorr)
+            return buf_autocorr
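correlate(x, x, mode='full') returns lags -(n-1) through n-1, so the slice keeps only the non-negative lags; the code then divides by n. A numpy-based sketch that instead normalizes to exactly 1 at lag zero:

import numpy as np

logprobs = np.array([-0.1, -0.5, -0.2, -0.6, -0.3])   # illustrative
full = np.correlate(logprobs, logprobs, mode="full")  # lags -(n-1) .. n-1
acf = full[len(full) // 2:]                           # keep lags 0 .. n-1
acf = acf / acf[0]                                    # lag 0 becomes 1
print(acf)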
         # 8. Smoothing (Moving Average)
+        def create_smoothing_plot():
+            if not logprobs:
+                raise ValueError("No data for smoothing")
+            window_size = 3
+            moving_avg = np.convolve(logprobs, np.ones(window_size)/window_size, mode='valid')
+            fig_smoothing, ax_smoothing = plt.subplots(figsize=(10, 5))
+            ax_smoothing.plot(range(len(logprobs)), logprobs, marker="o", linestyle="-", color="b", label="Original")
+            ax_smoothing.plot(range(window_size-1, len(logprobs)), moving_avg, color="orange", label="Moving Average")
+            ax_smoothing.set_title("Log Probabilities with Moving Average")
+            ax_smoothing.set_xlabel("Token Position")
+            ax_smoothing.set_ylabel("Log Probability")
+            ax_smoothing.grid(True)
+            ax_smoothing.legend()
+            ax_smoothing.set_xticks([])  # Hide X-axis labels
+            buf_smoothing = io.BytesIO()
+            plt.savefig(buf_smoothing, format="png", bbox_inches="tight", dpi=100)
+            buf_smoothing.seek(0)
+            plt.close(fig_smoothing)
+            return buf_smoothing
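With mode='valid', np.convolve returns n - window_size + 1 points, which is why the moving average is plotted starting at x = window_size - 1. Standalone:

import numpy as np

logprobs = np.array([-0.1, -0.5, -0.2, -0.6, -0.3, -0.4])  # illustrative
window_size = 3
kernel = np.ones(window_size) / window_size
moving_avg = np.convolve(logprobs, kernel, mode="valid")   # length n - window_size + 1
print(len(logprobs), len(moving_avg))                      # 6 4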
         # 9. Uncertainty Propagation (Variance of Top Logprobs)
+        def create_uncertainty_plot():
+            if not logprobs or not top_alternatives:
+                raise ValueError("No data for uncertainty propagation")
+            variances = []
+            for probs in top_alternatives:
+                if len(probs) > 1:
+                    values = [p[1] for p in probs]
+                    variances.append(np.var(values))
+                else:
+                    variances.append(0)
+            fig_uncertainty, ax_uncertainty = plt.subplots(figsize=(10, 5))
+            ax_uncertainty.plot(range(len(logprobs)), logprobs, marker="o", linestyle="-", color="b", label="Log Prob")
+            ax_uncertainty.fill_between(range(len(logprobs)), [lp - v for lp, v in zip(logprobs, variances)],
+                                        [lp + v for lp, v in zip(logprobs, variances)], color='gray', alpha=0.3, label="Uncertainty")
+            ax_uncertainty.set_title("Log Probabilities with Uncertainty Propagation")
+            ax_uncertainty.set_xlabel("Token Position")
+            ax_uncertainty.set_ylabel("Log Probability")
+            ax_uncertainty.grid(True)
+            ax_uncertainty.legend()
+            ax_uncertainty.set_xticks([])  # Hide X-axis labels
+            buf_uncertainty = io.BytesIO()
+            plt.savefig(buf_uncertainty, format="png", bbox_inches="tight", dpi=100)
+            buf_uncertainty.seek(0)
+            plt.close(fig_uncertainty)
+            return buf_uncertainty
         # 10. Correlation Heatmap
+        def create_corr_plot():
+            if not logprobs or len(logprobs) < 2:
+                raise ValueError("Insufficient data for correlation heatmap")
+            corr_matrix = np.corrcoef(logprobs, rowvar=False)
+            fig_corr, ax_corr = plt.subplots(figsize=(10, 5))
+            im = ax_corr.imshow(corr_matrix, cmap='coolwarm', interpolation='nearest')
+            ax_corr.set_title("Correlation of Log Probabilities Across Positions")
+            ax_corr.set_xlabel("Token Position")
+            ax_corr.set_ylabel("Token Position")
+            plt.colorbar(im, ax=ax_corr, label="Correlation")
+            buf_corr = io.BytesIO()
+            plt.savefig(buf_corr, format="png", bbox_inches="tight", dpi=100)
+            buf_corr.seek(0)
+            plt.close(fig_corr)
+            return buf_corr
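np.corrcoef expects a 2-D array of variables; applied to a single 1-D series it degenerates to the trivial self-correlation. One way to get a genuine matrix out of one sequence is to correlate windowed segments, sketched below (an illustration of the idea, not the committed behavior):

import numpy as np

rng = np.random.default_rng(0)
logprobs = rng.normal(-1.0, 0.5, size=100)  # illustrative series

# Stack non-overlapping windows as rows: each row acts as one variable
window = 10
segments = np.array([logprobs[i:i + window] for i in range(0, len(logprobs) - window + 1, window)])
corr_matrix = np.corrcoef(segments)         # (n_segments, n_segments)
print(corr_matrix.shape)                    # (10, 10)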
         # 11. Token Type Correlation
+        def create_type_plot():
+            if not logprobs or not token_types:
+                raise ValueError("No data for token type correlation")
+            type_probs = {t: [] for t in set(token_types)}
+            for t, p in zip(token_types, logprobs):
+                type_probs[t].append(p)
+            fig_type, ax_type = plt.subplots(figsize=(10, 5))
+            for t in type_probs:
+                ax_type.bar(t, np.mean(type_probs[t]), yerr=np.std(type_probs[t]), capsize=5, label=t)
+            ax_type.set_title("Average Log Probability by Token Type")
+            ax_type.set_xlabel("Token Type")
+            ax_type.set_ylabel("Average Log Probability")
+            ax_type.grid(True)
+            ax_type.legend()
+            buf_type = io.BytesIO()
+            plt.savefig(buf_type, format="png", bbox_inches="tight", dpi=100)
+            buf_type.seek(0)
+            plt.close(fig_type)
+            return buf_type
         # 12. Token Embedding Similarity vs. Probability (Simulated)
+        def create_embed_plot():
+            if not logprobs or not tokens:
+                raise ValueError("No data for embedding similarity")
+            simulated_embeddings = np.random.rand(len(tokens), 2)  # 2D embeddings
+            fig_embed, ax_embed = plt.subplots(figsize=(10, 5))
+            ax_embed.scatter(simulated_embeddings[:, 0], simulated_embeddings[:, 1], c=logprobs, cmap='viridis')
+            ax_embed.set_title("Token Embedding Similarity vs. Log Probability")
+            ax_embed.set_xlabel("Embedding Dimension 1")
+            ax_embed.set_ylabel("Embedding Dimension 2")
+            plt.colorbar(ax_embed.collections[0], ax=ax_embed, label="Log Probability")
+            buf_embed = io.BytesIO()
+            plt.savefig(buf_embed, format="png", bbox_inches="tight", dpi=100)
+            buf_embed.seek(0)
+            plt.close(fig_embed)
+            return buf_embed
         # 13. Bayesian Inference (Simplified as Inferred Probabilities)
+        def create_bayesian_plot():
+            if not top_alternatives:
+                raise ValueError("No data for Bayesian inference")
+            entropies = [entropy([p[1] for p in probs], base=2) for probs in top_alternatives if len(probs) > 1]
+            fig_bayesian, ax_bayesian = plt.subplots(figsize=(10, 5))
+            ax_bayesian.bar(range(len(entropies)), entropies, color='orange')
+            ax_bayesian.set_title("Bayesian Inferred Uncertainty (Entropy)")
+            ax_bayesian.set_xlabel("Token Position")
+            ax_bayesian.set_ylabel("Entropy")
+            ax_bayesian.grid(True)
+            buf_bayesian = io.BytesIO()
+            plt.savefig(buf_bayesian, format="png", bbox_inches="tight", dpi=100)
+            buf_bayesian.seek(0)
+            plt.close(fig_bayesian)
+            return buf_bayesian
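scipy.stats.entropy normalizes its input to a distribution, so feeding raw log probabilities (negative values) is fragile; exponentiating first yields a well-formed top-k distribution. A sketch with illustrative top-3 values:

import numpy as np
from scipy.stats import entropy

top_logprobs = np.array([-0.2, -2.1, -3.0])  # illustrative top-3 log probabilities
probs = np.exp(top_logprobs)                 # back to probabilities
probs = probs / probs.sum()                  # renormalize over the top-k slice
print(entropy(probs, base=2))                # uncertainty in bits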
         # 14. Graph-Based Analysis
+        def create_graph_plot():
+            if not tokens or len(tokens) < 2:
+                raise ValueError("Insufficient data for graph analysis")
+            G = nx.DiGraph()
+            for i in range(len(tokens)-1):
+                G.add_edge(tokens[i], tokens[i+1], weight=logprobs[i+1] - logprobs[i])
+            fig_graph, ax_graph = plt.subplots(figsize=(10, 5))
+            pos = nx.spring_layout(G)
+            nx.draw(G, pos, with_labels=True, node_color='lightblue', node_size=500, edge_color='gray', width=1, ax=ax_graph)
+            ax_graph.set_title("Graph of Token Transitions")
+            buf_graph = io.BytesIO()
+            plt.savefig(buf_graph, format="png", bbox_inches="tight", dpi=100)
+            buf_graph.seek(0)
+            plt.close(fig_graph)
+            return buf_graph
         # 15. Dimensionality Reduction (t-SNE)
+        def create_tsne_plot():
+            if not logprobs or not top_alternatives:
+                raise ValueError("No data for t-SNE")
+            features = np.array([logprobs + [p[1] for p in alts[:2]] for logprobs, alts in zip([logprobs], top_alternatives)])
+            tsne = TSNE(n_components=2, random_state=42)
+            tsne_result = tsne.fit_transform(features.T)
+            fig_tsne, ax_tsne = plt.subplots(figsize=(10, 5))
+            scatter = ax_tsne.scatter(tsne_result[:, 0], tsne_result[:, 1], c=logprobs, cmap='viridis')
+            ax_tsne.set_title("t-SNE of Log Probabilities and Top Alternatives")
+            ax_tsne.set_xlabel("t-SNE Dimension 1")
+            ax_tsne.set_ylabel("t-SNE Dimension 2")
+            plt.colorbar(scatter, ax=ax_tsne, label="Log Probability")
+            buf_tsne = io.BytesIO()
+            plt.savefig(buf_tsne, format="png", bbox_inches="tight", dpi=100)
+            buf_tsne.seek(0)
+            plt.close(fig_tsne)
+            return buf_tsne
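t-SNE wants an (n_samples, n_features) matrix and a perplexity strictly smaller than n_samples. A standalone sketch with one row per token, built from the selected log probability plus two alternatives (the shapes are the point; values are random):

import numpy as np
from sklearn.manifold import TSNE

rng = np.random.default_rng(42)
n_tokens = 30
features = rng.normal(-2.0, 1.0, size=(n_tokens, 3))  # [selected, alt1, alt2] per token

tsne = TSNE(n_components=2, random_state=42, perplexity=10)  # perplexity < n_samples
embedding = tsne.fit_transform(features)
print(embedding.shape)  # (30, 2)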
         # 16. Interactive Heatmap
+        def create_heatmap_plot():
+            if not logprobs:
+                raise ValueError("No data for heatmap")
+            fig_heatmap, ax_heatmap = plt.subplots(figsize=(10, 5))
+            im = ax_heatmap.imshow([logprobs], cmap='viridis', aspect='auto')
+            ax_heatmap.set_title("Interactive Heatmap of Log Probabilities")
+            ax_heatmap.set_xlabel("Token Position")
+            ax_heatmap.set_ylabel("Probability Level")
+            plt.colorbar(im, ax=ax_heatmap, label="Log Probability")
+            buf_heatmap = io.BytesIO()
+            plt.savefig(buf_heatmap, format="png", bbox_inches="tight", dpi=100)
+            buf_heatmap.seek(0)
+            plt.close(fig_heatmap)
+            return buf_heatmap
         # 17. Probability Distribution Plots (Box Plots for Top Logprobs)
+        def create_dist_plot():
+            if not logprobs or not top_alternatives:
+                raise ValueError("No data for probability distribution")
+            all_top_probs = [p[1] for alts in top_alternatives for p in alts]
+            fig_dist, ax_dist = plt.subplots(figsize=(10, 5))
+            ax_dist.boxplot([logprobs] + [p[1] for alts in top_alternatives for p in alts[:2]], labels=["Selected"] + ["Alt1", "Alt2"])
+            ax_dist.set_title("Probability Distribution of Top Tokens")
+            ax_dist.set_xlabel("Token Type")
+            ax_dist.set_ylabel("Log Probability")
+            ax_dist.grid(True)
+            buf_dist = io.BytesIO()
+            plt.savefig(buf_dist, format="png", bbox_inches="tight", dpi=100)
+            buf_dist.seek(0)
+            plt.close(fig_dist)
+            return buf_dist
+
+        # Create all plots safely
+        img_main_html = "Placeholder for Log Probability Plot"
+        img_cluster_html = "Placeholder for K-Means Clustering"
+        img_drops_html = "Placeholder for Probability Drops"
+        img_ngram_html = "Placeholder for N-Gram Analysis"
+        img_markov_html = "Placeholder for Markov Chain"
+        img_anomaly_html = "Placeholder for Anomaly Detection"
+        img_autocorr_html = "Placeholder for Autocorrelation"
+        img_smoothing_html = "Placeholder for Smoothing (Moving Average)"
+        img_uncertainty_html = "Placeholder for Uncertainty Propagation"
+        img_corr_html = "Placeholder for Correlation Heatmap"
+        img_type_html = "Placeholder for Token Type Correlation"
+        img_embed_html = "Placeholder for Embedding Similarity vs. Probability"
+        img_bayesian_html = "Placeholder for Bayesian Inference (Entropy)"
+        img_graph_html = "Placeholder for Graph of Token Transitions"
+        img_tsne_html = "Placeholder for t-SNE of Log Probabilities"
+        img_heatmap_html = "Placeholder for Interactive Heatmap"
+        img_dist_html = "Placeholder for Probability Distribution"
+
+        try:
+            buf_main = create_main_plot()
+            img_main_bytes = buf_main.getvalue()
+            img_main_base64 = base64.b64encode(img_main_bytes).decode("utf-8")
+            img_main_html = f'<img src="data:image/png;base64,{img_main_base64}" style="max-width: 100%; height: auto;">'
+        except Exception as e:
+            logger.error("Failed to create main plot: %s", str(e))
+
+        try:
+            buf_cluster = create_cluster_plot()
+            img_cluster_bytes = buf_cluster.getvalue()
+            img_cluster_base64 = base64.b64encode(img_cluster_bytes).decode("utf-8")
+            img_cluster_html = f'<img src="data:image/png;base64,{img_cluster_base64}" style="max-width: 100%; height: auto;">'
+        except Exception as e:
+            logger.error("Failed to create cluster plot: %s", str(e))
+
+        try:
+            buf_drops = create_drops_plot()
+            img_drops_bytes = buf_drops.getvalue()
+            img_drops_base64 = base64.b64encode(img_drops_bytes).decode("utf-8")
+            img_drops_html = f'<img src="data:image/png;base64,{img_drops_base64}" style="max-width: 100%; height: auto;">'
+        except Exception as e:
+            logger.error("Failed to create drops plot: %s", str(e))
+
+        try:
+            buf_ngram = create_ngram_plot()
+            img_ngram_bytes = buf_ngram.getvalue()
+            img_ngram_base64 = base64.b64encode(img_ngram_bytes).decode("utf-8")
+            img_ngram_html = f'<img src="data:image/png;base64,{img_ngram_base64}" style="max-width: 100%; height: auto;">'
+        except Exception as e:
+            logger.error("Failed to create ngram plot: %s", str(e))
+
+        try:
+            buf_markov = create_markov_plot()
+            img_markov_bytes = buf_markov.getvalue()
+            img_markov_base64 = base64.b64encode(img_markov_bytes).decode("utf-8")
+            img_markov_html = f'<img src="data:image/png;base64,{img_markov_base64}" style="max-width: 100%; height: auto;">'
+        except Exception as e:
+            logger.error("Failed to create markov plot: %s", str(e))
+
+        try:
+            buf_anomaly = create_anomaly_plot()
+            img_anomaly_bytes = buf_anomaly.getvalue()
+            img_anomaly_base64 = base64.b64encode(img_anomaly_bytes).decode("utf-8")
+            img_anomaly_html = f'<img src="data:image/png;base64,{img_anomaly_base64}" style="max-width: 100%; height: auto;">'
+        except Exception as e:
+            logger.error("Failed to create anomaly plot: %s", str(e))
+
+        try:
+            buf_autocorr = create_autocorr_plot()
+            img_autocorr_bytes = buf_autocorr.getvalue()
+            img_autocorr_base64 = base64.b64encode(img_autocorr_bytes).decode("utf-8")
+            img_autocorr_html = f'<img src="data:image/png;base64,{img_autocorr_base64}" style="max-width: 100%; height: auto;">'
+        except Exception as e:
+            logger.error("Failed to create autocorr plot: %s", str(e))
+
+        try:
+            buf_smoothing = create_smoothing_plot()
+            img_smoothing_bytes = buf_smoothing.getvalue()
+            img_smoothing_base64 = base64.b64encode(img_smoothing_bytes).decode("utf-8")
+            img_smoothing_html = f'<img src="data:image/png;base64,{img_smoothing_base64}" style="max-width: 100%; height: auto;">'
+        except Exception as e:
+            logger.error("Failed to create smoothing plot: %s", str(e))
+
+        try:
+            buf_uncertainty = create_uncertainty_plot()
+            img_uncertainty_bytes = buf_uncertainty.getvalue()
+            img_uncertainty_base64 = base64.b64encode(img_uncertainty_bytes).decode("utf-8")
+            img_uncertainty_html = f'<img src="data:image/png;base64,{img_uncertainty_base64}" style="max-width: 100%; height: auto;">'
+        except Exception as e:
+            logger.error("Failed to create uncertainty plot: %s", str(e))
+
+        try:
+            buf_corr = create_corr_plot()
+            img_corr_bytes = buf_corr.getvalue()
+            img_corr_base64 = base64.b64encode(img_corr_bytes).decode("utf-8")
+            img_corr_html = f'<img src="data:image/png;base64,{img_corr_base64}" style="max-width: 100%; height: auto;">'
+        except Exception as e:
+            logger.error("Failed to create correlation plot: %s", str(e))
+
+        try:
+            buf_type = create_type_plot()
+            img_type_bytes = buf_type.getvalue()
+            img_type_base64 = base64.b64encode(img_type_bytes).decode("utf-8")
+            img_type_html = f'<img src="data:image/png;base64,{img_type_base64}" style="max-width: 100%; height: auto;">'
+        except Exception as e:
+            logger.error("Failed to create type plot: %s", str(e))
+
+        try:
+            buf_embed = create_embed_plot()
+            img_embed_bytes = buf_embed.getvalue()
+            img_embed_base64 = base64.b64encode(img_embed_bytes).decode("utf-8")
+            img_embed_html = f'<img src="data:image/png;base64,{img_embed_base64}" style="max-width: 100%; height: auto;">'
+        except Exception as e:
+            logger.error("Failed to create embed plot: %s", str(e))
+
+        try:
+            buf_bayesian = create_bayesian_plot()
+            img_bayesian_bytes = buf_bayesian.getvalue()
+            img_bayesian_base64 = base64.b64encode(img_bayesian_bytes).decode("utf-8")
+            img_bayesian_html = f'<img src="data:image/png;base64,{img_bayesian_base64}" style="max-width: 100%; height: auto;">'
+        except Exception as e:
+            logger.error("Failed to create bayesian plot: %s", str(e))
+
+        try:
+            buf_graph = create_graph_plot()
+            img_graph_bytes = buf_graph.getvalue()
+            img_graph_base64 = base64.b64encode(img_graph_bytes).decode("utf-8")
+            img_graph_html = f'<img src="data:image/png;base64,{img_graph_base64}" style="max-width: 100%; height: auto;">'
+        except Exception as e:
+            logger.error("Failed to create graph plot: %s", str(e))
+
+        try:
+            buf_tsne = create_tsne_plot()
+            img_tsne_bytes = buf_tsne.getvalue()
+            img_tsne_base64 = base64.b64encode(img_tsne_bytes).decode("utf-8")
+            img_tsne_html = f'<img src="data:image/png;base64,{img_tsne_base64}" style="max-width: 100%; height: auto;">'
+        except Exception as e:
+            logger.error("Failed to create tsne plot: %s", str(e))
+
+        try:
+            buf_heatmap = create_heatmap_plot()
+            img_heatmap_bytes = buf_heatmap.getvalue()
+            img_heatmap_base64 = base64.b64encode(img_heatmap_bytes).decode("utf-8")
+            img_heatmap_html = f'<img src="data:image/png;base64,{img_heatmap_base64}" style="max-width: 100%; height: auto;">'
+        except Exception as e:
+            logger.error("Failed to create heatmap plot: %s", str(e))
+
+        try:
+            buf_dist = create_dist_plot()
+            img_dist_bytes = buf_dist.getvalue()
+            img_dist_base64 = base64.b64encode(img_dist_bytes).decode("utf-8")
+            img_dist_html = f'<img src="data:image/png;base64,{img_dist_base64}" style="max-width: 100%; height: auto;">'
+        except Exception as e:
+            logger.error("Failed to create distribution plot: %s", str(e))
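The seventeen try/except blocks above all follow one pattern: build a plot, encode the buffer, and fall back to the placeholder string on failure. A hypothetical helper that factors out the pattern (not part of this commit) could look like:

import base64

def safe_plot_html(builder, placeholder, logger):
    """Run a plot builder; return inline <img> HTML, or the placeholder on failure."""
    try:
        buf = builder()
        data = base64.b64encode(buf.getvalue()).decode("utf-8")
        return f'<img src="data:image/png;base64,{data}" style="max-width: 100%; height: auto;">'
    except Exception as e:
        logger.error("Failed to create plot (%s): %s", placeholder, str(e))
        return placeholder

# e.g. img_cluster_html = safe_plot_html(create_cluster_plot, "Placeholder for K-Means Clustering", logger)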
         # Create DataFrame for the table
         table_data = []
             alt_viz_html += "</li>"
         alt_viz_html += "</ul>"

+        # Convert buffers to HTML for Gradio
+        def buffer_to_html(buf):
+            if isinstance(buf, str):  # If it's an error message
+                return buf
+            img_bytes = buf.getvalue()
+            img_base64 = base64.b64encode(img_bytes).decode("utf-8")
+            return f'<img src="data:image/png;base64,{img_base64}" style="max-width: 100%; height: auto;">'
+
+        return (
+            buffer_to_html(img_main_html), df, colored_text_html, alt_viz_html,
+            buffer_to_html(img_cluster_html), buffer_to_html(img_drops_html), buffer_to_html(img_ngram_html),
+            buffer_to_html(img_markov_html), buffer_to_html(img_anomaly_html), buffer_to_html(img_autocorr_html),
+            buffer_to_html(img_smoothing_html), buffer_to_html(img_uncertainty_html), buffer_to_html(img_corr_html),
+            buffer_to_html(img_type_html), buffer_to_html(img_embed_html), buffer_to_html(img_bayesian_html),
+            buffer_to_html(img_graph_html), buffer_to_html(img_tsne_html), buffer_to_html(img_heatmap_html),
+            buffer_to_html(img_dist_html)
+        )
     except Exception as e:
         logger.error("Visualization failed: %s", str(e))
+        return (
+            f"Error: {str(e)}", None, None, None, "Placeholder for K-Means Clustering", "Placeholder for Probability Drops",
+            "Placeholder for N-Gram Analysis", "Placeholder for Markov Chain", "Placeholder for Anomaly Detection",
+            "Placeholder for Autocorrelation", "Placeholder for Smoothing (Moving Average)", "Placeholder for Uncertainty Propagation",
+            "Placeholder for Correlation Heatmap", "Placeholder for Token Type Correlation", "Placeholder for Embedding Similarity vs. Probability",
+            "Placeholder for Bayesian Inference (Entropy)", "Placeholder for Graph of Token Transitions", "Placeholder for t-SNE of Log Probabilities",
+            "Placeholder for Interactive Heatmap", "Placeholder for Probability Distribution"
+        )
+# Gradio interface with improved layout and placeholders
 with gr.Blocks(title="Log Probability Visualizer") as app:
     gr.Markdown("# Log Probability Visualizer")
     gr.Markdown(
     )

     with gr.Row():
+        with gr.Column(scale=1):
+            json_input = gr.Textbox(
+                label="JSON Input",
+                lines=10,
+                placeholder="Paste your JSON (e.g., {\"content\": [...]}) or Python dict (e.g., {'content': [...]}) here...",
+            )
+        with gr.Column(scale=1):
+            prob_filter = gr.Slider(minimum=-1e9, maximum=0, value=-1e9, label="Log Probability Filter (≥)")
+
+    with gr.Tabs():
+        with gr.Tab("Core Visualizations"):
+            with gr.Row():
+                plot_output = gr.HTML(label="Log Probability Plot (Click for Tokens)", value="Placeholder for Log Probability Plot")
+                table_output = gr.Dataframe(label="Token Log Probabilities and Top Alternatives", value=None)
+            with gr.Row():
+                text_output = gr.HTML(label="Colored Text (Confidence Visualization)", value="Placeholder for Colored Text (Confidence Visualization)")
+                alt_viz_output = gr.HTML(label="Top 3 Token Log Probabilities", value="Placeholder for Top 3 Token Log Probabilities")
+
+        with gr.Tab("Clustering & Patterns"):
+            with gr.Row():
+                cluster_output = gr.HTML(label="K-Means Clustering", value="Placeholder for K-Means Clustering")
+                drops_output = gr.HTML(label="Probability Drops", value="Placeholder for Probability Drops")
+            with gr.Row():
+                ngram_output = gr.HTML(label="N-Gram Analysis", value="Placeholder for N-Gram Analysis")
+                markov_output = gr.HTML(label="Markov Chain", value="Placeholder for Markov Chain")
+
+        with gr.Tab("Time Series & Anomalies"):
+            with gr.Row():
+                anomaly_output = gr.HTML(label="Anomaly Detection", value="Placeholder for Anomaly Detection")
+                autocorr_output = gr.HTML(label="Autocorrelation", value="Placeholder for Autocorrelation")
+            with gr.Row():
+                smoothing_output = gr.HTML(label="Smoothing (Moving Average)", value="Placeholder for Smoothing (Moving Average)")
+                uncertainty_output = gr.HTML(label="Uncertainty Propagation", value="Placeholder for Uncertainty Propagation")
+
+        with gr.Tab("Correlation & Types"):
+            with gr.Row():
+                corr_output = gr.HTML(label="Correlation Heatmap", value="Placeholder for Correlation Heatmap")
+                type_output = gr.HTML(label="Token Type Correlation", value="Placeholder for Token Type Correlation")
+
+        with gr.Tab("Advanced Analyses"):
+            with gr.Row():
+                embed_output = gr.HTML(label="Embedding Similarity vs. Probability", value="Placeholder for Embedding Similarity vs. Probability")
+                bayesian_output = gr.HTML(label="Bayesian Inference (Entropy)", value="Placeholder for Bayesian Inference (Entropy)")
+            with gr.Row():
+                graph_output = gr.HTML(label="Graph of Token Transitions", value="Placeholder for Graph of Token Transitions")
+                tsne_output = gr.HTML(label="t-SNE of Log Probabilities", value="Placeholder for t-SNE of Log Probabilities")
+
+        with gr.Tab("Enhanced Visualizations"):
+            with gr.Row():
+                heatmap_output = gr.HTML(label="Interactive Heatmap", value="Placeholder for Interactive Heatmap")
+                dist_output = gr.HTML(label="Probability Distribution", value="Placeholder for Probability Distribution")
     btn = gr.Button("Visualize")
     btn.click(
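The diff is cut off inside btn.click(...). Given the 20-tuple returned by visualize_logprobs, the wiring presumably maps one output component per tuple slot; the sketch below is an assumption about that call, not text from the commit:

# Hypothetical wiring, matching the order of the returned tuple
btn.click(
    visualize_logprobs,
    inputs=[json_input, prob_filter],
    outputs=[
        plot_output, table_output, text_output, alt_viz_output,
        cluster_output, drops_output, ngram_output, markov_output,
        anomaly_output, autocorr_output, smoothing_output, uncertainty_output,
        corr_output, type_output, embed_output, bayesian_output,
        graph_output, tsne_output, heatmap_output, dist_output,
    ],
)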