import gradio as gr
import json
import pandas as pd
import math
import logging
import numpy as np
import plotly.graph_objects as go
import asyncio
# Set up logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
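# Expected input shape, mirroring the placeholder examples in the UI below
# (the field names follow an OpenAI-style logprobs payload; the exact source format is an assumption):
# {
#   "content": [
#     {"token": ",", "logprob": 0.0, "bytes": [44],
#      "top_logprobs": {" so": -13.8046875, ".": -13.8046875, ",": -13.640625}},
#     ...
#   ]
# }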
# Function to safely parse JSON input (raises ValueError with a helpful message on malformed input)
def parse_input(json_input):
logger.debug("Attempting to parse input: %s", json_input)
try:
# Try to parse as JSON first
data = json.loads(json_input)
logger.debug("Successfully parsed as JSON")
return data
except json.JSONDecodeError as e:
logger.error("JSON parsing failed: %s (Input: %s)", str(e), json_input[:100] + "..." if len(json_input) > 100 else json_input)
raise ValueError(f"Malformed input: {str(e)}. Ensure property names are in double quotes (e.g., \"content\") and the format matches JSON (e.g., {{\"content\": [...]}}).")
# Function to ensure a value is a float, converting from string if necessary
def ensure_float(value):
if value is None:
logger.debug("Replacing None logprob with 0.0")
return 0.0 # Default to 0.0 for None to ensure visualization
if isinstance(value, str):
try:
return float(value)
except ValueError:
logger.error("Failed to convert string '%s' to float", value)
return 0.0 # Default to 0.0 for invalid strings
if isinstance(value, (int, float)):
return float(value)
return 0.0 # Default for any other type
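# A few examples of the coercion above (shown for reference):
#   ensure_float(None)    -> 0.0
#   ensure_float("-1.5")  -> -1.5
#   ensure_float("oops")  -> 0.0  (conversion failure is logged)
#   ensure_float(2)       -> 2.0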
# Function to get or generate a token value (default to "Unknown" if missing)
def get_token(entry):
token = entry.get("token", "Unknown")
if token == "Unknown":
logger.warning("Missing 'token' key for entry: %s, using 'Unknown'", entry)
return token
# Function to create an empty Plotly figure
def create_empty_figure(title):
return go.Figure().update_layout(title=title, xaxis_title="", yaxis_title="", showlegend=False)
# Precompute the next chunk asynchronously
async def precompute_chunk(json_input, chunk_size, current_chunk):
try:
data = parse_input(json_input)
content = data.get("content", []) if isinstance(data, dict) else data
if not isinstance(content, list):
raise ValueError("Content must be a list of entries")
tokens = []
logprobs = []
top_alternatives = []
for entry in content:
if not isinstance(entry, dict):
logger.warning("Skipping non-dictionary entry: %s", entry)
continue
logprob = ensure_float(entry.get("logprob", None))
if logprob >= -100000: # Include all entries with default 0.0
tokens.append(get_token(entry))
logprobs.append(logprob)
top_probs = entry.get("top_logprobs", {})
if top_probs is None:
logger.debug("top_logprobs is None for token: %s, using empty dict", get_token(entry))
top_probs = {}
finite_top_probs = []
for key, value in top_probs.items():
float_value = ensure_float(value)
if float_value is not None and math.isfinite(float_value):
finite_top_probs.append((key, float_value))
sorted_probs = sorted(finite_top_probs, key=lambda x: x[1], reverse=True)
top_alternatives.append(sorted_probs)
if not tokens or not logprobs:
return None, None, None
next_chunk = current_chunk + 1
start_idx = next_chunk * chunk_size
end_idx = min((next_chunk + 1) * chunk_size, len(tokens))
if start_idx >= len(tokens):
return None, None, None
paginated_tokens = tokens[start_idx:end_idx]
paginated_logprobs = logprobs[start_idx:end_idx]
paginated_alternatives = top_alternatives[start_idx:end_idx]
return paginated_tokens, paginated_logprobs, paginated_alternatives
except Exception as e:
logger.error("Precomputation failed for chunk %d: %s", current_chunk + 1, str(e))
return None, None, None
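# Usage sketch (illustrative only, not wired up here): from an async handler the next chunk
# can be precomputed without blocking the UI, e.g.
#   tokens, lps, alts = await precompute_chunk(json_text, chunk_size=100, current_chunk=0)
#   if tokens is None:
#       pass  # nothing further to precompute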
# Function to process and visualize a chunk of log probs with dynamic top_logprobs
def visualize_logprobs(json_input, chunk=0, chunk_size=100):
try:
# Parse the input (handles JSON only)
data = parse_input(json_input)
# Ensure data is a dictionary with 'content' key containing a list
if isinstance(data, dict) and "content" in data:
content = data["content"]
if not isinstance(content, list):
raise ValueError("Content must be a list of entries")
elif isinstance(data, list):
content = data # Handle direct list input (though only JSON is expected)
else:
raise ValueError("Input must be a dictionary with 'content' key or a list of entries")
        # Extract tokens, log probs, and top alternatives; entries whose logprob falls below the fixed -100000 filter are skipped
tokens = []
logprobs = []
top_alternatives = [] # List to store all top_logprobs (dynamic length)
for entry in content:
if not isinstance(entry, dict):
logger.warning("Skipping non-dictionary entry: %s", entry)
continue
logprob = ensure_float(entry.get("logprob", None))
if logprob >= -100000: # Include all entries with default 0.0
tokens.append(get_token(entry))
logprobs.append(logprob)
# Get top_logprobs, default to empty dict if None
top_probs = entry.get("top_logprobs", {})
if top_probs is None:
logger.debug("top_logprobs is None for token: %s, using empty dict", get_token(entry))
top_probs = {} # Default to empty dict for None
# Ensure all values in top_logprobs are floats and create a list of tuples
finite_top_probs = []
for key, value in top_probs.items():
float_value = ensure_float(value)
if float_value is not None and math.isfinite(float_value):
finite_top_probs.append((key, float_value))
# Sort by log probability (descending) to get all alternatives
sorted_probs = sorted(finite_top_probs, key=lambda x: x[1], reverse=True)
top_alternatives.append(sorted_probs) # Store all alternatives, dynamic length
else:
logger.debug("Skipping entry with logprob: %s (type: %s)", entry.get("logprob"), type(entry.get("logprob", None)))
# Check if there's valid data after filtering
if not logprobs or not tokens:
return (create_empty_figure("Log Probabilities of Generated Tokens"), None, "No tokens to display.", create_empty_figure("Top Token Log Probabilities"), create_empty_figure("Significant Probability Drops"), 1, 0)
        # Paginate data into chunks of `chunk_size` tokens (100 by default)
total_chunks = max(1, (len(logprobs) + chunk_size - 1) // chunk_size)
start_idx = chunk * chunk_size
end_idx = min((chunk + 1) * chunk_size, len(logprobs))
paginated_tokens = tokens[start_idx:end_idx]
paginated_logprobs = logprobs[start_idx:end_idx]
paginated_alternatives = top_alternatives[start_idx:end_idx] if top_alternatives else []
# 1. Main Log Probability Plot (Interactive Plotly)
main_fig = go.Figure()
main_fig.add_trace(go.Scatter(x=list(range(len(paginated_logprobs))), y=paginated_logprobs, mode='markers+lines', name='Log Prob', marker=dict(color='blue')))
main_fig.update_layout(
title="Log Probabilities of Generated Tokens (Chunk %d)" % (chunk + 1),
xaxis_title="Token Position (within chunk)",
yaxis_title="Log Probability",
hovermode="closest",
clickmode='event+select'
)
main_fig.update_traces(
customdata=[f"Token: {tok}, Log Prob: {prob:.4f}, Position: {i+start_idx}" for i, (tok, prob) in enumerate(zip(paginated_tokens, paginated_logprobs))],
hovertemplate='%{customdata}'
)
# 2. Probability Drop Analysis (Interactive Plotly)
if len(paginated_logprobs) < 2:
drops_fig = create_empty_figure("Significant Probability Drops (Chunk %d)" % (chunk + 1))
else:
drops = [paginated_logprobs[i+1] - paginated_logprobs[i] for i in range(len(paginated_logprobs)-1)]
drops_fig = go.Figure()
drops_fig.add_trace(go.Bar(x=list(range(len(drops))), y=drops, name='Drop', marker_color='red'))
drops_fig.update_layout(
title="Significant Probability Drops (Chunk %d)" % (chunk + 1),
xaxis_title="Token Position (within chunk)",
yaxis_title="Log Probability Drop",
hovermode="closest",
clickmode='event+select'
)
drops_fig.update_traces(
customdata=[f"Drop: {drop:.4f}, From: {paginated_tokens[i]} to {paginated_tokens[i+1]}, Position: {i+start_idx}" for i, drop in enumerate(drops)],
hovertemplate='%{customdata}'
)
# Create DataFrame for the table with dynamic top_logprobs
table_data = []
max_alternatives = max(len(alts) for alts in paginated_alternatives) if paginated_alternatives else 0
for i, entry in enumerate(content[start_idx:end_idx]):
if not isinstance(entry, dict):
continue
logprob = ensure_float(entry.get("logprob", None))
if logprob >= -100000 and "top_logprobs" in entry: # Include all entries with default 0.0
token = get_token(entry)
top_logprobs = entry.get("top_logprobs", {})
if top_logprobs is None:
logger.debug("top_logprobs is None for token: %s, using empty dict", token)
top_logprobs = {} # Default to empty dict for None
# Ensure all values in top_logprobs are floats
finite_top_probs = []
for key, value in top_logprobs.items():
float_value = ensure_float(value)
if float_value is not None and math.isfinite(float_value):
finite_top_probs.append((key, float_value))
# Sort by log probability (descending)
sorted_probs = sorted(finite_top_probs, key=lambda x: x[1], reverse=True)
row = [token, f"{logprob:.4f}"]
for alt_token, alt_logprob in sorted_probs[:max_alternatives]: # Use max number of alternatives
row.append(f"{alt_token}: {alt_logprob:.4f}")
# Pad with empty strings if fewer alternatives than max
while len(row) < 2 + max_alternatives:
row.append("")
table_data.append(row)
df = (
pd.DataFrame(
table_data,
columns=["Token", "Log Prob"] + [f"Alt {i+1}" for i in range(max_alternatives)],
)
if table_data
else None
)
# Generate colored text (for the current chunk)
if paginated_logprobs:
min_logprob = min(paginated_logprobs)
max_logprob = max(paginated_logprobs)
if max_logprob == min_logprob:
normalized_probs = [0.5] * len(paginated_logprobs)
else:
normalized_probs = [
(lp - min_logprob) / (max_logprob - min_logprob) for lp in paginated_logprobs
]
colored_text = ""
for i, (token, norm_prob) in enumerate(zip(paginated_tokens, normalized_probs)):
r = int(255 * (1 - norm_prob)) # Red for low confidence
g = int(255 * norm_prob) # Green for high confidence
b = 0
color = f"rgb({r}, {g}, {b})"
                # Wrap each token in a colored span: red = low confidence, green = high confidence
                colored_text += f'<span style="color: {color}; font-weight: bold;">{token}</span>'
                if i < len(paginated_tokens) - 1:
                    colored_text += " "
            colored_text_html = f"<p>{colored_text}</p>"
else:
colored_text_html = "No tokens to display in this chunk."
# Top Token Log Probabilities (Interactive Plotly, dynamic length, for the current chunk)
alt_viz_fig = create_empty_figure("Top Token Log Probabilities (Chunk %d)" % (chunk + 1)) if not paginated_logprobs or not paginated_alternatives else go.Figure()
if paginated_logprobs and paginated_alternatives:
            colors = ['blue', 'green', 'red', 'purple', 'orange']
            for i, (token, probs) in enumerate(zip(paginated_tokens, paginated_alternatives)):
                for j, (alt_tok, prob) in enumerate(probs):
                    # One bar per alternative, colored by rank, carrying its own hover text
                    alt_viz_fig.add_trace(go.Bar(
                        x=[f"{token} (Pos {i+start_idx})"],
                        y=[prob],
                        name=f"{alt_tok}",
                        marker_color=colors[j % len(colors)],
                        customdata=[f"Token: {token}, Alt: {alt_tok}, Log Prob: {prob:.4f}, Position: {i+start_idx}"],
                        hovertemplate='%{customdata}',
                    ))
            alt_viz_fig.update_layout(
                title="Top Token Log Probabilities (Chunk %d)" % (chunk + 1),
                xaxis_title="Token (Position)",
                yaxis_title="Log Probability",
                barmode='stack',
                hovermode="closest",
                clickmode='event+select'
            )
return (main_fig, df, colored_text_html, alt_viz_fig, drops_fig, total_chunks, chunk)
except Exception as e:
logger.error("Visualization failed: %s (Input: %s)", str(e), json_input[:100] + "..." if len(json_input) > 100 else json_input)
return (create_empty_figure("Log Probabilities of Generated Tokens"), None, "No finite log probabilities to display.", create_empty_figure("Top Token Log Probabilities"), create_empty_figure("Significant Probability Drops"), 1, 0)
# Analysis functions for detecting correct vs. incorrect traces
def analyze_confidence_signature(logprobs, tokens):
if not logprobs or not tokens:
return "No data for confidence signature analysis.", None
# Track moving average of top token probability
top_probs = [lps[0][1] if lps else -float('inf') for lps in logprobs] # Extract top probability, handle empty
moving_avg = np.convolve(
top_probs,
np.ones(20) / 20, # 20-token window
mode='valid'
)
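    # moving_avg[i] averages top_probs[i:i+20]; a sharp decrease between consecutive window
    # averages is reported against the token at the end of the window (index i + 19) below.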
# Detect significant drops (potential error points)
drops = np.where(np.diff(moving_avg) < -0.15)[0]
if not drops.size:
return "No significant confidence drops detected.", None
drop_positions = [(i, tokens[i + 19] if i + 19 < len(tokens) else "End of trace") for i in drops] # Adjust for convolution window
return "Significant confidence drops detected at positions:", drop_positions
def detect_interpretation_pivots(logprobs, tokens):
if not logprobs or not tokens:
return "No data for interpretation pivot detection.", None
pivots = []
reconsideration_tokens = ["wait", "but", "actually", "however", "hmm"]
for i, (token, lps) in enumerate(zip(tokens, logprobs)):
# Check if reconsideration tokens have unusually high probability
for rt in reconsideration_tokens:
for t, p in lps:
if t.lower() == rt and p > -2.5: # High probability
# Look back to find what's being reconsidered
context = tokens[max(0, i-50):i]
pivots.append((i, rt, context))
if not pivots:
return "No interpretation pivots detected.", None
return "Interpretation pivots detected:", pivots
def calculate_decision_entropy(logprobs):
if not logprobs:
return "No data for entropy spike detection.", None
# Calculate entropy at each token position
entropies = []
for lps in logprobs:
if not lps:
entropies.append(0.0)
continue
# Calculate entropy: -sum(p * log(p)) for each probability
probs = [math.exp(p) for _, p in lps] # Convert log probs to probabilities
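        # Note: the top-k alternatives don't cover the full vocabulary, so these probabilities
        # usually sum to less than 1 and the value computed below is an approximate entropy.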
if not probs or sum(probs) == 0:
entropies.append(0.0)
continue
entropy = -sum(p * math.log(p) for p in probs if p > 0)
entropies.append(entropy)
# Detect significant entropy spikes
baseline = np.percentile(entropies, 75) if entropies else 0.0
    spikes = [i for i, e in enumerate(entropies) if baseline > 0 and e > baseline * 1.5]
if not spikes:
return "No entropy spikes detected at decision points.", None
return "Entropy spikes detected at positions:", spikes
def analyze_conclusion_competition(logprobs, tokens):
if not logprobs or not tokens:
return "No data for conclusion competition analysis.", None
# Find tokens related to conclusion
conclusion_indices = [i for i, t in enumerate(tokens)
if any(marker in t.lower() for marker in
["therefore", "thus", "boxed", "answer"])]
if not conclusion_indices:
return "No conclusion markers found in trace.", None
# Analyze probability gap between top and second choices near conclusion
gaps = []
conclusion_idx = conclusion_indices[-1]
end_range = min(conclusion_idx + 50, len(logprobs))
for idx in range(conclusion_idx, end_range):
if idx < len(logprobs) and len(logprobs[idx]) >= 2:
top_prob = logprobs[idx][0][1] if logprobs[idx] else -float('inf')
second_prob = logprobs[idx][1][1] if len(logprobs[idx]) > 1 else -float('inf')
gap = top_prob - second_prob if top_prob != -float('inf') and second_prob != -float('inf') else 0.0
gaps.append(gap)
if not gaps:
return "No conclusion competition data available.", None
mean_gap = np.mean(gaps)
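    # The gap is in log space: a mean gap of 2.0, for example, means the top token is about
    # e^2 ≈ 7.4 times more likely than the runner-up near the conclusion.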
return f"Mean probability gap at conclusion: {mean_gap:.4f} (higher indicates more confident conclusion)", None
def analyze_verification_signals(logprobs, tokens):
if not logprobs or not tokens:
return "No data for verification signal analysis.", None
verification_terms = ["verify", "check", "confirm", "ensure", "double"]
verification_probs = []
for lps in logprobs:
# Look for verification terms in top-k tokens
max_v_prob = -float('inf')
for token, prob in lps:
if any(v_term in token.lower() for v_term in verification_terms):
max_v_prob = max(max_v_prob, prob)
if max_v_prob > -float('inf'):
verification_probs.append(max_v_prob)
if not verification_probs:
return "No verification signals detected.", None
count, mean_prob = len(verification_probs), np.mean(verification_probs)
return f"Verification signals found: {count} instances, mean probability: {mean_prob:.4f}", None
def detect_semantic_inversions(logprobs, tokens):
if not logprobs or not tokens:
return "No data for semantic inversion detection.", None
inversion_pairs = [
("more", "less"), ("larger", "smaller"),
("winning", "losing"), ("increase", "decrease"),
("greater", "lesser"), ("positive", "negative")
]
inversions = []
for i, (token, lps) in enumerate(zip(tokens, logprobs)):
for pos, neg in inversion_pairs:
if token.lower() == pos:
# Check if negative term has high probability
for t, p in lps:
if t.lower() == neg and p > -3.0: # High competitor
inversions.append((i, pos, neg, p))
elif token.lower() == neg:
# Check if positive term has high probability
for t, p in lps:
if t.lower() == pos and p > -3.0: # High competitor
inversions.append((i, neg, pos, p))
if not inversions:
return "No semantic inversions detected.", None
return "Semantic inversions detected:", inversions
# Function to perform full trace analysis
def analyze_full_trace(json_input):
try:
data = parse_input(json_input)
content = data.get("content", []) if isinstance(data, dict) else data
if not isinstance(content, list):
raise ValueError("Content must be a list of entries")
tokens = []
logprobs = []
for entry in content:
if not isinstance(entry, dict):
logger.warning("Skipping non-dictionary entry: %s", entry)
continue
logprob = ensure_float(entry.get("logprob", None))
if logprob >= -100000: # Include all entries with default 0.0
tokens.append(get_token(entry))
top_probs = entry.get("top_logprobs", {})
if top_probs is None:
top_probs = {}
finite_top_probs = []
for key, value in top_probs.items():
float_value = ensure_float(value)
if float_value is not None and math.isfinite(float_value):
finite_top_probs.append((key, float_value))
logprobs.append(finite_top_probs)
if not logprobs or not tokens:
return "No valid data for trace analysis.", None, None, None, None, None
# Perform all analyses
confidence_result, confidence_data = analyze_confidence_signature(logprobs, tokens)
pivot_result, pivot_data = detect_interpretation_pivots(logprobs, tokens)
entropy_result, entropy_data = calculate_decision_entropy(logprobs)
conclusion_result, conclusion_data = analyze_conclusion_competition(logprobs, tokens)
verification_result, verification_data = analyze_verification_signals(logprobs, tokens)
inversion_result, inversion_data = detect_semantic_inversions(logprobs, tokens)
# Format results for display
        analysis_html = f"""
        <h3>Trace Analysis Results</h3>
        <ul>
            <li><strong>Confidence Signature:</strong> {confidence_result}
                {f"<br>Positions: {', '.join(str(pos) for pos, tok in confidence_data)}" if confidence_data else ""}</li>
            <li><strong>Interpretation Pivots:</strong> {pivot_result}
                {f"<br>Positions: {', '.join(str(pos) for pos, _, _ in pivot_data)}" if pivot_data else ""}</li>
            <li><strong>Decision Entropy Spikes:</strong> {entropy_result}
                {f"<br>Positions: {', '.join(str(pos) for pos in entropy_data)}" if entropy_data else ""}</li>
            <li><strong>Conclusion Competition:</strong> {conclusion_result}</li>
            <li><strong>Verification Signals:</strong> {verification_result}</li>
            <li><strong>Semantic Inversions:</strong> {inversion_result}
                {f"<br>Positions: {', '.join(str(pos) for pos, _, _, _ in inversion_data)}" if inversion_data else ""}</li>
        </ul>
        """
        return analysis_html, None, None, None, None, None
    except Exception as e:
        logger.error("Trace analysis failed: %s (Input: %s)", str(e), json_input[:100] + "..." if len(json_input) > 100 else json_input)
        return f"<p>Error during trace analysis: {str(e)}</p>", None, None, None, None, None
# Gradio interface with two tabs: Trace Analysis and Visualization
with gr.Blocks(title="Log Probability Visualizer") as app:
gr.Markdown("# Log Probability Visualizer")
    gr.Markdown(
        "Paste your JSON log-prob data below to analyze reasoning traces and visualize tokens in chunks of 100. Entries with logprob ≥ -100000 are kept, any number of top_logprobs per token is supported, and missing or null fields are handled gracefully. The next chunk is precomputed proactively."
    )
with gr.Tabs():
with gr.Tab("Trace Analysis"):
with gr.Row():
json_input_analysis = gr.Textbox(
label="JSON Input for Trace Analysis",
lines=10,
placeholder="Paste your JSON (e.g., {\"content\": [{\"bytes\": [44], \"logprob\": 0.0, \"token\": \",\", \"top_logprobs\": {\" so\": -13.8046875, \".\": -13.8046875, \",\": -13.640625}}]}).",
)
with gr.Row():
analysis_output = gr.HTML(label="Trace Analysis Results")
btn_analyze = gr.Button("Analyze Trace")
btn_analyze.click(
fn=analyze_full_trace,
inputs=[json_input_analysis],
outputs=[analysis_output, gr.State(), gr.State(), gr.State(), gr.State(), gr.State()],
)
with gr.Tab("Visualization"):
with gr.Row():
json_input_viz = gr.Textbox(
label="JSON Input for Visualization",
lines=10,
placeholder="Paste your JSON (e.g., {\"content\": [{\"bytes\": [44], \"logprob\": 0.0, \"token\": \",\", \"top_logprobs\": {\" so\": -13.8046875, \".\": -13.8046875, \",\": -13.640625}}]}).",
)
chunk = gr.Number(value=0, label="Current Chunk", precision=0, minimum=0)
with gr.Row():
plot_output = gr.Plot(label="Log Probability Plot (Click for Tokens)")
drops_output = gr.Plot(label="Probability Drops (Click for Details)")
with gr.Row():
table_output = gr.Dataframe(label="Token Log Probabilities and Top Alternatives")
alt_viz_output = gr.Plot(label="Top Token Log Probabilities (Click for Details)")
with gr.Row():
text_output = gr.HTML(label="Colored Text (Confidence Visualization)")
with gr.Row():
prev_btn = gr.Button("Previous Chunk")
next_btn = gr.Button("Next Chunk")
total_chunks_output = gr.Number(label="Total Chunks", interactive=False)
# Precomputed next chunk state (hidden)
precomputed_next = gr.State(value=None)
btn_viz = gr.Button("Visualize")
btn_viz.click(
fn=visualize_logprobs,
inputs=[json_input_viz, chunk],
outputs=[plot_output, table_output, text_output, alt_viz_output, drops_output, total_chunks_output, chunk],
)
# Precompute next chunk proactively when on current chunk
async def precompute_next_chunk(json_input, current_chunk, precomputed_next):
if precomputed_next is not None:
return precomputed_next # Use cached precomputed chunk if available
next_tokens, next_logprobs, next_alternatives = await precompute_chunk(json_input, 100, current_chunk)
if next_tokens is None or next_logprobs is None or next_alternatives is None:
return None
return (next_tokens, next_logprobs, next_alternatives)
# Update chunk on button clicks
def update_chunk(json_input, current_chunk, action, precomputed_next=None):
total_chunks = visualize_logprobs(json_input, 0)[5] # Get total chunks
if action == "prev" and current_chunk > 0:
current_chunk -= 1
elif action == "next" and current_chunk < total_chunks - 1:
current_chunk += 1
                # The precomputed chunk (if present) is only used as a readiness signal; rendering always goes through visualize_logprobs
if precomputed_next:
next_tokens, next_logprobs, next_alternatives = precomputed_next
if next_tokens and next_logprobs and next_alternatives:
logger.debug("Using precomputed next chunk for chunk %d", current_chunk)
return visualize_logprobs(json_input, current_chunk)
return visualize_logprobs(json_input, current_chunk)
prev_btn.click(
fn=update_chunk,
inputs=[json_input_viz, chunk, gr.State(value="prev"), precomputed_next],
outputs=[plot_output, table_output, text_output, alt_viz_output, drops_output, total_chunks_output, chunk],
)
next_btn.click(
fn=update_chunk,
inputs=[json_input_viz, chunk, gr.State(value="next"), precomputed_next],
outputs=[plot_output, table_output, text_output, alt_viz_output, drops_output, total_chunks_output, chunk],
)
# Trigger precomputation when chunk changes (via button clicks or initial load)
            async def trigger_precomputation(json_input, current_chunk):
                # Must be async: asyncio.create_task needs a running event loop, which Gradio provides for async handlers
                asyncio.create_task(precompute_next_chunk(json_input, current_chunk, None))
                return gr.update(value=current_chunk)
# Use a dummy event to trigger precomputation on chunk change (simplified for Gradio)
chunk.change(
fn=trigger_precomputation,
inputs=[json_input_viz, chunk],
outputs=[chunk],
)
app.launch()
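# To try it locally (the module name is an assumption):
#   python app.py
# then paste a payload matching the schema sketched at the top of this file into either tab.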