import gradio as gr
import json
import matplotlib.pyplot as plt
import pandas as pd
import io
import base64
import math
import ast
import logging
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
# Set up logging: configure the root logger at DEBUG level and create the
# module-level logger used by the functions below.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


def _to_json_compatible(obj):
    """Recursively coerce a Python literal into JSON-style containers.

    Dictionary keys are converted to strings, and both lists and tuples
    become lists. Any other value is returned unchanged.
    """
    if isinstance(obj, dict):
        return {str(key): _to_json_compatible(val) for key, val in obj.items()}
    if isinstance(obj, (list, tuple)):
        # Tuples are accepted by ast.literal_eval but do not exist in JSON;
        # normalise them so callers that check for ``list`` keep working.
        return [_to_json_compatible(item) for item in obj]
    return obj


# Function to safely parse JSON or Python dictionary input
def parse_input(json_input):
    """Parse *json_input* as JSON, falling back to a Python literal.

    The text is first handed to ``json.loads``. If that fails with a
    ``JSONDecodeError``, it is evaluated with ``ast.literal_eval`` (which
    only accepts literals, so no arbitrary code is executed) and the result
    is normalised to JSON-style containers.

    Args:
        json_input: Text containing either JSON or a Python literal
            (e.g. a dictionary written with single quotes).

    Returns:
        The parsed data. For the Python-literal path, dictionary keys are
        strings and tuples are converted to lists.

    Raises:
        ValueError: If the input is neither valid JSON nor a valid Python
            literal. The underlying parser error is attached as the cause.
    """
    logger.debug("Attempting to parse input: %s", json_input)
    try:
        # Try to parse as JSON first
        data = json.loads(json_input)
    except json.JSONDecodeError as json_err:
        logger.error("JSON parsing failed: %s", json_err)
    else:
        logger.debug("Successfully parsed as JSON")
        return data

    try:
        # If JSON fails, try to parse as Python literal (e.g., with single quotes)
        data = ast.literal_eval(json_input)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError) as literal_err:
        # literal_eval can raise any of these on malformed input (for
        # example TypeError for an unhashable dictionary key); report them
        # all through the same ValueError.
        logger.error("Python literal parsing failed: %s", literal_err)
        raise ValueError(
            f"Malformed input: {literal_err}. Ensure property names are in double quotes (e.g., \"content\") or correct Python dictionary format."
        ) from literal_err

    logger.debug("Successfully parsed as Python literal")
    converted_data = _to_json_compatible(data)
    logger.debug("Converted to JSON-compatible format")
    return converted_data
# Function to ensure a value is a float, converting from string if necessary
def ensure_float(value):
    """Return *value* as a ``float``, or ``None`` if it cannot be converted.

    Integers and floats (including ``bool``, which is a subclass of ``int``)
    are converted directly. Strings are parsed with ``float()``; a string
    that cannot be parsed is logged as an error and yields ``None``. ``None``
    and every other type yield ``None``.
    """
    # Numeric inputs need no parsing.
    if isinstance(value, (int, float)):
        return float(value)

    # Anything that is neither numeric nor text (including None) is rejected.
    if not isinstance(value, str):
        return None

    try:
        converted = float(value)
    except ValueError:
        logger.error("Failed to convert string '%s' to float", value)
        converted = None
    return converted
# Function to process and visualize log probs with interactive Plotly plots
def visualize_logprobs(json_input, prob_filter=-1e9, page_size=50, page=0):
try:
# Parse the input (handles both JSON and Python dictionaries)
data = parse_input(json_input)
# Ensure data is a list or dictionary with 'content'
if isinstance(data, dict) and "content" in data:
content = data["content"]
elif isinstance(data, list):
content = data
else:
raise ValueError("Input must be a list or dictionary with 'content' key")
# Extract tokens, log probs, and top alternatives, skipping None or non-finite values
tokens = []
logprobs = []
top_alternatives = [] # List to store top 3 log probs (selected token + 2 alternatives)
for entry in content:
logprob = ensure_float(entry.get("logprob", None))
if logprob is not None and math.isfinite(logprob) and logprob >= prob_filter:
tokens.append(entry["token"])
logprobs.append(logprob)
# Get top_logprobs, default to empty dict if None
top_probs = entry.get("top_logprobs", {})
# Ensure all values in top_logprobs are floats
finite_top_probs = {}
for key, value in top_probs.items():
float_value = ensure_float(value)
if float_value is not None and math.isfinite(float_value):
finite_top_probs[key] = float_value
# Get the top 3 log probs (including the selected token)
all_probs = {entry["token"]: logprob} # Add the selected token's logprob
all_probs.update(finite_top_probs) # Add alternatives
sorted_probs = sorted(all_probs.items(), key=lambda x: x[1], reverse=True)
top_3 = sorted_probs[:3] # Top 3 log probs (highest to lowest)
top_alternatives.append(top_3)
else:
logger.debug("Skipping entry with logprob: %s (type: %s)", entry.get("logprob"), type(entry.get("logprob", None)))
# Check if there's valid data after filtering
if not logprobs or not tokens:
return (gr.update(value="No finite log probabilities or tokens to visualize after filtering"), None, None, None, 1, 0)
# Paginate data for large inputs
total_pages = max(1, (len(logprobs) + page_size - 1) // page_size)
start_idx = page * page_size
end_idx = min((page + 1) * page_size, len(logprobs))
paginated_tokens = tokens[start_idx:end_idx]
paginated_logprobs = logprobs[start_idx:end_idx]
paginated_alternatives = top_alternatives[start_idx:end_idx] if top_alternatives else []
# 1. Main Log Probability Plot (Interactive Plotly)
main_fig = go.Figure()
main_fig.add_trace(go.Scatter(x=list(range(len(paginated_logprobs))), y=paginated_logprobs, mode='markers+lines', name='Log Prob', marker=dict(color='blue')))
main_fig.update_layout(
title="Log Probabilities of Generated Tokens",
xaxis_title="Token Position",
yaxis_title="Log Probability",
hovermode="closest",
clickmode='event+select'
)
main_fig.update_traces(
customdata=[f"Token: {tok}, Log Prob: {prob:.4f}, Position: {i+start_idx}" for i, (tok, prob) in enumerate(zip(paginated_tokens, paginated_logprobs))],
hovertemplate='%{customdata}
{colored_text}
" else: colored_text_html = "No finite log probabilities to display." # Top 3 Token Log Probabilities (paginated) alt_viz_html = "" if paginated_logprobs and paginated_alternatives: alt_viz_html = "