Spaces:

codelion
/

LogProbsVisualizer

Running

App Files Community

codelion committed 29 days ago

Commit

9ba1537

verified ·

1 Parent(s): 0d41503

Update app.py

Browse files

Files changed (1) hide show

app.py +160 -62

app.py CHANGED Viewed

@@ -9,6 +9,8 @@ import ast
 import logging
 import numpy as np
 import plotly.graph_objects as go
 # Set up logging
 logging.basicConfig(level=logging.DEBUG)
@@ -24,24 +26,7 @@ def parse_input(json_input):
         return data
     except json.JSONDecodeError as e:
         logger.error("JSON parsing failed: %s (Input: %s)", str(e), json_input[:100] + "..." if len(json_input) > 100 else json_input)
-        try:
-            # If JSON fails, try to parse as Python literal (e.g., with single quotes), but only for JSON-like strings
-            data = ast.literal_eval(json_input)
-            logger.debug("Successfully parsed as Python literal")
-            # Convert Python dictionary to JSON-compatible format (replace single quotes with double quotes)
-            def dict_to_json(obj):
-                if isinstance(obj, dict):
-                    return {str(k): dict_to_json(v) for k, v in obj.items()}
-                elif isinstance(obj, list):
-                    return [dict_to_json(item) for item in obj]
-                else:
-                    return obj
-            converted_data = dict_to_json(data)
-            logger.debug("Converted to JSON-compatible format")
-            return converted_data
-        except (SyntaxError, ValueError) as e:
-            logger.error("Python literal parsing failed: %s (Input: %s)", str(e), json_input[:100] + "..." if len(json_input) > 100 else json_input)
-            raise ValueError(f"Malformed input: {str(e)}. Ensure property names are in double quotes (e.g., \"content\") and the format matches JSON (e.g., {{\"content\": [...]}}).")
 # Function to ensure a value is a float, converting from string if necessary
 def ensure_float(value):
@@ -69,10 +54,59 @@ def get_token(entry):
 def create_empty_figure(title):
     return go.Figure().update_layout(title=title, xaxis_title="", yaxis_title="", showlegend=False)
-# Function to process and visualize the full log probs with dynamic top_logprobs, handling missing tokens and JSON structure
-def visualize_logprobs(json_input):
     try:
-        # Parse the input (handles JSON only, as specified)
         data = parse_input(json_input)
         # Ensure data is a dictionary with 'content' key containing a list
@@ -94,14 +128,13 @@ def visualize_logprobs(json_input):
                 logger.warning("Skipping non-dictionary entry: %s", entry)
                 continue
             logprob = ensure_float(entry.get("logprob", None))
-            if logprob >= -100000:  # Include all entries with default 0.0, removing math.isfinite check
-                token = get_token(entry)  # Safely get token, defaulting to "Unknown" if missing
-                tokens.append(token)
                 logprobs.append(logprob)
                 # Get top_logprobs, default to empty dict if None
                 top_probs = entry.get("top_logprobs", {})
                 if top_probs is None:
-                    logger.debug("top_logprobs is None for token: %s, using empty dict", token)
                     top_probs = {}  # Default to empty dict for None
                 # Ensure all values in top_logprobs are floats and create a list of tuples
                 finite_top_probs = []
@@ -115,53 +148,61 @@ def visualize_logprobs(json_input):
             else:
                 logger.debug("Skipping entry with logprob: %s (type: %s)", entry.get("logprob"), type(entry.get("logprob", None)))
-        # Check if there's valid data after filtering (including default 0.0)
         if not logprobs or not tokens:
-            return (create_empty_figure("Log Probabilities of Generated Tokens"), None, "No tokens to display.", create_empty_figure("Top Token Log Probabilities"), create_empty_figure("Significant Probability Drops"))
         # 1. Main Log Probability Plot (Interactive Plotly)
         main_fig = go.Figure()
-        main_fig.add_trace(go.Scatter(x=list(range(len(logprobs))), y=logprobs, mode='markers+lines', name='Log Prob', marker=dict(color='blue')))
         main_fig.update_layout(
-            title="Log Probabilities of Generated Tokens",
-            xaxis_title="Token Position",
             yaxis_title="Log Probability",
             hovermode="closest",
             clickmode='event+select'
         )
         main_fig.update_traces(
-            customdata=[f"Token: {tok}, Log Prob: {prob:.4f}, Position: {i}" for i, (tok, prob) in enumerate(zip(tokens, logprobs))],
             hovertemplate='<b>%{customdata}</b><extra></extra>'
         )
         # 2. Probability Drop Analysis (Interactive Plotly)
-        if len(logprobs) < 2:
-            drops_fig = create_empty_figure("Significant Probability Drops")
         else:
-            drops = [logprobs[i+1] - logprobs[i] for i in range(len(logprobs)-1)]
             drops_fig = go.Figure()
             drops_fig.add_trace(go.Bar(x=list(range(len(drops))), y=drops, name='Drop', marker_color='red'))
             drops_fig.update_layout(
-                title="Significant Probability Drops",
-                xaxis_title="Token Position",
                 yaxis_title="Log Probability Drop",
                 hovermode="closest",
                 clickmode='event+select'
             )
             drops_fig.update_traces(
-                customdata=[f"Drop: {drop:.4f}, From: {tokens[i]} to {tokens[i+1]}, Position: {i}" for i, drop in enumerate(drops)],
                 hovertemplate='<b>%{customdata}</b><extra></extra>'
             )
         # Create DataFrame for the table with dynamic top_logprobs
         table_data = []
-        max_alternatives = max(len(alts) for alts in top_alternatives) if top_alternatives else 0
-        for i, entry in enumerate(content):
             if not isinstance(entry, dict):
                 continue
             logprob = ensure_float(entry.get("logprob", None))
             if logprob >= -100000 and "top_logprobs" in entry:  # Include all entries with default 0.0
-                token = get_token(entry)  # Safely get token, defaulting to "Unknown" if missing
                 top_logprobs = entry.get("top_logprobs", {})
                 if top_logprobs is None:
                     logger.debug("top_logprobs is None for token: %s, using empty dict", token)
@@ -191,38 +232,38 @@ def visualize_logprobs(json_input):
             else None
         )
-        # Generate colored text
-        if logprobs:
-            min_logprob = min(logprobs)
-            max_logprob = max(logprobs)
             if max_logprob == min_logprob:
-                normalized_probs = [0.5] * len(logprobs)
             else:
                 normalized_probs = [
-                    (lp - min_logprob) / (max_logprob - min_logprob) for lp in logprobs
                 ]
             colored_text = ""
-            for i, (token, norm_prob) in enumerate(zip(tokens, normalized_probs)):
                 r = int(255 * (1 - norm_prob))  # Red for low confidence
                 g = int(255 * norm_prob)        # Green for high confidence
                 b = 0
                 color = f"rgb({r}, {g}, {b})"
                 colored_text += f'<span style="color: {color}; font-weight: bold;">{token}</span>'
-                if i < len(tokens) - 1:
                     colored_text += " "
             colored_text_html = f"<p>{colored_text}</p>"
         else:
-            colored_text_html = "No tokens to display."
-        # Top Token Log Probabilities (Interactive Plotly, dynamic length)
-        alt_viz_fig = create_empty_figure("Top Token Log Probabilities") if not logprobs or not top_alternatives else go.Figure()
-        if logprobs and top_alternatives:
-            for i, (token, probs) in enumerate(zip(tokens, top_alternatives)):
                 for j, (alt_tok, prob) in enumerate(probs):
-                    alt_viz_fig.add_trace(go.Bar(x=[f"{token} (Pos {i})"], y=[prob], name=f"{alt_tok}", marker_color=['blue', 'green', 'red', 'purple', 'orange'][:len(probs)]))
             alt_viz_fig.update_layout(
-                title="Top Token Log Probabilities",
                 xaxis_title="Token (Position)",
                 yaxis_title="Log Probability",
                 barmode='stack',
@@ -230,21 +271,21 @@ def visualize_logprobs(json_input):
                 clickmode='event+select'
             )
             alt_viz_fig.update_traces(
-                customdata=[f"Token: {tok}, Alt: {alt}, Log Prob: {prob:.4f}, Position: {i}" for i, (tok, alts) in enumerate(zip(tokens, top_alternatives)) for alt, prob in alts],
                 hovertemplate='<b>%{customdata}</b><extra></extra>'
             )
-        return (main_fig, df, colored_text_html, alt_viz_fig, drops_fig)
     except Exception as e:
         logger.error("Visualization failed: %s (Input: %s)", str(e), json_input[:100] + "..." if len(json_input) > 100 else json_input)
-        return (create_empty_figure("Log Probabilities of Generated Tokens"), None, "No finite log probabilities to display.", create_empty_figure("Top Token Log Probabilities"), create_empty_figure("Significant Probability Drops"))
-# Gradio interface with full dataset visualization, dynamic top_logprobs, and robust JSON handling
 with gr.Blocks(title="Log Probability Visualizer") as app:
     gr.Markdown("# Log Probability Visualizer")
     gr.Markdown(
-        "Paste your JSON log prob data below to visualize all tokens at once. Fixed filter ≥ -100000, dynamic number of top_logprobs, handles missing or null fields."
     )
     with gr.Row():
@@ -253,6 +294,7 @@ with gr.Blocks(title="Log Probability Visualizer") as app:
             lines=10,
             placeholder="Paste your JSON (e.g., {\"content\": [{\"bytes\": [44], \"logprob\": 0.0, \"token\": \",\", \"top_logprobs\": {\" so\": -13.8046875, \".\": -13.8046875, \"，\": -13.640625}}]}).",
         )
     with gr.Row():
         plot_output = gr.Plot(label="Log Probability Plot (Click for Tokens)")
@@ -265,11 +307,67 @@ with gr.Blocks(title="Log Probability Visualizer") as app:
     with gr.Row():
         text_output = gr.HTML(label="Colored Text (Confidence Visualization)")
     btn = gr.Button("Visualize")
     btn.click(
         fn=visualize_logprobs,
-        inputs=[json_input],
-        outputs=[plot_output, table_output, text_output, alt_viz_output, drops_output],
     )
 app.launch()

 import logging
 import numpy as np
 import plotly.graph_objects as go
+import asyncio
+import anyio
 # Set up logging
 logging.basicConfig(level=logging.DEBUG)
         return data
     except json.JSONDecodeError as e:
         logger.error("JSON parsing failed: %s (Input: %s)", str(e), json_input[:100] + "..." if len(json_input) > 100 else json_input)
+        raise ValueError(f"Malformed input: {str(e)}. Ensure property names are in double quotes (e.g., \"content\") and the format matches JSON (e.g., {{\"content\": [...]}}).")
 # Function to ensure a value is a float, converting from string if necessary
 def ensure_float(value):
 def create_empty_figure(title):
     return go.Figure().update_layout(title=title, xaxis_title="", yaxis_title="", showlegend=False)
+# Precompute the next chunk asynchronously
+async def precompute_chunk(json_input, chunk_size, current_chunk):
+    try:
+        data = parse_input(json_input)
+        content = data.get("content", []) if isinstance(data, dict) else data
+        if not isinstance(content, list):
+            raise ValueError("Content must be a list of entries")
+        tokens = []
+        logprobs = []
+        top_alternatives = []
+        for entry in content:
+            if not isinstance(entry, dict):
+                logger.warning("Skipping non-dictionary entry: %s", entry)
+                continue
+            logprob = ensure_float(entry.get("logprob", None))
+            if logprob >= -100000:  # Include all entries with default 0.0
+                tokens.append(get_token(entry))
+                logprobs.append(logprob)
+                top_probs = entry.get("top_logprobs", {})
+                if top_probs is None:
+                    logger.debug("top_logprobs is None for token: %s, using empty dict", get_token(entry))
+                    top_probs = {}
+                finite_top_probs = []
+                for key, value in top_probs.items():
+                    float_value = ensure_float(value)
+                    if float_value is not None and math.isfinite(float_value):
+                        finite_top_probs.append((key, float_value))
+                sorted_probs = sorted(finite_top_probs, key=lambda x: x[1], reverse=True)
+                top_alternatives.append(sorted_probs)
+        if not tokens or not logprobs:
+            return None, None, None
+        next_chunk = current_chunk + 1
+        start_idx = next_chunk * chunk_size
+        end_idx = min((next_chunk + 1) * chunk_size, len(tokens))
+        if start_idx >= len(tokens):
+            return None, None, None
+        paginated_tokens = tokens[start_idx:end_idx]
+        paginated_logprobs = logprobs[start_idx:end_idx]
+        paginated_alternatives = top_alternatives[start_idx:end_idx]
+        return paginated_tokens, paginated_logprobs, paginated_alternatives
+    except Exception as e:
+        logger.error("Precomputation failed for chunk %d: %s", current_chunk + 1, str(e))
+        return None, None, None
+# Function to process and visualize a chunk of log probs with dynamic top_logprobs
+def visualize_logprobs(json_input, chunk=0, chunk_size=1000):
     try:
+        # Parse the input (handles JSON only)
         data = parse_input(json_input)
         # Ensure data is a dictionary with 'content' key containing a list
                 logger.warning("Skipping non-dictionary entry: %s", entry)
                 continue
             logprob = ensure_float(entry.get("logprob", None))
+            if logprob >= -100000:  # Include all entries with default 0.0
+                tokens.append(get_token(entry))
                 logprobs.append(logprob)
                 # Get top_logprobs, default to empty dict if None
                 top_probs = entry.get("top_logprobs", {})
                 if top_probs is None:
+                    logger.debug("top_logprobs is None for token: %s, using empty dict", get_token(entry))
                     top_probs = {}  # Default to empty dict for None
                 # Ensure all values in top_logprobs are floats and create a list of tuples
                 finite_top_probs = []
             else:
                 logger.debug("Skipping entry with logprob: %s (type: %s)", entry.get("logprob"), type(entry.get("logprob", None)))
+        # Check if there's valid data after filtering
         if not logprobs or not tokens:
+            return (create_empty_figure("Log Probabilities of Generated Tokens"), None, "No tokens to display.", create_empty_figure("Top Token Log Probabilities"), create_empty_figure("Significant Probability Drops"), 1, 0)
+        # Paginate data for chunks of 1,000 tokens
+        total_chunks = max(1, (len(logprobs) + chunk_size - 1) // chunk_size)
+        start_idx = chunk * chunk_size
+        end_idx = min((chunk + 1) * chunk_size, len(logprobs))
+        paginated_tokens = tokens[start_idx:end_idx]
+        paginated_logprobs = logprobs[start_idx:end_idx]
+        paginated_alternatives = top_alternatives[start_idx:end_idx] if top_alternatives else []
         # 1. Main Log Probability Plot (Interactive Plotly)
         main_fig = go.Figure()
+        main_fig.add_trace(go.Scatter(x=list(range(len(paginated_logprobs))), y=paginated_logprobs, mode='markers+lines', name='Log Prob', marker=dict(color='blue')))
         main_fig.update_layout(
+            title="Log Probabilities of Generated Tokens (Chunk %d)" % (chunk + 1),
+            xaxis_title="Token Position (within chunk)",
             yaxis_title="Log Probability",
             hovermode="closest",
             clickmode='event+select'
         )
         main_fig.update_traces(
+            customdata=[f"Token: {tok}, Log Prob: {prob:.4f}, Position: {i+start_idx}" for i, (tok, prob) in enumerate(zip(paginated_tokens, paginated_logprobs))],
             hovertemplate='<b>%{customdata}</b><extra></extra>'
         )
         # 2. Probability Drop Analysis (Interactive Plotly)
+        if len(paginated_logprobs) < 2:
+            drops_fig = create_empty_figure("Significant Probability Drops (Chunk %d)" % (chunk + 1))
         else:
+            drops = [paginated_logprobs[i+1] - paginated_logprobs[i] for i in range(len(paginated_logprobs)-1)]
             drops_fig = go.Figure()
             drops_fig.add_trace(go.Bar(x=list(range(len(drops))), y=drops, name='Drop', marker_color='red'))
             drops_fig.update_layout(
+                title="Significant Probability Drops (Chunk %d)" % (chunk + 1),
+                xaxis_title="Token Position (within chunk)",
                 yaxis_title="Log Probability Drop",
                 hovermode="closest",
                 clickmode='event+select'
             )
             drops_fig.update_traces(
+                customdata=[f"Drop: {drop:.4f}, From: {paginated_tokens[i]} to {paginated_tokens[i+1]}, Position: {i+start_idx}" for i, drop in enumerate(drops)],
                 hovertemplate='<b>%{customdata}</b><extra></extra>'
             )
         # Create DataFrame for the table with dynamic top_logprobs
         table_data = []
+        max_alternatives = max(len(alts) for alts in paginated_alternatives) if paginated_alternatives else 0
+        for i, entry in enumerate(content[start_idx:end_idx]):
             if not isinstance(entry, dict):
                 continue
             logprob = ensure_float(entry.get("logprob", None))
             if logprob >= -100000 and "top_logprobs" in entry:  # Include all entries with default 0.0
+                token = get_token(entry)
                 top_logprobs = entry.get("top_logprobs", {})
                 if top_logprobs is None:
                     logger.debug("top_logprobs is None for token: %s, using empty dict", token)
             else None
         )
+        # Generate colored text (for the current chunk)
+        if paginated_logprobs:
+            min_logprob = min(paginated_logprobs)
+            max_logprob = max(paginated_logprobs)
             if max_logprob == min_logprob:
+                normalized_probs = [0.5] * len(paginated_logprobs)
             else:
                 normalized_probs = [
+                    (lp - min_logprob) / (max_logprob - min_logprob) for lp in paginated_logprobs
                 ]
             colored_text = ""
+            for i, (token, norm_prob) in enumerate(zip(paginated_tokens, normalized_probs)):
                 r = int(255 * (1 - norm_prob))  # Red for low confidence
                 g = int(255 * norm_prob)        # Green for high confidence
                 b = 0
                 color = f"rgb({r}, {g}, {b})"
                 colored_text += f'<span style="color: {color}; font-weight: bold;">{token}</span>'
+                if i < len(paginated_tokens) - 1:
                     colored_text += " "
             colored_text_html = f"<p>{colored_text}</p>"
         else:
+            colored_text_html = "No tokens to display in this chunk."
+        # Top Token Log Probabilities (Interactive Plotly, dynamic length, for the current chunk)
+        alt_viz_fig = create_empty_figure("Top Token Log Probabilities (Chunk %d)" % (chunk + 1)) if not paginated_logprobs or not paginated_alternatives else go.Figure()
+        if paginated_logprobs and paginated_alternatives:
+            for i, (token, probs) in enumerate(zip(paginated_tokens, paginated_alternatives)):
                 for j, (alt_tok, prob) in enumerate(probs):
+                    alt_viz_fig.add_trace(go.Bar(x=[f"{token} (Pos {i+start_idx})"], y=[prob], name=f"{alt_tok}", marker_color=['blue', 'green', 'red', 'purple', 'orange'][:len(probs)]))
             alt_viz_fig.update_layout(
+                title="Top Token Log Probabilities (Chunk %d)" % (chunk + 1),
                 xaxis_title="Token (Position)",
                 yaxis_title="Log Probability",
                 barmode='stack',
                 clickmode='event+select'
             )
             alt_viz_fig.update_traces(
+                customdata=[f"Token: {tok}, Alt: {alt}, Log Prob: {prob:.4f}, Position: {i+start_idx}" for i, (tok, alts) in enumerate(zip(paginated_tokens, paginated_alternatives)) for alt, prob in alts],
                 hovertemplate='<b>%{customdata}</b><extra></extra>'
             )
+        return (main_fig, df, colored_text_html, alt_viz_fig, drops_fig, total_chunks, chunk)
     except Exception as e:
         logger.error("Visualization failed: %s (Input: %s)", str(e), json_input[:100] + "..." if len(json_input) > 100 else json_input)
+        return (create_empty_figure("Log Probabilities of Generated Tokens"), None, "No finite log probabilities to display.", create_empty_figure("Top Token Log Probabilities"), create_empty_figure("Significant Probability Drops"), 1, 0)
+# Gradio interface with chunked visualization and proactive precomputation
 with gr.Blocks(title="Log Probability Visualizer") as app:
     gr.Markdown("# Log Probability Visualizer")
     gr.Markdown(
+        "Paste your JSON log prob data below to visualize tokens in chunks of 1,000. Fixed filter ≥ -100000, dynamic number of top_logprobs, handles missing or null fields. Next chunk is precomputed proactively."
     )
     with gr.Row():
             lines=10,
             placeholder="Paste your JSON (e.g., {\"content\": [{\"bytes\": [44], \"logprob\": 0.0, \"token\": \",\", \"top_logprobs\": {\" so\": -13.8046875, \".\": -13.8046875, \"，\": -13.640625}}]}).",
         )
+        chunk = gr.Number(value=0, label="Current Chunk", precision=0, minimum=0)
     with gr.Row():
         plot_output = gr.Plot(label="Log Probability Plot (Click for Tokens)")
     with gr.Row():
         text_output = gr.HTML(label="Colored Text (Confidence Visualization)")
+    with gr.Row():
+        prev_btn = gr.Button("Previous Chunk")
+        next_btn = gr.Button("Next Chunk")
+        total_chunks_output = gr.Number(label="Total Chunks", interactive=False)
+    # Precomputed next chunk state (hidden)
+    precomputed_next = gr.State(value=None)
     btn = gr.Button("Visualize")
     btn.click(
         fn=visualize_logprobs,
+        inputs=[json_input, chunk],
+        outputs=[plot_output, table_output, text_output, alt_viz_output, drops_output, total_chunks_output, chunk],
+    )
+    # Precompute next chunk proactively when on current chunk
+    async def precompute_next_chunk(json_input, current_chunk, precomputed_next):
+        if precomputed_next is not None:
+            return precomputed_next  # Use cached precomputed chunk if available
+        next_tokens, next_logprobs, next_alternatives = await precompute_chunk(json_input, 1000, current_chunk)
+        if next_tokens is None or next_logprobs is None or next_alternatives is None:
+            return None
+        return (next_tokens, next_logprobs, next_alternatives)
+    # Update chunk on button clicks
+    def update_chunk(json_input, current_chunk, action, precomputed_next=None):
+        total_chunks = visualize_logprobs(json_input, 0)[5]  # Get total chunks
+        if action == "prev" and current_chunk > 0:
+            current_chunk -= 1
+        elif action == "next" and current_chunk < total_chunks - 1:
+            current_chunk += 1
+            # If precomputed next chunk exists, use it; otherwise, compute it
+            if precomputed_next:
+                next_tokens, next_logprobs, next_alternatives = precomputed_next
+                if next_tokens and next_logprobs and next_alternatives:
+                    logger.debug("Using precomputed next chunk for chunk %d", current_chunk)
+                    return visualize_logprobs(json_input, current_chunk)
+        return visualize_logprobs(json_input, current_chunk)
+    prev_btn.click(
+        fn=update_chunk,
+        inputs=[json_input, chunk, gr.State(value="prev"), precomputed_next],
+        outputs=[plot_output, table_output, text_output, alt_viz_output, drops_output, total_chunks_output, chunk],
+    )
+    next_btn.click(
+        fn=update_chunk,
+        inputs=[json_input, chunk, gr.State(value="next"), precomputed_next],
+        outputs=[plot_output, table_output, text_output, alt_viz_output, drops_output, total_chunks_output, chunk],
+    )
+    # Trigger precomputation when chunk changes (via button clicks or initial load)
+    def trigger_precomputation(json_input, current_chunk):
+        asyncio.create_task(precompute_next_chunk(json_input, current_chunk, None))
+        return gr.update(value=current_chunk)
+    # Use a dummy event to trigger precomputation on chunk change (simplified for Gradio)
+    chunk.change(
+        fn=trigger_precomputation,
+        inputs=[json_input, chunk],
+        outputs=[chunk],
     )
 app.launch()