Spaces:

codelion
/

LogProbsVisualizer

Running

App Files Files Community

codelion committed on Feb 26

Commit

a83f370

verified ·

1 Parent(s): 527fd08

Update app.py (#1)

Browse files

- Update app.py (b7159a07f2b7b4eb2ba22113df0d4c04de90a6bc)

Files changed (1) hide show

app.py +82 -46

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ import base64
 import math
 import ast
 import logging
 # Set up logging
 logging.basicConfig(level=logging.DEBUG)
@@ -55,7 +56,7 @@ def ensure_float(value):
         return float(value)
     return None
-# Function to process and visualize log probs
 def visualize_logprobs(json_input):
     try:
         # Parse the input (handles both JSON and Python dictionaries)
@@ -69,30 +70,82 @@ def visualize_logprobs(json_input):
         else:
             raise ValueError("Input must be a list or dictionary with 'content' key")
-        # Extract tokens and log probs, skipping None or non-finite values
         tokens = []
         logprobs = []
         for entry in content:
             logprob = ensure_float(entry.get("logprob", None))
             if logprob is not None and math.isfinite(logprob):
                 tokens.append(entry["token"])
                 logprobs.append(logprob)
             else:
                 logger.debug("Skipping entry with logprob: %s (type: %s)", entry.get("logprob"), type(entry.get("logprob", None)))
-        # Prepare table data, handling None in top_logprobs
         table_data = []
-        for entry in content:
             logprob = ensure_float(entry.get("logprob", None))
-            # Only include entries with finite logprob and non-None top_logprobs
-            if (
-                logprob is not None
-                and math.isfinite(logprob)
-                and "top_logprobs" in entry
-                and entry["top_logprobs"] is not None
-            ):
                 token = entry["token"]
-                logger.debug("Processing token: %s, logprob: %s (type: %s)", token, logprob, type(logprob))
                 top_logprobs = entry["top_logprobs"]
                 # Ensure all values in top_logprobs are floats
                 finite_top_logprobs = {}
@@ -100,44 +153,15 @@ def visualize_logprobs(json_input):
                     float_value = ensure_float(value)
                     if float_value is not None and math.isfinite(float_value):
                         finite_top_logprobs[key] = float_value
                 # Extract top 3 alternatives from top_logprobs
-                top_3 = sorted(
-                    finite_top_logprobs.items(), key=lambda x: x[1], reverse=True
-                )[:3]
                 row = [token, f"{logprob:.4f}"]
                 for alt_token, alt_logprob in top_3:
                     row.append(f"{alt_token}: {alt_logprob:.4f}")
-                # Pad with empty strings if fewer than 3 alternatives
                 while len(row) < 5:
                     row.append("")
                 table_data.append(row)
-        # Create the plot
-        if logprobs:
-            plt.figure(figsize=(10, 5))
-            plt.plot(range(len(logprobs)), logprobs, marker="o", linestyle="-", color="b")
-            plt.title("Log Probabilities of Generated Tokens")
-            plt.xlabel("Token Position")
-            plt.ylabel("Log Probability")
-            plt.grid(True)
-            plt.xticks(range(len(logprobs)), tokens, rotation=45, ha="right")
-            plt.tight_layout()
-            # Save plot to a bytes buffer
-            buf = io.BytesIO()
-            plt.savefig(buf, format="png", bbox_inches="tight")
-            buf.seek(0)
-            plt.close()
-            # Convert to base64 for Gradio
-            img_bytes = buf.getvalue()
-            img_base64 = base64.b64encode(img_bytes).decode("utf-8")
-            img_html = f'<img src="data:image/png;base64,{img_base64}" style="max-width: 100%; height: auto;">'
-        else:
-            img_html = "No finite log probabilities to plot."
-        # Create DataFrame for the table
         df = (
             pd.DataFrame(
                 table_data,
@@ -177,11 +201,22 @@ def visualize_logprobs(json_input):
         else:
             colored_text_html = "No finite log probabilities to display."
-        return img_html, df, colored_text_html
     except Exception as e:
         logger.error("Visualization failed: %s", str(e))
-        return f"Error: {str(e)}", None, None
 # Gradio interface
 with gr.Blocks(title="Log Probability Visualizer") as app:
@@ -196,15 +231,16 @@ with gr.Blocks(title="Log Probability Visualizer") as app:
         placeholder="Paste your JSON (e.g., {\"content\": [...]}) or Python dict (e.g., {'content': [...]}) here...",
     )
-    plot_output = gr.HTML(label="Log Probability Plot")
     table_output = gr.Dataframe(label="Token Log Probabilities and Top Alternatives")
     text_output = gr.HTML(label="Colored Text (Confidence Visualization)")
     btn = gr.Button("Visualize")
     btn.click(
         fn=visualize_logprobs,
         inputs=json_input,
-        outputs=[plot_output, table_output, text_output],
     )
 app.launch()

 import math
 import ast
 import logging
+from matplotlib.widgets import Cursor
 # Set up logging
 logging.basicConfig(level=logging.DEBUG)
         return float(value)
     return None
+# Function to process and visualize log probs with hover and alternatives
 def visualize_logprobs(json_input):
     try:
         # Parse the input (handles both JSON and Python dictionaries)
         else:
             raise ValueError("Input must be a list or dictionary with 'content' key")
+        # Extract tokens, log probs, and top alternatives, skipping None or non-finite values
         tokens = []
         logprobs = []
+        top_alternatives = []  # List to store top 3 log probs (selected token + 2 alternatives)
         for entry in content:
             logprob = ensure_float(entry.get("logprob", None))
             if logprob is not None and math.isfinite(logprob):
                 tokens.append(entry["token"])
                 logprobs.append(logprob)
+                # Get top_logprobs, default to empty dict if None
+                top_probs = entry.get("top_logprobs", {})
+                # Ensure all values in top_logprobs are floats
+                finite_top_probs = {}
+                for key, value in top_probs.items():
+                    float_value = ensure_float(value)
+                    if float_value is not None and math.isfinite(float_value):
+                        finite_top_probs[key] = float_value
+                # Get the top 3 log probs (including the selected token)
+                all_probs = {entry["token"]: logprob}  # Add the selected token's logprob
+                all_probs.update(finite_top_probs)  # Add alternatives
+                sorted_probs = sorted(all_probs.items(), key=lambda x: x[1], reverse=True)
+                top_3 = sorted_probs[:3]  # Top 3 log probs (highest to lowest)
+                top_alternatives.append(top_3)
             else:
                 logger.debug("Skipping entry with logprob: %s (type: %s)", entry.get("logprob"), type(entry.get("logprob", None)))
+        # Create the plot with hover functionality
+        if logprobs:
+            fig, ax = plt.subplots(figsize=(10, 5))
+            scatter = ax.plot(range(len(logprobs)), logprobs, marker="o", linestyle="-", color="b", label="Selected Token")[0]
+            ax.set_title("Log Probabilities of Generated Tokens")
+            ax.set_xlabel("Token Position")
+            ax.set_ylabel("Log Probability")
+            ax.grid(True)
+            ax.set_xticks([])  # Hide X-axis labels by default
+            # Add hover functionality using Matplotlib's Cursor for tooltips
+            cursor = Cursor(ax, useblit=True, color='red', linewidth=1)
+            token_annotations = []
+            for i, (x, y) in enumerate(zip(range(len(logprobs)), logprobs)):
+                annotation = ax.annotate('', (x, y), xytext=(10, 10), textcoords='offset points', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8), visible=False)
+                token_annotations.append(annotation)
+            def on_hover(event):
+                if event.inaxes == ax:
+                    for i, (x, y) in enumerate(zip(range(len(logprobs)), logprobs)):
+                        contains, _ = scatter.contains(event)
+                        if contains and abs(event.xdata - x) < 0.5 and abs(event.ydata - y) < 0.5:
+                            token_annotations[i].set_text(tokens[i])
+                            token_annotations[i].set_visible(True)
+                            fig.canvas.draw_idle()
+                        else:
+                            token_annotations[i].set_visible(False)
+                            fig.canvas.draw_idle()
+            fig.canvas.mpl_connect('motion_notify_event', on_hover)
+            # Save plot to a bytes buffer
+            buf = io.BytesIO()
+            plt.savefig(buf, format="png", bbox_inches="tight", dpi=100)
+            buf.seek(0)
+            plt.close()
+            # Convert to base64 for Gradio
+            img_bytes = buf.getvalue()
+            img_base64 = base64.b64encode(img_bytes).decode("utf-8")
+            img_html = f'<img src="data:image/png;base64,{img_base64}" style="max-width: 100%; height: auto;">'
+        else:
+            img_html = "No finite log probabilities to plot."
+        # Create DataFrame for the table
         table_data = []
+        for i, entry in enumerate(content):
             logprob = ensure_float(entry.get("logprob", None))
+            if logprob is not None and math.isfinite(logprob) and "top_logprobs" in entry and entry["top_logprobs"] is not None:
                 token = entry["token"]
                 top_logprobs = entry["top_logprobs"]
                 # Ensure all values in top_logprobs are floats
                 finite_top_logprobs = {}
                     float_value = ensure_float(value)
                     if float_value is not None and math.isfinite(float_value):
                         finite_top_logprobs[key] = float_value
                 # Extract top 3 alternatives from top_logprobs
+                top_3 = sorted(finite_top_logprobs.items(), key=lambda x: x[1], reverse=True)[:3]
                 row = [token, f"{logprob:.4f}"]
                 for alt_token, alt_logprob in top_3:
                     row.append(f"{alt_token}: {alt_logprob:.4f}")
                 while len(row) < 5:
                     row.append("")
                 table_data.append(row)
         df = (
             pd.DataFrame(
                 table_data,
         else:
             colored_text_html = "No finite log probabilities to display."
+        # Create an alternative visualization for top 3 tokens
+        alt_viz_html = ""
+        if logprobs and top_alternatives:
+            alt_viz_html = "<h3>Top 3 Token Log Probabilities</h3><ul>"
+            for i, (token, probs) in enumerate(zip(tokens, top_alternatives)):
+                alt_viz_html += f"<li>Position {i} (Token: {token}):<br>"
+                for tok, prob in probs:
+                    alt_viz_html += f"{tok}: {prob:.4f}<br>"
+                alt_viz_html += "</li>"
+            alt_viz_html += "</ul>"
+        return img_html, df, colored_text_html, alt_viz_html
     except Exception as e:
         logger.error("Visualization failed: %s", str(e))
+        return f"Error: {str(e)}", None, None, None
 # Gradio interface
 with gr.Blocks(title="Log Probability Visualizer") as app:
         placeholder="Paste your JSON (e.g., {\"content\": [...]}) or Python dict (e.g., {'content': [...]}) here...",
     )
+    plot_output = gr.HTML(label="Log Probability Plot (Hover for Tokens)")
     table_output = gr.Dataframe(label="Token Log Probabilities and Top Alternatives")
     text_output = gr.HTML(label="Colored Text (Confidence Visualization)")
+    alt_viz_output = gr.HTML(label="Top 3 Token Log Probabilities")
     btn = gr.Button("Visualize")
     btn.click(
         fn=visualize_logprobs,
         inputs=json_input,
+        outputs=[plot_output, table_output, text_output, alt_viz_output],
     )
 app.launch()