Update app.py
app.py CHANGED
@@ -238,15 +238,58 @@ def visualize_logprobs(json_input, chunk=0, chunk_size=100):
 def analyze_confidence_signature(logprobs, tokens):
     if not logprobs or not tokens:
         return "No data for confidence signature analysis.", None
+
+    # Extract the top-token log probabilities
+    top_probs = [lps[0][1] if lps and lps[0][1] is not None else -float('inf') for lps in logprobs]
     if not any(p != -float('inf') for p in top_probs):
         return "No valid log probabilities for confidence analysis.", None
+
+    # Use a larger window for smoother trends
+    window_size = 30  # Increased from 20
+    moving_avg = np.convolve(top_probs, np.ones(window_size) / window_size, mode='valid')
+
+    # Calculate drop magnitudes
+    drops = np.diff(moving_avg)
+
+    # Use adaptive thresholding - only flag drops in the bottom 5% of all changes
+    drop_threshold = np.percentile(drops, 5)  # More selective
+    significant_drops = np.where(drops < drop_threshold)[0]
+
+    # Cluster nearby drops (within 10 tokens) to avoid reporting multiple points in the same reasoning shift
+    if len(significant_drops) > 0:
+        clustered_drops = [significant_drops[0]]
+        for drop in significant_drops[1:]:
+            if drop - clustered_drops[-1] > 10:  # At least 10 tokens apart
+                clustered_drops.append(drop)
+    else:
+        clustered_drops = []
+
+    # Look for context markers near drops
+    filtered_drops = []
+    reasoning_markers = ["therefore", "thus", "so", "hence", "wait", "but", "however", "actually"]
+
+    for drop in clustered_drops:
+        # Adjust index for convolution window
+        token_idx = drop + window_size - 1
+
+        # Check surrounding context (10 tokens before and after)
+        start_idx = max(0, token_idx - 10)
+        end_idx = min(len(tokens), token_idx + 10)
+        context = " ".join(tokens[start_idx:end_idx])
+
+        # Only keep drops near reasoning transition markers
+        if any(marker in context.lower() for marker in reasoning_markers):
+            drop_magnitude = drops[drop]
+            filtered_drops.append((token_idx, drop_magnitude, tokens[token_idx] if token_idx < len(tokens) else "End of trace"))
+
+    # Sort by drop magnitude (most negative diffs first, i.e. largest drops first)
+    filtered_drops.sort(key=lambda x: x[1])
+
+    if not filtered_drops:
+        return "No significant confidence shifts at reasoning transitions detected.", None
+
+    # Return at most the 3 most significant drops
+    return "Significant confidence shifts detected at reasoning transitions:", filtered_drops[:3]
 
 def detect_interpretation_pivots(logprobs, tokens):
     if not logprobs or not tokens:
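For context on how the new helper behaves, here is a minimal sketch of exercising it outside the Space. Everything in it is an assumption rather than part of the commit: the data shape is inferred from the indexing above (`lps[0][1]`), i.e. `logprobs` as a per-position list of `(token, logprob)` candidate pairs with the top candidate first; `np` is NumPy, as elsewhere in app.py; and the import path is hypothetical (a Space's app.py typically launches the UI on import, so in practice you might paste the function into a scratch script instead). Note also that the trace must be at least `window_size` (30) tokens long, since `np.convolve(..., mode='valid')` returns an empty array otherwise, and that each reported index is `drop + window_size - 1`, the position of the newest token entering the later averaging window.

import numpy as np

from app import analyze_confidence_signature  # hypothetical import; see note above

# Toy 200-token trace: confident throughout, with a sharp dip starting near token 110.
rng = np.random.default_rng(0)
tokens = ["tok"] * 200
for i in range(100, 150, 5):
    tokens[i] = "however"  # reasoning markers scattered around the dip

top_logprobs = np.full(200, -0.05)
top_logprobs[110:140] = -2.5                    # the confidence drop
top_logprobs += rng.normal(0, 0.01, size=200)   # small noise so the 5th percentile is meaningful

# Assumed structure: one candidate list per position, top candidate first.
logprobs = [[(tok, float(lp))] for tok, lp in zip(tokens, top_logprobs)]

message, drops = analyze_confidence_signature(logprobs, tokens)
print(message)
for token_idx, magnitude, token in drops or []:
    print(f"  drop of {magnitude:.3f} in the moving average at token {token_idx} ({token!r})")

The ascending sort on `drops[drop]` is what makes "largest first" work: the diffs are negative at confidence drops, so the steepest changes sort to the front. The 10-token clustering and the marker filter together keep the output to a handful of interpretable shift points rather than one hit per noisy diff.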