Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Pratik Bhavsar
committed on
Commit
·
b8ddec2
1
Parent(s):
a64af65
improved looks
Browse files
- data_loader.py +452 -287
- results.csv +1 -1
- tabs/leaderboard.py +3 -3
data_loader.py
CHANGED
@@ -40,48 +40,6 @@ CATEGORIES = {
|
|
40 |
"Composite": ["BFCL_v3_multi_turn_composite"],
|
41 |
}
|
42 |
|
43 |
-
METHODOLOGY = """# Methodology
|
44 |
-
## Overview
|
45 |
-
The Agent Leaderboard evaluates language models' ability to effectively use tools and maintain coherent multi-turn conversations.
|
46 |
-
The evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.
|
47 |
-
|
48 |
-
## Tool Selection Quality Metric
|
49 |
-
Models are evaluated on their ability to:
|
50 |
-
- Correctly identify when tools are needed
|
51 |
-
- Select the appropriate tool for the task
|
52 |
-
- Handle cases where no suitable tool exists
|
53 |
-
- Maintain context across multiple interactions
|
54 |
-
|
55 |
-
## Dataset Structure
|
56 |
-
| Type | Samples | Category | Dataset Name | Purpose |
|
57 |
-
|------|---------|-----------|--------------|----------|
|
58 |
-
| Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
|
59 |
-
| | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
|
60 |
-
| | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
|
61 |
-
| | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
|
62 |
-
| Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
|
63 |
-
| | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
|
64 |
-
| | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
|
65 |
-
| | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
|
66 |
-
| | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
|
67 |
-
"""
|
68 |
-
|
69 |
-
|
70 |
-
INSIGHTS = """
|
71 |
-
# Key Insights from Agent Leaderboard
|
72 |
-
|
73 |
-
| Category | Finding | Implications |
|
74 |
-
|----------|---------|--------------|
|
75 |
-
| Performance Leader | Gemini-2.0-flash dominates with excellent performance at a fraction of typical costs | Demonstrates that top-tier performance is achievable without premium pricing |
|
76 |
-
| Cost vs Performance | Top 3 models span a 200x price difference yet show only 6% performance gap | Challenges traditional pricing assumptions in the market and suggests potential overpricing at the high end |
|
77 |
-
| Open Source Models | Qwen-72b matches premium models in safety and context handling at lower cost | Signals growing maturity in open-source models and potential for broader adoption |
|
78 |
-
| Safety Features | While irrelevance detection is widely solved, tool miss detection remains a challenge | Highlights uneven development in safety features and areas needing focused improvement |
|
79 |
-
| Edge Case Handling | Models still struggle with maintaining context in complex scenarios | Indicates need for architectural improvements in handling sophisticated interactions |
|
80 |
-
| Architecture Impact | Models show clear trade-offs between context handling and parallel execution | Suggests need for specialized models or hybrid approaches for different use cases |
|
81 |
-
|
82 |
-
**Note:** Findings based on comprehensive evaluation across multiple tasks and scenarios.
|
83 |
-
"""
|
84 |
-
|
85 |
|
86 |
chat_css = """
|
87 |
/* Container styles */
|
@@ -275,265 +233,48 @@ h3 {
|
|
275 |
}
|
276 |
"""
|
277 |
|
278 |
-
|
279 |
-
# Updated header and cards with theme awareness
|
280 |
-
|
281 |
-
HEADER_CONTENT = """
|
282 |
<style>
|
283 |
@media (prefers-color-scheme: dark) {
|
284 |
:root {
|
285 |
-
--bg-primary:
|
286 |
-
--bg-secondary: rgba(
|
287 |
-
--bg-hover: rgba(
|
288 |
--text-primary: #ffffff;
|
289 |
-
--text-secondary: #
|
290 |
--text-tertiary: #e2e8f0;
|
291 |
-
--border-color: rgba(
|
292 |
-
--border-hover: rgba(
|
293 |
-
--card-bg: rgba(17, 17, 27, 0.
|
294 |
-
--accent-color: #
|
295 |
--accent-bg: rgba(79, 70, 229, 0.1);
|
|
|
|
|
|
|
|
|
296 |
}
|
297 |
}
|
298 |
|
299 |
@media (prefers-color-scheme: light) {
|
300 |
:root {
|
301 |
-
--bg-primary:
|
302 |
-
--bg-secondary: rgba(243, 244, 246, 0.
|
303 |
--bg-hover: rgba(229, 231, 235, 0.95);
|
304 |
-
--text-primary: #
|
305 |
-
--text-secondary: #
|
306 |
-
--text-tertiary: #
|
307 |
-
--border-color: rgba(
|
308 |
-
--border-hover: rgba(
|
309 |
-
--card-bg: rgba(249, 250, 251, 0.
|
310 |
--accent-color: #4F46E5;
|
311 |
--accent-bg: rgba(79, 70, 229, 0.1);
|
|
|
|
|
|
|
|
|
312 |
}
|
313 |
}
|
314 |
-
|
315 |
-
.header-wrapper {
|
316 |
-
padding: 3rem 2rem;
|
317 |
-
background: var(--bg-primary);
|
318 |
-
border-radius: 16px;
|
319 |
-
display: flex;
|
320 |
-
flex-direction: column;
|
321 |
-
align-items: center;
|
322 |
-
text-align: center;
|
323 |
-
}
|
324 |
-
|
325 |
-
.header-wrapper a {
|
326 |
-
color: var(--text-primary) !important;
|
327 |
-
text-decoration: none !important;
|
328 |
-
}
|
329 |
-
|
330 |
-
.description {
|
331 |
-
color: var(--text-primary);
|
332 |
-
font-size: 1.1rem;
|
333 |
-
line-height: 1.6;
|
334 |
-
max-width: 800px;
|
335 |
-
margin: 0 auto 2rem;
|
336 |
-
text-align: center;
|
337 |
-
}
|
338 |
-
|
339 |
-
.actions {
|
340 |
-
display: flex;
|
341 |
-
gap: 1rem;
|
342 |
-
justify-content: center;
|
343 |
-
margin-bottom: 2rem;
|
344 |
-
color: var(--text-primary);
|
345 |
-
}
|
346 |
-
|
347 |
-
.action-button {
|
348 |
-
display: flex;
|
349 |
-
align-items: center;
|
350 |
-
gap: 0.5rem;
|
351 |
-
padding: 0.75rem 1.5rem;
|
352 |
-
background: var(--bg-secondary);
|
353 |
-
border: 1px solid var(--border-color);
|
354 |
-
border-radius: 100px;
|
355 |
-
color: var(--text-primary) !important;
|
356 |
-
text-decoration: none !important;
|
357 |
-
font-size: 0.95rem;
|
358 |
-
transition: all 0.2s ease;
|
359 |
-
}
|
360 |
-
|
361 |
-
.action-button:hover {
|
362 |
-
background: var(--bg-hover);
|
363 |
-
border-color: var(--border-hover);
|
364 |
-
color: var(--text-primary) !important;
|
365 |
-
}
|
366 |
-
|
367 |
-
.update-info {
|
368 |
-
color: var(--text-secondary);
|
369 |
-
font-size: 0.9rem;
|
370 |
-
margin-bottom: 3rem;
|
371 |
-
}
|
372 |
-
|
373 |
-
.features-grid {
|
374 |
-
display: grid;
|
375 |
-
grid-template-columns: repeat(3, 1fr);
|
376 |
-
gap: 1.5rem;
|
377 |
-
width: 100%;
|
378 |
-
max-width: 1200px;
|
379 |
-
}
|
380 |
-
|
381 |
-
.feature-card {
|
382 |
-
background: var(--card-bg);
|
383 |
-
border: 1px solid var(--border-color);
|
384 |
-
border-radius: 16px;
|
385 |
-
padding: 2rem;
|
386 |
-
text-align: left;
|
387 |
-
}
|
388 |
-
|
389 |
-
.feature-icon {
|
390 |
-
background: var(--accent-bg);
|
391 |
-
width: 40px;
|
392 |
-
height: 40px;
|
393 |
-
border-radius: 12px;
|
394 |
-
display: flex;
|
395 |
-
align-items: center;
|
396 |
-
justify-content: center;
|
397 |
-
margin-bottom: 1.5rem;
|
398 |
-
}
|
399 |
-
|
400 |
-
.feature-title {
|
401 |
-
color: var(--text-primary);
|
402 |
-
font-size: 1.25rem;
|
403 |
-
font-weight: 600;
|
404 |
-
margin-bottom: 1rem;
|
405 |
-
}
|
406 |
-
|
407 |
-
.feature-description {
|
408 |
-
color: var(--text-secondary);
|
409 |
-
font-size: 0.95rem;
|
410 |
-
margin-bottom: 1.5rem;
|
411 |
-
}
|
412 |
-
|
413 |
-
.feature-list {
|
414 |
-
list-style: none;
|
415 |
-
padding: 0;
|
416 |
-
margin: 0;
|
417 |
-
display: flex;
|
418 |
-
flex-direction: column;
|
419 |
-
gap: 0.75rem;
|
420 |
-
}
|
421 |
-
|
422 |
-
.feature-list li {
|
423 |
-
color: var(--text-tertiary);
|
424 |
-
font-size: 0.95rem;
|
425 |
-
display: flex;
|
426 |
-
align-items: center;
|
427 |
-
gap: 0.5rem;
|
428 |
-
}
|
429 |
-
|
430 |
-
.feature-list li::before {
|
431 |
-
content: '';
|
432 |
-
width: 6px;
|
433 |
-
height: 6px;
|
434 |
-
background: var(--accent-color);
|
435 |
-
border-radius: 50%;
|
436 |
-
flex-shrink: 0;
|
437 |
-
}
|
438 |
-
|
439 |
-
/* Force all links to match theme */
|
440 |
-
.header-wrapper a:link,
|
441 |
-
.header-wrapper a:visited,
|
442 |
-
.header-wrapper a:hover,
|
443 |
-
.header-wrapper a:active {
|
444 |
-
color: var(--text-primary) !important;
|
445 |
-
}
|
446 |
-
|
447 |
-
/* Title specific styles */
|
448 |
-
.main-title {
|
449 |
-
color: var(--text-primary);
|
450 |
-
font-size: 48px;
|
451 |
-
font-weight: 700;
|
452 |
-
margin: 40px 0;
|
453 |
-
text-align: center;
|
454 |
-
}
|
455 |
-
|
456 |
-
.subtitle {
|
457 |
-
color: var(--text-secondary);
|
458 |
-
margin-bottom: 2rem;
|
459 |
-
}
|
460 |
</style>
|
461 |
-
|
462 |
-
<div class="header-wrapper">
|
463 |
-
<h1 class="main-title">Agent Leaderboard</h1>
|
464 |
-
<h2 class="subtitle">Comprehensive multi-benchmark evaluation for tool calling</h2>
|
465 |
-
|
466 |
-
<div class="actions">
|
467 |
-
<a href="#" class="action-button">
|
468 |
-
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
469 |
-
<path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
|
470 |
-
<line x1="8" y1="12" x2="16" y2="12"/>
|
471 |
-
</svg>
|
472 |
-
Blog
|
473 |
-
</a>
|
474 |
-
<a href="#" class="action-button">
|
475 |
-
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
476 |
-
<path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
|
477 |
-
</svg>
|
478 |
-
GitHub
|
479 |
-
</a>
|
480 |
-
<a href="#" class="action-button">
|
481 |
-
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
482 |
-
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
|
483 |
-
<polyline points="7 10 12 15 17 10"/>
|
484 |
-
<line x1="12" y1="15" x2="12" y2="3"/>
|
485 |
-
</svg>
|
486 |
-
Dataset
|
487 |
-
</a>
|
488 |
-
</div>
|
489 |
-
"""
|
490 |
-
|
491 |
-
CARDS = """
|
492 |
-
<div class="features-grid">
|
493 |
-
<div class="feature-card">
|
494 |
-
<div class="feature-icon">
|
495 |
-
<svg width="24" height="24" fill="none" stroke="var(--accent-color)" stroke-width="2" viewBox="0 0 24 24">
|
496 |
-
<path d="M22 12h-4l-3 9L9 3l-3 9H2"/>
|
497 |
-
</svg>
|
498 |
-
</div>
|
499 |
-
<h3 class="feature-title">Make Better Decisions</h3>
|
500 |
-
<ul class="feature-list">
|
501 |
-
<li>Cost-effectiveness analysis</li>
|
502 |
-
<li>Business impact metrics</li>
|
503 |
-
<li>Vendor strategy insights</li>
|
504 |
-
</ul>
|
505 |
-
</div>
|
506 |
-
|
507 |
-
<div class="feature-card">
|
508 |
-
<div class="feature-icon">
|
509 |
-
<svg width="24" height="24" fill="none" stroke="var(--accent-color)" stroke-width="2" viewBox="0 0 24 24">
|
510 |
-
<path d="M21 16V8a2 2 0 0 0-1-1.73l-7-4a2 2 0 0 0-2 0l-7 4A2 2 0 0 0 3 8v8a2 2 0 0 0 1 1.73l7 4a2 2 0 0 0 2 0l7-4A2 2 0 0 0 21 16z"/>
|
511 |
-
</svg>
|
512 |
-
</div>
|
513 |
-
<h3 class="feature-title">360° Domain Evaluation</h3>
|
514 |
-
<ul class="feature-list">
|
515 |
-
<li>Cross-domain evaluation</li>
|
516 |
-
<li>Real-world use cases</li>
|
517 |
-
<li>Edge case evaluation</li>
|
518 |
-
</ul>
|
519 |
-
</div>
|
520 |
-
|
521 |
-
<div class="feature-card">
|
522 |
-
<div class="feature-icon">
|
523 |
-
<svg width="24" height="24" fill="none" stroke="var(--accent-color)" stroke-width="2" viewBox="0 0 24 24">
|
524 |
-
<path d="M21 2v6h-6M3 12a9 9 0 0 1 15-6.7L21 8M3 12a9 9 0 0 0 15 6.7L21 16M21 22v-6h-6"/>
|
525 |
-
</svg>
|
526 |
-
</div>
|
527 |
-
<h3 class="feature-title">Updated Periodically</h3>
|
528 |
-
<ul class="feature-list">
|
529 |
-
<li>11 private models evaluated</li>
|
530 |
-
<li>5 open source models included</li>
|
531 |
-
<li>Monthly model additions</li>
|
532 |
-
</ul>
|
533 |
-
</div>
|
534 |
-
</div>
|
535 |
-
|
536 |
-
</div>
|
537 |
"""
|
538 |
|
539 |
DESCRIPTION_HTML = """
|
@@ -558,7 +299,7 @@ DESCRIPTION_HTML = """
|
|
558 |
">
|
559 |
🎯 Purpose
|
560 |
<span style="
|
561 |
-
background: var(--accent-
|
562 |
color: white;
|
563 |
padding: 4px 12px;
|
564 |
border-radius: 100px;
|
@@ -570,8 +311,7 @@ DESCRIPTION_HTML = """
|
|
570 |
margin: 0;
|
571 |
line-height: 1.6;
|
572 |
">
|
573 |
-
|
574 |
-
language models' ability to effectively utilize tools and functions in complex scenarios.
|
575 |
</p>
|
576 |
|
577 |
<div style="
|
@@ -636,3 +376,428 @@ DESCRIPTION_HTML = """
|
|
636 |
</div>
|
637 |
</div>
|
638 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
"Composite": ["BFCL_v3_multi_turn_composite"],
|
41 |
}
|
42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
chat_css = """
|
45 |
/* Container styles */
|
|
|
233 |
}
|
234 |
"""
|
235 |
|
236 |
+
COMMON = """
|
|
|
|
|
|
|
237 |
<style>
|
238 |
@media (prefers-color-scheme: dark) {
|
239 |
:root {
|
240 |
+
--bg-primary: #0B0B19;
|
241 |
+
--bg-secondary: rgba(19, 19, 37, 0.4);
|
242 |
+
--bg-hover: rgba(30, 30, 45, 0.95);
|
243 |
--text-primary: #ffffff;
|
244 |
+
--text-secondary: #e2e8f0;
|
245 |
--text-tertiary: #e2e8f0;
|
246 |
+
--border-color: rgba(31, 41, 55, 0.5);
|
247 |
+
--border-hover: rgba(79, 70, 229, 0.4);
|
248 |
+
--card-bg: rgba(17, 17, 27, 0.4);
|
249 |
+
--accent-color: #ffffff;
|
250 |
--accent-bg: rgba(79, 70, 229, 0.1);
|
251 |
+
--blue-gradient: linear-gradient(45deg, #60A5FA, #3B82F6);
|
252 |
+
--purple-gradient: linear-gradient(45deg, #A78BFA, #8B5CF6);
|
253 |
+
--pink-gradient: linear-gradient(45deg, #F472B6, #EC4899);
|
254 |
+
--shadow-color: rgba(0, 0, 0, 0.2);
|
255 |
}
|
256 |
}
|
257 |
|
258 |
@media (prefers-color-scheme: light) {
|
259 |
:root {
|
260 |
+
--bg-primary: #ffffff;
|
261 |
+
--bg-secondary: rgba(243, 244, 246, 0.4);
|
262 |
--bg-hover: rgba(229, 231, 235, 0.95);
|
263 |
+
--text-primary: #1F2937;
|
264 |
+
--text-secondary: #4B5563;
|
265 |
+
--text-tertiary: #6B7280;
|
266 |
+
--border-color: rgba(209, 213, 219, 0.5);
|
267 |
+
--border-hover: rgba(79, 70, 229, 0.4);
|
268 |
+
--card-bg: rgba(249, 250, 251, 0.4);
|
269 |
--accent-color: #4F46E5;
|
270 |
--accent-bg: rgba(79, 70, 229, 0.1);
|
271 |
+
--blue-gradient: linear-gradient(45deg, #3B82F6, #2563EB);
|
272 |
+
--purple-gradient: linear-gradient(45deg, #8B5CF6, #EF43CD);
|
273 |
+
--pink-gradient: linear-gradient(45deg, #EC4899, #DB2777);
|
274 |
+
--shadow-color: rgba(0, 0, 0, 0.1);
|
275 |
}
|
276 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
277 |
</style>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
278 |
"""
|
279 |
|
280 |
DESCRIPTION_HTML = """
|
|
|
299 |
">
|
300 |
🎯 Purpose
|
301 |
<span style="
|
302 |
+
background: linear-gradient(to right, var(--accent-blue), var(--accent-purple));
|
303 |
color: white;
|
304 |
padding: 4px 12px;
|
305 |
border-radius: 100px;
|
|
|
311 |
margin: 0;
|
312 |
line-height: 1.6;
|
313 |
">
|
314 |
+
This comprehensive benchmark evaluates language models' ability to effectively utilize tools and functions in complex scenarios.
|
|
|
315 |
</p>
|
316 |
|
317 |
<div style="
|
|
|
376 |
</div>
|
377 |
</div>
|
378 |
"""
|
379 |
+
|
380 |
+
|
381 |
+
HEADER_CONTENT = (
|
382 |
+
COMMON
|
383 |
+
+ """
|
384 |
+
<style>
|
385 |
+
|
386 |
+
.header-wrapper {
|
387 |
+
background: var(--bg-primary);
|
388 |
+
padding: 4rem 2rem;
|
389 |
+
border-radius: 16px;
|
390 |
+
margin-bottom: 0;
|
391 |
+
transition: all 0.3s ease;
|
392 |
+
}
|
393 |
+
|
394 |
+
.header-content {
|
395 |
+
max-width: 72rem;
|
396 |
+
margin: 0 auto;
|
397 |
+
}
|
398 |
+
|
399 |
+
.title-section {
|
400 |
+
text-align: center;
|
401 |
+
margin-bottom: 4rem;
|
402 |
+
}
|
403 |
+
|
404 |
+
.title-gradient {
|
405 |
+
font-size: 5rem;
|
406 |
+
font-weight: 800;
|
407 |
+
line-height: 1.1;
|
408 |
+
background: var(--purple-gradient);
|
409 |
+
-webkit-background-clip: text;
|
410 |
+
-webkit-text-fill-color: transparent;
|
411 |
+
margin-bottom: 0.5rem;
|
412 |
+
}
|
413 |
+
|
414 |
+
.subtitle-white {
|
415 |
+
font-size: 5rem;
|
416 |
+
font-weight: 800;
|
417 |
+
line-height: 1.1;
|
418 |
+
color: var(--text-primary);
|
419 |
+
margin-bottom: 3rem;
|
420 |
+
transition: color 0.3s ease;
|
421 |
+
}
|
422 |
+
|
423 |
+
.description {
|
424 |
+
color: var(--text-secondary);
|
425 |
+
font-size: 1.25rem;
|
426 |
+
line-height: 1.75;
|
427 |
+
max-width: 800px;
|
428 |
+
margin: 0 auto;
|
429 |
+
text-align: center;
|
430 |
+
transition: color 0.3s ease;
|
431 |
+
}
|
432 |
+
|
433 |
+
.highlight-question {
|
434 |
+
background: var(--blue-gradient);
|
435 |
+
-webkit-background-clip: text;
|
436 |
+
-webkit-text-fill-color: transparent;
|
437 |
+
display: block;
|
438 |
+
margin-top: 1rem;
|
439 |
+
font-size: 1.5rem;
|
440 |
+
font-weight: 500;
|
441 |
+
}
|
442 |
+
|
443 |
+
.metrics-grid {
|
444 |
+
display: grid;
|
445 |
+
grid-template-columns: repeat(3, 1fr);
|
446 |
+
gap: 1.5rem;
|
447 |
+
margin-top: 4rem;
|
448 |
+
}
|
449 |
+
|
450 |
+
.metric-card {
|
451 |
+
background: var(--bg-secondary);
|
452 |
+
border: 1px solid var(--border-color);
|
453 |
+
border-radius: 1rem;
|
454 |
+
padding: 2rem;
|
455 |
+
transition: all 0.3s ease;
|
456 |
+
}
|
457 |
+
|
458 |
+
.metric-card:hover {
|
459 |
+
transform: translateY(-5px);
|
460 |
+
border-color: var(--border-hover);
|
461 |
+
box-shadow: 0 4px 20px var(--shadow-color);
|
462 |
+
}
|
463 |
+
|
464 |
+
.metric-number {
|
465 |
+
font-size: 4rem;
|
466 |
+
font-weight: 800;
|
467 |
+
margin-bottom: 1rem;
|
468 |
+
}
|
469 |
+
|
470 |
+
.metric-blue {
|
471 |
+
background: var(--blue-gradient);
|
472 |
+
-webkit-background-clip: text;
|
473 |
+
-webkit-text-fill-color: transparent;
|
474 |
+
}
|
475 |
+
|
476 |
+
.metric-purple {
|
477 |
+
background: var(--purple-gradient);
|
478 |
+
-webkit-background-clip: text;
|
479 |
+
-webkit-text-fill-color: transparent;
|
480 |
+
}
|
481 |
+
|
482 |
+
.metric-pink {
|
483 |
+
background: var(--pink-gradient);
|
484 |
+
-webkit-background-clip: text;
|
485 |
+
-webkit-text-fill-color: transparent;
|
486 |
+
}
|
487 |
+
|
488 |
+
.metric-label {
|
489 |
+
color: var(--text-secondary);
|
490 |
+
font-size: 1.5rem;
|
491 |
+
margin-bottom: 1.5rem;
|
492 |
+
transition: color 0.3s ease;
|
493 |
+
}
|
494 |
+
|
495 |
+
.metric-detail {
|
496 |
+
font-size: 1.125rem;
|
497 |
+
line-height: 1.75;
|
498 |
+
margin-top: 0.5rem;
|
499 |
+
transition: color 0.3s ease;
|
500 |
+
}
|
501 |
+
|
502 |
+
.metric-detail.primary {
|
503 |
+
color: var(--accent-color);
|
504 |
+
}
|
505 |
+
|
506 |
+
.metric-detail.secondary {
|
507 |
+
color: var(--text-secondary);
|
508 |
+
}
|
509 |
+
|
510 |
+
.actions {
|
511 |
+
display: flex;
|
512 |
+
gap: 1rem;
|
513 |
+
justify-content: center;
|
514 |
+
margin-top: 3rem;
|
515 |
+
}
|
516 |
+
|
517 |
+
.action-button {
|
518 |
+
display: flex;
|
519 |
+
align-items: center;
|
520 |
+
gap: 0.5rem;
|
521 |
+
padding: 0.75rem 1.5rem;
|
522 |
+
background: var(--bg-secondary);
|
523 |
+
border: 1px solid var(--border-color);
|
524 |
+
border-radius: 100px;
|
525 |
+
color: var(--text-primary) !important;
|
526 |
+
text-decoration: none !important;
|
527 |
+
font-size: 0.95rem;
|
528 |
+
transition: all 0.3s ease;
|
529 |
+
}
|
530 |
+
|
531 |
+
.action-button:hover {
|
532 |
+
transform: translateY(-2px);
|
533 |
+
border-color: var(--accent-color);
|
534 |
+
background: var(--accent-bg);
|
535 |
+
}
|
536 |
+
|
537 |
+
@media (max-width: 768px) {
|
538 |
+
.title-gradient, .subtitle-white {
|
539 |
+
font-size: 3rem;
|
540 |
+
}
|
541 |
+
.metrics-grid {
|
542 |
+
grid-template-columns: 1fr;
|
543 |
+
}
|
544 |
+
}
|
545 |
+
</style>
|
546 |
+
|
547 |
+
<div class="header-wrapper">
|
548 |
+
<div class="header-content">
|
549 |
+
<div class="title-section">
|
550 |
+
<div class="subtitle-white">Welcome to the</div>
|
551 |
+
<div class="title-gradient">Agent Leaderboard!</div>
|
552 |
+
|
553 |
+
<div class="description">
|
554 |
+
The landscape of AI agents is evolving rapidly, with major tech CEOs predicting 2025 as a pivotal year.
|
555 |
+
We built this leaderboard to answer one simple question:
|
556 |
+
<div class="highlight-question">
|
557 |
+
"How do AI agents perform in real-world agentic scenarios?"
|
558 |
+
</div>
|
559 |
+
</div>
|
560 |
+
</div>
|
561 |
+
|
562 |
+
<div class="actions">
|
563 |
+
<a href="#" class="action-button">
|
564 |
+
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
565 |
+
<path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
|
566 |
+
<line x1="8" y1="12" x2="16" y2="12"/>
|
567 |
+
</svg>
|
568 |
+
Blog
|
569 |
+
</a>
|
570 |
+
<a href="#" class="action-button">
|
571 |
+
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
572 |
+
<path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
|
573 |
+
</svg>
|
574 |
+
GitHub
|
575 |
+
</a>
|
576 |
+
<a href="#" class="action-button">
|
577 |
+
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
578 |
+
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
|
579 |
+
<polyline points="7 10 12 15 17 10"/>
|
580 |
+
<line x1="12" y1="15" x2="12" y2="3"/>
|
581 |
+
</svg>
|
582 |
+
Dataset
|
583 |
+
</a>
|
584 |
+
</div>
|
585 |
+
</div>
|
586 |
+
</div>
|
587 |
+
"""
|
588 |
+
)
|
589 |
+
|
590 |
+
CARDS = """ <div class="metrics-grid">
|
591 |
+
<div class="metric-card">
|
592 |
+
<div class="metric-number metric-blue">17</div>
|
593 |
+
<div class="metric-label">Total Models</div>
|
594 |
+
<div class="metric-detail primary">12 Private</div>
|
595 |
+
<div class="metric-detail primary">5 Open Source</div>
|
596 |
+
</div>
|
597 |
+
|
598 |
+
<div class="metric-card">
|
599 |
+
<div class="metric-number metric-purple">14</div>
|
600 |
+
<div class="metric-label">Evaluation Datasets</div>
|
601 |
+
<div class="metric-detail primary">Cross-Domain Testing</div>
|
602 |
+
<div class="metric-detail primary">Real-world use cases</div>
|
603 |
+
</div>
|
604 |
+
|
605 |
+
<div class="metric-card">
|
606 |
+
<div class="metric-number metric-pink">TSQ</div>
|
607 |
+
<div class="metric-label">Evaluation Metric</div>
|
608 |
+
<div class="metric-detail primary">Tool Selection Quality</div>
|
609 |
+
<div class="metric-detail primary">GPT-4o Based Judge</div>
|
610 |
+
</div>
|
611 |
+
</div>"""
|
612 |
+
|
613 |
+
METHODOLOGY = """
|
614 |
+
<style>
|
615 |
+
@media (prefers-color-scheme: dark) {
|
616 |
+
:root {
|
617 |
+
--bg-primary: #0B0B19;
|
618 |
+
--bg-secondary: rgba(19, 19, 37, 0.4);
|
619 |
+
--bg-tertiary: rgba(30, 30, 45, 0.95);
|
620 |
+
--text-primary: #ffffff;
|
621 |
+
--text-secondary: #94A3B8;
|
622 |
+
--text-tertiary: #E2E8F0;
|
623 |
+
--border-primary: rgba(31, 41, 55, 0.5);
|
624 |
+
--border-hover: rgba(79, 70, 229, 0.4);
|
625 |
+
--accent-blue: #60A5FA;
|
626 |
+
--accent-purple: #A78BFA;
|
627 |
+
--accent-pink: #F472B6;
|
628 |
+
--card-hover-bg: rgba(79, 70, 229, 0.1);
|
629 |
+
--shadow-color: rgba(79, 70, 229, 0.1);
|
630 |
+
}
|
631 |
+
}
|
632 |
+
|
633 |
+
@media (prefers-color-scheme: light) {
|
634 |
+
:root {
|
635 |
+
--bg-primary: #ffffff;
|
636 |
+
--bg-secondary: rgba(243, 244, 246, 0.4);
|
637 |
+
--bg-tertiary: rgba(249, 250, 251, 0.95);
|
638 |
+
--text-primary: #111827;
|
639 |
+
--text-secondary: #4B5563;
|
640 |
+
--text-tertiary: #6B7280;
|
641 |
+
--border-primary: rgba(209, 213, 219, 0.5);
|
642 |
+
--border-hover: rgba(79, 70, 229, 0.4);
|
643 |
+
--accent-blue: #3B82F6;
|
644 |
+
--accent-purple: #8B5CF6;
|
645 |
+
--accent-pink: #EC4899;
|
646 |
+
--card-hover-bg: rgba(243, 244, 246, 0.8);
|
647 |
+
--shadow-color: rgba(0, 0, 0, 0.1);
|
648 |
+
}
|
649 |
+
}
|
650 |
+
|
651 |
+
/* [Previous CSS remains the same until features-grid] */
|
652 |
+
|
653 |
+
/* Features Grid Section */
|
654 |
+
.features-grid {
|
655 |
+
display: grid;
|
656 |
+
grid-template-columns: repeat(3, 1fr);
|
657 |
+
gap: 1.5rem;
|
658 |
+
width: 100%;
|
659 |
+
padding: 2rem 0;
|
660 |
+
}
|
661 |
+
|
662 |
+
/* [Rest of the CSS remains the same] */
|
663 |
+
</style>
|
664 |
+
<!-- Methodology Section -->
|
665 |
+
<div class="methodology-section">
|
666 |
+
<h1 class="methodology-title">Methodology</h1>
|
667 |
+
|
668 |
+
<h2 class="methodology-subtitle">Overview</h2>
|
669 |
+
<p class="methodology-text">
|
670 |
+
The Agent Leaderboard evaluates language models' ability to effectively use tools
|
671 |
+
and maintain coherent multi-turn conversations. Our evaluation focuses on both basic functionality and edge
|
672 |
+
cases that challenge real-world applicability.
|
673 |
+
</p>
|
674 |
+
|
675 |
+
<h2 class="methodology-subtitle">Tool Selection Quality (TSQ) Metric</h2>
|
676 |
+
<ul class="metric-list">
|
677 |
+
<li>Correctly identify when tools are needed</li>
|
678 |
+
<li>Select the appropriate tool for the task</li>
|
679 |
+
<li>Handle cases where no suitable tool exists</li>
|
680 |
+
<li>Maintain context across multiple interactions</li>
|
681 |
+
<li>Consider cost-effectiveness of tool usage</li>
|
682 |
+
<li>Optimize for minimal necessary tool calls</li>
|
683 |
+
</ul>
|
684 |
+
|
685 |
+
<h2 class="methodology-subtitle">Dataset Structure</h2>
|
686 |
+
<div class="table-container">
|
687 |
+
<table class="dataset-table">
|
688 |
+
<thead>
|
689 |
+
<tr>
|
690 |
+
<th>Type</th>
|
691 |
+
<th>Samples</th>
|
692 |
+
<th>Category</th>
|
693 |
+
<th>Dataset Name</th>
|
694 |
+
<th>Purpose</th>
|
695 |
+
</tr>
|
696 |
+
</thead>
|
697 |
+
<tbody>
|
698 |
+
<tr>
|
699 |
+
<td rowspan="4">Single-Turn</td>
|
700 |
+
<td>200</td>
|
701 |
+
<td>Single Function Call</td>
|
702 |
+
<td>xlam_single_tool_single_call</td>
|
703 |
+
<td>Basic ability to read documentation and make single function calls</td>
|
704 |
+
</tr>
|
705 |
+
<tr>
|
706 |
+
<td>250</td>
|
707 |
+
<td>Multiple Function Call</td>
|
708 |
+
<td>xlam_multiple_tool_multiple_call</td>
|
709 |
+
<td>Parallel execution and result aggregation capabilities</td>
|
710 |
+
</tr>
|
711 |
+
<tr>
|
712 |
+
<td>100</td>
|
713 |
+
<td>Irrelevant Query</td>
|
714 |
+
<td>BFCL_v3_irrelevance</td>
|
715 |
+
<td>Recognition of tool mismatches with user needs</td>
|
716 |
+
</tr>
|
717 |
+
<tr>
|
718 |
+
<td>100</td>
|
719 |
+
<td>Long Context</td>
|
720 |
+
<td>tau_long_context</td>
|
721 |
+
<td>Extended interactions and complex instructions</td>
|
722 |
+
</tr>
|
723 |
+
<tr>
|
724 |
+
<td rowspan="5">Multi-Turn</td>
|
725 |
+
<td>80</td>
|
726 |
+
<td>Single Function Call</td>
|
727 |
+
<td>BFCL_v3_multi_turn_base_single_func_call</td>
|
728 |
+
<td>Conversational function calling abilities</td>
|
729 |
+
</tr>
|
730 |
+
<tr>
|
731 |
+
<td>50</td>
|
732 |
+
<td>Multiple Function Call</td>
|
733 |
+
<td>BFCL_v3_multi_turn_base_multi_func_call</td>
|
734 |
+
<td>Multiple function calls in conversation</td>
|
735 |
+
</tr>
|
736 |
+
<tr>
|
737 |
+
<td>100</td>
|
738 |
+
<td>Missing Function</td>
|
739 |
+
<td>BFCL_v3_multi_turn_miss_func</td>
|
740 |
+
<td>Graceful handling of unavailable tools</td>
|
741 |
+
</tr>
|
742 |
+
<tr>
|
743 |
+
<td>100</td>
|
744 |
+
<td>Missing Parameters</td>
|
745 |
+
<td>BFCL_v3_multi_turn_miss_param</td>
|
746 |
+
<td>Parameter collection and incomplete information</td>
|
747 |
+
</tr>
|
748 |
+
<tr>
|
749 |
+
<td>100</td>
|
750 |
+
<td>Composite</td>
|
751 |
+
<td>BFCL_v3_multi_turn_composite</td>
|
752 |
+
<td>Overall robustness in complex scenarios</td>
|
753 |
+
</tr>
|
754 |
+
</tbody>
|
755 |
+
</table>
|
756 |
+
</div>
|
757 |
+
</div>
|
758 |
+
|
759 |
+
<!-- Features Grid Section -->
|
760 |
+
<div class="features-grid">
|
761 |
+
<div class="feature-card">
|
762 |
+
<div class="feature-icon">
|
763 |
+
<svg width="24" height="24" fill="none" stroke="var(--accent-blue)" stroke-width="2" viewBox="0 0 24 24">
|
764 |
+
<path d="M22 12h-4l-3 9L9 3l-3 9H2"/>
|
765 |
+
</svg>
|
766 |
+
</div>
|
767 |
+
<h3 class="feature-title">Make Better Decisions</h3>
|
768 |
+
<ul class="feature-list">
|
769 |
+
<li>Cost-effectiveness analysis</li>
|
770 |
+
<li>Business impact metrics</li>
|
771 |
+
<li>Vendor strategy insights</li>
|
772 |
+
</ul>
|
773 |
+
</div>
|
774 |
+
|
775 |
+
<div class="feature-card">
|
776 |
+
<div class="feature-icon">
|
777 |
+
<svg width="24" height="24" fill="none" stroke="var(--accent-purple)" stroke-width="2" viewBox="0 0 24 24">
|
778 |
+
<path d="M21 16V8a2 2 0 0 0-1-1.73l-7-4a2 2 0 0 0-2 0l-7 4A2 2 0 0 0 3 8v8a2 2 0 0 0 1 1.73l7 4a2 2 0 0 0 2 0l7-4A2 2 0 0 0 21 16z"/>
|
779 |
+
</svg>
|
780 |
+
</div>
|
781 |
+
<h3 class="feature-title">360° Domain Evaluation</h3>
|
782 |
+
<ul class="feature-list">
|
783 |
+
<li>Cross-domain evaluation</li>
|
784 |
+
<li>Real-world use cases</li>
|
785 |
+
<li>Edge case evaluation</li>
|
786 |
+
</ul>
|
787 |
+
</div>
|
788 |
+
|
789 |
+
<div class="feature-card">
|
790 |
+
<div class="feature-icon">
|
791 |
+
<svg width="24" height="24" fill="none" stroke="var(--accent-pink)" stroke-width="2" viewBox="0 0 24 24">
|
792 |
+
<path d="M21 2v6h-6M3 12a9 9 0 0 1 15-6.7L21 8M3 12a9 9 0 0 0 15 6.7L21 16M21 22v-6h-6"/>
|
793 |
+
</svg>
|
794 |
+
</div>
|
795 |
+
<h3 class="feature-title">Updated Periodically</h3>
|
796 |
+
<ul class="feature-list">
|
797 |
+
<li>12 private models evaluated</li>
|
798 |
+
<li>5 open source models included</li>
|
799 |
+
<li>Monthly model additions</li>
|
800 |
+
</ul>
|
801 |
+
</div>
|
802 |
+
</div>
|
803 |
+
"""
|
results.csv
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
|
2 |
-
gemini-2.0-flash-exp,Private,Normal,Google,0.
|
3 |
gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
|
4 |
gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
|
5 |
gemini-1.5-pro,Private,Normal,Google,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
|
|
|
1 |
Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
|
2 |
+
gemini-2.0-flash-exp,Private,Normal,Google,0.1,0.4,0.935,0.94,0.93,0.86,0.95,0.9,0.99,0.95,0.94,0.83,0.91,0.98,0.96,0.98,0.98,0.88,0.975
|
3 |
gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
|
4 |
gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
|
5 |
gemini-1.5-pro,Private,Normal,Google,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
|
tabs/leaderboard.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import gradio as gr
|
2 |
|
3 |
-
from data_loader import CATEGORIES, DESCRIPTION_HTML
|
4 |
from visualization import (
|
5 |
get_performance_chart,
|
6 |
get_performance_cost_chart,
|
@@ -9,7 +9,6 @@ from utils import (
|
|
9 |
get_rank_badge,
|
10 |
get_score_bar,
|
11 |
get_type_badge,
|
12 |
-
get_output_type_badge,
|
13 |
)
|
14 |
|
15 |
def filter_leaderboard(df, model_type, category, sort_by):
|
@@ -181,7 +180,8 @@ def create_leaderboard_tab(df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS):
|
|
181 |
output = gr.HTML()
|
182 |
plot1 = gr.Plot()
|
183 |
plot2 = gr.Plot()
|
184 |
-
|
|
|
185 |
|
186 |
for input_comp in [model_type, category, sort_by]:
|
187 |
input_comp.change(
|
|
|
1 |
import gradio as gr
|
2 |
|
3 |
+
from data_loader import CATEGORIES, DESCRIPTION_HTML, CARDS
|
4 |
from visualization import (
|
5 |
get_performance_chart,
|
6 |
get_performance_cost_chart,
|
|
|
9 |
get_rank_badge,
|
10 |
get_score_bar,
|
11 |
get_type_badge,
|
|
|
12 |
)
|
13 |
|
14 |
def filter_leaderboard(df, model_type, category, sort_by):
|
|
|
180 |
output = gr.HTML()
|
181 |
plot1 = gr.Plot()
|
182 |
plot2 = gr.Plot()
|
183 |
+
|
184 |
+
gr.HTML(METHODOLOGY)
|
185 |
|
186 |
for input_comp in [model_type, category, sort_by]:
|
187 |
input_comp.change(
|