Pratik Bhavsar committed on
Commit
b8ddec2
·
1 Parent(s): a64af65

improved looks

Browse files
Files changed (3) hide show
  1. data_loader.py +452 -287
  2. results.csv +1 -1
  3. tabs/leaderboard.py +3 -3
data_loader.py CHANGED
@@ -40,48 +40,6 @@ CATEGORIES = {
40
  "Composite": ["BFCL_v3_multi_turn_composite"],
41
  }
42
 
43
- METHODOLOGY = """# Methodology
44
- ## Overview
45
- The Agent Leaderboard evaluates language models' ability to effectively use tools and maintain coherent multi-turn conversations.
46
- The evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.
47
-
48
- ## Tool Selection Quality Metric
49
- Models are evaluated on their ability to:
50
- - Correctly identify when tools are needed
51
- - Select the appropriate tool for the task
52
- - Handle cases where no suitable tool exists
53
- - Maintain context across multiple interactions
54
-
55
- ## Dataset Structure
56
- | Type | Samples | Category | Dataset Name | Purpose |
57
- |------|---------|-----------|--------------|----------|
58
- | Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
59
- | | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
60
- | | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
61
- | | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
62
- | Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
63
- | | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
64
- | | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
65
- | | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
66
- | | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
67
- """
68
-
69
-
70
- INSIGHTS = """
71
- # Key Insights from Agent Leaderboard
72
-
73
- | Category | Finding | Implications |
74
- |----------|---------|--------------|
75
- | Performance Leader | Gemini-2.0-flash dominates with excellent performance at a fraction of typical costs | Demonstrates that top-tier performance is achievable without premium pricing |
76
- | Cost vs Performance | Top 3 models span a 200x price difference yet show only 6% performance gap | Challenges traditional pricing assumptions in the market and suggests potential overpricing at the high end |
77
- | Open Source Models | Qwen-72b matches premium models in safety and context handling at lower cost | Signals growing maturity in open-source models and potential for broader adoption |
78
- | Safety Features | While irrelevance detection is widely solved, tool miss detection remains a challenge | Highlights uneven development in safety features and areas needing focused improvement |
79
- | Edge Case Handling | Models still struggle with maintaining context in complex scenarios | Indicates need for architectural improvements in handling sophisticated interactions |
80
- | Architecture Impact | Models show clear trade-offs between context handling and parallel execution | Suggests need for specialized models or hybrid approaches for different use cases |
81
-
82
- **Note:** Findings based on comprehensive evaluation across multiple tasks and scenarios.
83
- """
84
-
85
 
86
  chat_css = """
87
  /* Container styles */
@@ -275,265 +233,48 @@ h3 {
275
  }
276
  """
277
 
278
-
279
- # Updated header and cards with theme awareness
280
-
281
- HEADER_CONTENT = """
282
  <style>
283
  @media (prefers-color-scheme: dark) {
284
  :root {
285
- --bg-primary: rgb(17, 17, 27);
286
- --bg-secondary: rgba(30, 30, 45, 0.95);
287
- --bg-hover: rgba(40, 40, 55, 0.95);
288
  --text-primary: #ffffff;
289
- --text-secondary: #94a3b8;
290
  --text-tertiary: #e2e8f0;
291
- --border-color: rgba(255, 255, 255, 0.1);
292
- --border-hover: rgba(255, 255, 255, 0.2);
293
- --card-bg: rgba(17, 17, 27, 0.6);
294
- --accent-color: #4F46E5;
295
  --accent-bg: rgba(79, 70, 229, 0.1);
 
 
 
 
296
  }
297
  }
298
 
299
  @media (prefers-color-scheme: light) {
300
  :root {
301
- --bg-primary: rgb(255, 255, 255);
302
- --bg-secondary: rgba(243, 244, 246, 0.95);
303
  --bg-hover: rgba(229, 231, 235, 0.95);
304
- --text-primary: #000000;
305
- --text-secondary: #4b5563;
306
- --text-tertiary: #1f2937;
307
- --border-color: rgba(0, 0, 0, 0.1);
308
- --border-hover: rgba(0, 0, 0, 0.2);
309
- --card-bg: rgba(249, 250, 251, 0.6);
310
  --accent-color: #4F46E5;
311
  --accent-bg: rgba(79, 70, 229, 0.1);
 
 
 
 
312
  }
313
  }
314
-
315
- .header-wrapper {
316
- padding: 3rem 2rem;
317
- background: var(--bg-primary);
318
- border-radius: 16px;
319
- display: flex;
320
- flex-direction: column;
321
- align-items: center;
322
- text-align: center;
323
- }
324
-
325
- .header-wrapper a {
326
- color: var(--text-primary) !important;
327
- text-decoration: none !important;
328
- }
329
-
330
- .description {
331
- color: var(--text-primary);
332
- font-size: 1.1rem;
333
- line-height: 1.6;
334
- max-width: 800px;
335
- margin: 0 auto 2rem;
336
- text-align: center;
337
- }
338
-
339
- .actions {
340
- display: flex;
341
- gap: 1rem;
342
- justify-content: center;
343
- margin-bottom: 2rem;
344
- color: var(--text-primary);
345
- }
346
-
347
- .action-button {
348
- display: flex;
349
- align-items: center;
350
- gap: 0.5rem;
351
- padding: 0.75rem 1.5rem;
352
- background: var(--bg-secondary);
353
- border: 1px solid var(--border-color);
354
- border-radius: 100px;
355
- color: var(--text-primary) !important;
356
- text-decoration: none !important;
357
- font-size: 0.95rem;
358
- transition: all 0.2s ease;
359
- }
360
-
361
- .action-button:hover {
362
- background: var(--bg-hover);
363
- border-color: var(--border-hover);
364
- color: var(--text-primary) !important;
365
- }
366
-
367
- .update-info {
368
- color: var(--text-secondary);
369
- font-size: 0.9rem;
370
- margin-bottom: 3rem;
371
- }
372
-
373
- .features-grid {
374
- display: grid;
375
- grid-template-columns: repeat(3, 1fr);
376
- gap: 1.5rem;
377
- width: 100%;
378
- max-width: 1200px;
379
- }
380
-
381
- .feature-card {
382
- background: var(--card-bg);
383
- border: 1px solid var(--border-color);
384
- border-radius: 16px;
385
- padding: 2rem;
386
- text-align: left;
387
- }
388
-
389
- .feature-icon {
390
- background: var(--accent-bg);
391
- width: 40px;
392
- height: 40px;
393
- border-radius: 12px;
394
- display: flex;
395
- align-items: center;
396
- justify-content: center;
397
- margin-bottom: 1.5rem;
398
- }
399
-
400
- .feature-title {
401
- color: var(--text-primary);
402
- font-size: 1.25rem;
403
- font-weight: 600;
404
- margin-bottom: 1rem;
405
- }
406
-
407
- .feature-description {
408
- color: var(--text-secondary);
409
- font-size: 0.95rem;
410
- margin-bottom: 1.5rem;
411
- }
412
-
413
- .feature-list {
414
- list-style: none;
415
- padding: 0;
416
- margin: 0;
417
- display: flex;
418
- flex-direction: column;
419
- gap: 0.75rem;
420
- }
421
-
422
- .feature-list li {
423
- color: var(--text-tertiary);
424
- font-size: 0.95rem;
425
- display: flex;
426
- align-items: center;
427
- gap: 0.5rem;
428
- }
429
-
430
- .feature-list li::before {
431
- content: '';
432
- width: 6px;
433
- height: 6px;
434
- background: var(--accent-color);
435
- border-radius: 50%;
436
- flex-shrink: 0;
437
- }
438
-
439
- /* Force all links to match theme */
440
- .header-wrapper a:link,
441
- .header-wrapper a:visited,
442
- .header-wrapper a:hover,
443
- .header-wrapper a:active {
444
- color: var(--text-primary) !important;
445
- }
446
-
447
- /* Title specific styles */
448
- .main-title {
449
- color: var(--text-primary);
450
- font-size: 48px;
451
- font-weight: 700;
452
- margin: 40px 0;
453
- text-align: center;
454
- }
455
-
456
- .subtitle {
457
- color: var(--text-secondary);
458
- margin-bottom: 2rem;
459
- }
460
  </style>
461
-
462
- <div class="header-wrapper">
463
- <h1 class="main-title">Agent Leaderboard</h1>
464
- <h2 class="subtitle">Comprehensive multi-benchmark evaluation for tool calling</h2>
465
-
466
- <div class="actions">
467
- <a href="#" class="action-button">
468
- <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
469
- <path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
470
- <line x1="8" y1="12" x2="16" y2="12"/>
471
- </svg>
472
- Blog
473
- </a>
474
- <a href="#" class="action-button">
475
- <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
476
- <path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
477
- </svg>
478
- GitHub
479
- </a>
480
- <a href="#" class="action-button">
481
- <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
482
- <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
483
- <polyline points="7 10 12 15 17 10"/>
484
- <line x1="12" y1="15" x2="12" y2="3"/>
485
- </svg>
486
- Dataset
487
- </a>
488
- </div>
489
- """
490
-
491
- CARDS = """
492
- <div class="features-grid">
493
- <div class="feature-card">
494
- <div class="feature-icon">
495
- <svg width="24" height="24" fill="none" stroke="var(--accent-color)" stroke-width="2" viewBox="0 0 24 24">
496
- <path d="M22 12h-4l-3 9L9 3l-3 9H2"/>
497
- </svg>
498
- </div>
499
- <h3 class="feature-title">Make Better Decisions</h3>
500
- <ul class="feature-list">
501
- <li>Cost-effectiveness analysis</li>
502
- <li>Business impact metrics</li>
503
- <li>Vendor strategy insights</li>
504
- </ul>
505
- </div>
506
-
507
- <div class="feature-card">
508
- <div class="feature-icon">
509
- <svg width="24" height="24" fill="none" stroke="var(--accent-color)" stroke-width="2" viewBox="0 0 24 24">
510
- <path d="M21 16V8a2 2 0 0 0-1-1.73l-7-4a2 2 0 0 0-2 0l-7 4A2 2 0 0 0 3 8v8a2 2 0 0 0 1 1.73l7 4a2 2 0 0 0 2 0l7-4A2 2 0 0 0 21 16z"/>
511
- </svg>
512
- </div>
513
- <h3 class="feature-title">360° Domain Evaluation</h3>
514
- <ul class="feature-list">
515
- <li>Cross-domain evaluation</li>
516
- <li>Real-world use cases</li>
517
- <li>Edge case evaluation</li>
518
- </ul>
519
- </div>
520
-
521
- <div class="feature-card">
522
- <div class="feature-icon">
523
- <svg width="24" height="24" fill="none" stroke="var(--accent-color)" stroke-width="2" viewBox="0 0 24 24">
524
- <path d="M21 2v6h-6M3 12a9 9 0 0 1 15-6.7L21 8M3 12a9 9 0 0 0 15 6.7L21 16M21 22v-6h-6"/>
525
- </svg>
526
- </div>
527
- <h3 class="feature-title">Updated Periodically</h3>
528
- <ul class="feature-list">
529
- <li>11 private models evaluated</li>
530
- <li>5 open source models included</li>
531
- <li>Monthly model additions</li>
532
- </ul>
533
- </div>
534
- </div>
535
-
536
- </div>
537
  """
538
 
539
  DESCRIPTION_HTML = """
@@ -558,7 +299,7 @@ DESCRIPTION_HTML = """
558
  ">
559
  🎯 Purpose
560
  <span style="
561
- background: var(--accent-color, #4F46E5);
562
  color: white;
563
  padding: 4px 12px;
564
  border-radius: 100px;
@@ -570,8 +311,7 @@ DESCRIPTION_HTML = """
570
  margin: 0;
571
  line-height: 1.6;
572
  ">
573
- Welcome to the AI Agent Tool Calling Leaderboard! This comprehensive benchmark evaluates
574
- language models' ability to effectively utilize tools and functions in complex scenarios.
575
  </p>
576
 
577
  <div style="
@@ -636,3 +376,428 @@ DESCRIPTION_HTML = """
636
  </div>
637
  </div>
638
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  "Composite": ["BFCL_v3_multi_turn_composite"],
41
  }
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
  chat_css = """
45
  /* Container styles */
 
233
  }
234
  """
235
 
236
+ COMMON = """
 
 
 
237
  <style>
238
  @media (prefers-color-scheme: dark) {
239
  :root {
240
+ --bg-primary: #0B0B19;
241
+ --bg-secondary: rgba(19, 19, 37, 0.4);
242
+ --bg-hover: rgba(30, 30, 45, 0.95);
243
  --text-primary: #ffffff;
244
+ --text-secondary: #e2e8f0;
245
  --text-tertiary: #e2e8f0;
246
+ --border-color: rgba(31, 41, 55, 0.5);
247
+ --border-hover: rgba(79, 70, 229, 0.4);
248
+ --card-bg: rgba(17, 17, 27, 0.4);
249
+ --accent-color: #ffffff;
250
  --accent-bg: rgba(79, 70, 229, 0.1);
251
+ --blue-gradient: linear-gradient(45deg, #60A5FA, #3B82F6);
252
+ --purple-gradient: linear-gradient(45deg, #A78BFA, #8B5CF6);
253
+ --pink-gradient: linear-gradient(45deg, #F472B6, #EC4899);
254
+ --shadow-color: rgba(0, 0, 0, 0.2);
255
  }
256
  }
257
 
258
  @media (prefers-color-scheme: light) {
259
  :root {
260
+ --bg-primary: #ffffff;
261
+ --bg-secondary: rgba(243, 244, 246, 0.4);
262
  --bg-hover: rgba(229, 231, 235, 0.95);
263
+ --text-primary: #1F2937;
264
+ --text-secondary: #4B5563;
265
+ --text-tertiary: #6B7280;
266
+ --border-color: rgba(209, 213, 219, 0.5);
267
+ --border-hover: rgba(79, 70, 229, 0.4);
268
+ --card-bg: rgba(249, 250, 251, 0.4);
269
  --accent-color: #4F46E5;
270
  --accent-bg: rgba(79, 70, 229, 0.1);
271
+ --blue-gradient: linear-gradient(45deg, #3B82F6, #2563EB);
272
+ --purple-gradient: linear-gradient(45deg, #8B5CF6, #EF43CD);
273
+ --pink-gradient: linear-gradient(45deg, #EC4899, #DB2777);
274
+ --shadow-color: rgba(0, 0, 0, 0.1);
275
  }
276
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  </style>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  """
279
 
280
  DESCRIPTION_HTML = """
 
299
  ">
300
  🎯 Purpose
301
  <span style="
302
+ background: linear-gradient(to right, var(--accent-blue), var(--accent-purple));
303
  color: white;
304
  padding: 4px 12px;
305
  border-radius: 100px;
 
311
  margin: 0;
312
  line-height: 1.6;
313
  ">
314
+ This comprehensive benchmark evaluates language models' ability to effectively utilize tools and functions in complex scenarios.
 
315
  </p>
316
 
317
  <div style="
 
376
  </div>
377
  </div>
378
  """
379
+
380
+
381
+ HEADER_CONTENT = (
382
+ COMMON
383
+ + """
384
+ <style>
385
+
386
+ .header-wrapper {
387
+ background: var(--bg-primary);
388
+ padding: 4rem 2rem;
389
+ border-radius: 16px;
390
+ margin-bottom: 0;
391
+ transition: all 0.3s ease;
392
+ }
393
+
394
+ .header-content {
395
+ max-width: 72rem;
396
+ margin: 0 auto;
397
+ }
398
+
399
+ .title-section {
400
+ text-align: center;
401
+ margin-bottom: 4rem;
402
+ }
403
+
404
+ .title-gradient {
405
+ font-size: 5rem;
406
+ font-weight: 800;
407
+ line-height: 1.1;
408
+ background: var(--purple-gradient);
409
+ -webkit-background-clip: text;
410
+ -webkit-text-fill-color: transparent;
411
+ margin-bottom: 0.5rem;
412
+ }
413
+
414
+ .subtitle-white {
415
+ font-size: 5rem;
416
+ font-weight: 800;
417
+ line-height: 1.1;
418
+ color: var(--text-primary);
419
+ margin-bottom: 3rem;
420
+ transition: color 0.3s ease;
421
+ }
422
+
423
+ .description {
424
+ color: var(--text-secondary);
425
+ font-size: 1.25rem;
426
+ line-height: 1.75;
427
+ max-width: 800px;
428
+ margin: 0 auto;
429
+ text-align: center;
430
+ transition: color 0.3s ease;
431
+ }
432
+
433
+ .highlight-question {
434
+ background: var(--blue-gradient);
435
+ -webkit-background-clip: text;
436
+ -webkit-text-fill-color: transparent;
437
+ display: block;
438
+ margin-top: 1rem;
439
+ font-size: 1.5rem;
440
+ font-weight: 500;
441
+ }
442
+
443
+ .metrics-grid {
444
+ display: grid;
445
+ grid-template-columns: repeat(3, 1fr);
446
+ gap: 1.5rem;
447
+ margin-top: 4rem;
448
+ }
449
+
450
+ .metric-card {
451
+ background: var(--bg-secondary);
452
+ border: 1px solid var(--border-color);
453
+ border-radius: 1rem;
454
+ padding: 2rem;
455
+ transition: all 0.3s ease;
456
+ }
457
+
458
+ .metric-card:hover {
459
+ transform: translateY(-5px);
460
+ border-color: var(--border-hover);
461
+ box-shadow: 0 4px 20px var(--shadow-color);
462
+ }
463
+
464
+ .metric-number {
465
+ font-size: 4rem;
466
+ font-weight: 800;
467
+ margin-bottom: 1rem;
468
+ }
469
+
470
+ .metric-blue {
471
+ background: var(--blue-gradient);
472
+ -webkit-background-clip: text;
473
+ -webkit-text-fill-color: transparent;
474
+ }
475
+
476
+ .metric-purple {
477
+ background: var(--purple-gradient);
478
+ -webkit-background-clip: text;
479
+ -webkit-text-fill-color: transparent;
480
+ }
481
+
482
+ .metric-pink {
483
+ background: var(--pink-gradient);
484
+ -webkit-background-clip: text;
485
+ -webkit-text-fill-color: transparent;
486
+ }
487
+
488
+ .metric-label {
489
+ color: var(--text-secondary);
490
+ font-size: 1.5rem;
491
+ margin-bottom: 1.5rem;
492
+ transition: color 0.3s ease;
493
+ }
494
+
495
+ .metric-detail {
496
+ font-size: 1.125rem;
497
+ line-height: 1.75;
498
+ margin-top: 0.5rem;
499
+ transition: color 0.3s ease;
500
+ }
501
+
502
+ .metric-detail.primary {
503
+ color: var(--accent-color);
504
+ }
505
+
506
+ .metric-detail.secondary {
507
+ color: var(--text-secondary);
508
+ }
509
+
510
+ .actions {
511
+ display: flex;
512
+ gap: 1rem;
513
+ justify-content: center;
514
+ margin-top: 3rem;
515
+ }
516
+
517
+ .action-button {
518
+ display: flex;
519
+ align-items: center;
520
+ gap: 0.5rem;
521
+ padding: 0.75rem 1.5rem;
522
+ background: var(--bg-secondary);
523
+ border: 1px solid var(--border-color);
524
+ border-radius: 100px;
525
+ color: var(--text-primary) !important;
526
+ text-decoration: none !important;
527
+ font-size: 0.95rem;
528
+ transition: all 0.3s ease;
529
+ }
530
+
531
+ .action-button:hover {
532
+ transform: translateY(-2px);
533
+ border-color: var(--accent-color);
534
+ background: var(--accent-bg);
535
+ }
536
+
537
+ @media (max-width: 768px) {
538
+ .title-gradient, .subtitle-white {
539
+ font-size: 3rem;
540
+ }
541
+ .metrics-grid {
542
+ grid-template-columns: 1fr;
543
+ }
544
+ }
545
+ </style>
546
+
547
+ <div class="header-wrapper">
548
+ <div class="header-content">
549
+ <div class="title-section">
550
+ <div class="subtitle-white">Welcome to the</div>
551
+ <div class="title-gradient">Agent Leaderboard!</div>
552
+
553
+ <div class="description">
554
+ The landscape of AI agents is evolving rapidly, with major tech CEOs predicting 2025 as a pivotal year.
555
+ We built this leaderboard to answer one simple question:
556
+ <div class="highlight-question">
557
+ "How do AI agents perform in real-world agentic scenarios?"
558
+ </div>
559
+ </div>
560
+ </div>
561
+
562
+ <div class="actions">
563
+ <a href="#" class="action-button">
564
+ <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
565
+ <path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
566
+ <line x1="8" y1="12" x2="16" y2="12"/>
567
+ </svg>
568
+ Blog
569
+ </a>
570
+ <a href="#" class="action-button">
571
+ <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
572
+ <path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
573
+ </svg>
574
+ GitHub
575
+ </a>
576
+ <a href="#" class="action-button">
577
+ <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
578
+ <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
579
+ <polyline points="7 10 12 15 17 10"/>
580
+ <line x1="12" y1="15" x2="12" y2="3"/>
581
+ </svg>
582
+ Dataset
583
+ </a>
584
+ </div>
585
+ </div>
586
+ </div>
587
+ """
588
+ )
589
+
590
+ CARDS = """ <div class="metrics-grid">
591
+ <div class="metric-card">
592
+ <div class="metric-number metric-blue">17</div>
593
+ <div class="metric-label">Total Models</div>
594
+ <div class="metric-detail primary">12 Private</div>
595
+ <div class="metric-detail primary">5 Open Source</div>
596
+ </div>
597
+
598
+ <div class="metric-card">
599
+ <div class="metric-number metric-purple">14</div>
600
+ <div class="metric-label">Evaluation Datasets</div>
601
+ <div class="metric-detail primary">Cross-Domain Testing</div>
602
+ <div class="metric-detail primary">Real-world use cases</div>
603
+ </div>
604
+
605
+ <div class="metric-card">
606
+ <div class="metric-number metric-pink">TSQ</div>
607
+ <div class="metric-label">Evaluation Metric</div>
608
+ <div class="metric-detail primary">Tool Selection Quality</div>
609
+ <div class="metric-detail primary">GPT-4o Based Judge</div>
610
+ </div>
611
+ </div>"""
612
+
613
+ METHODOLOGY = """
614
+ <style>
615
+ @media (prefers-color-scheme: dark) {
616
+ :root {
617
+ --bg-primary: #0B0B19;
618
+ --bg-secondary: rgba(19, 19, 37, 0.4);
619
+ --bg-tertiary: rgba(30, 30, 45, 0.95);
620
+ --text-primary: #ffffff;
621
+ --text-secondary: #94A3B8;
622
+ --text-tertiary: #E2E8F0;
623
+ --border-primary: rgba(31, 41, 55, 0.5);
624
+ --border-hover: rgba(79, 70, 229, 0.4);
625
+ --accent-blue: #60A5FA;
626
+ --accent-purple: #A78BFA;
627
+ --accent-pink: #F472B6;
628
+ --card-hover-bg: rgba(79, 70, 229, 0.1);
629
+ --shadow-color: rgba(79, 70, 229, 0.1);
630
+ }
631
+ }
632
+
633
+ @media (prefers-color-scheme: light) {
634
+ :root {
635
+ --bg-primary: #ffffff;
636
+ --bg-secondary: rgba(243, 244, 246, 0.4);
637
+ --bg-tertiary: rgba(249, 250, 251, 0.95);
638
+ --text-primary: #111827;
639
+ --text-secondary: #4B5563;
640
+ --text-tertiary: #6B7280;
641
+ --border-primary: rgba(209, 213, 219, 0.5);
642
+ --border-hover: rgba(79, 70, 229, 0.4);
643
+ --accent-blue: #3B82F6;
644
+ --accent-purple: #8B5CF6;
645
+ --accent-pink: #EC4899;
646
+ --card-hover-bg: rgba(243, 244, 246, 0.8);
647
+ --shadow-color: rgba(0, 0, 0, 0.1);
648
+ }
649
+ }
650
+
651
+ /* [Previous CSS remains the same until features-grid] */
652
+
653
+ /* Features Grid Section */
654
+ .features-grid {
655
+ display: grid;
656
+ grid-template-columns: repeat(3, 1fr);
657
+ gap: 1.5rem;
658
+ width: 100%;
659
+ padding: 2rem 0;
660
+ }
661
+
662
+ /* [Rest of the CSS remains the same] */
663
+ </style>
664
+ <!-- Methodology Section -->
665
+ <div class="methodology-section">
666
+ <h1 class="methodology-title">Methodology</h1>
667
+
668
+ <h2 class="methodology-subtitle">Overview</h2>
669
+ <p class="methodology-text">
670
+ The Agent Leaderboard evaluates language models' ability to effectively use tools
671
+ and maintain coherent multi-turn conversations. Our evaluation focuses on both basic functionality and edge
672
+ cases that challenge real-world applicability.
673
+ </p>
674
+
675
+ <h2 class="methodology-subtitle">Tool Selection Quality (TSQ) Metric</h2>
676
+ <ul class="metric-list">
677
+ <li>Correctly identify when tools are needed</li>
678
+ <li>Select the appropriate tool for the task</li>
679
+ <li>Handle cases where no suitable tool exists</li>
680
+ <li>Maintain context across multiple interactions</li>
681
+ <li>Consider cost-effectiveness of tool usage</li>
682
+ <li>Optimize for minimal necessary tool calls</li>
683
+ </ul>
684
+
685
+ <h2 class="methodology-subtitle">Dataset Structure</h2>
686
+ <div class="table-container">
687
+ <table class="dataset-table">
688
+ <thead>
689
+ <tr>
690
+ <th>Type</th>
691
+ <th>Samples</th>
692
+ <th>Category</th>
693
+ <th>Dataset Name</th>
694
+ <th>Purpose</th>
695
+ </tr>
696
+ </thead>
697
+ <tbody>
698
+ <tr>
699
+ <td rowspan="4">Single-Turn</td>
700
+ <td>200</td>
701
+ <td>Single Function Call</td>
702
+ <td>xlam_single_tool_single_call</td>
703
+ <td>Basic ability to read documentation and make single function calls</td>
704
+ </tr>
705
+ <tr>
706
+ <td>250</td>
707
+ <td>Multiple Function Call</td>
708
+ <td>xlam_multiple_tool_multiple_call</td>
709
+ <td>Parallel execution and result aggregation capabilities</td>
710
+ </tr>
711
+ <tr>
712
+ <td>100</td>
713
+ <td>Irrelevant Query</td>
714
+ <td>BFCL_v3_irrelevance</td>
715
+ <td>Recognition of tool mismatches with user needs</td>
716
+ </tr>
717
+ <tr>
718
+ <td>100</td>
719
+ <td>Long Context</td>
720
+ <td>tau_long_context</td>
721
+ <td>Extended interactions and complex instructions</td>
722
+ </tr>
723
+ <tr>
724
+ <td rowspan="5">Multi-Turn</td>
725
+ <td>80</td>
726
+ <td>Single Function Call</td>
727
+ <td>BFCL_v3_multi_turn_base_single_func_call</td>
728
+ <td>Conversational function calling abilities</td>
729
+ </tr>
730
+ <tr>
731
+ <td>50</td>
732
+ <td>Multiple Function Call</td>
733
+ <td>BFCL_v3_multi_turn_base_multi_func_call</td>
734
+ <td>Multiple function calls in conversation</td>
735
+ </tr>
736
+ <tr>
737
+ <td>100</td>
738
+ <td>Missing Function</td>
739
+ <td>BFCL_v3_multi_turn_miss_func</td>
740
+ <td>Graceful handling of unavailable tools</td>
741
+ </tr>
742
+ <tr>
743
+ <td>100</td>
744
+ <td>Missing Parameters</td>
745
+ <td>BFCL_v3_multi_turn_miss_param</td>
746
+ <td>Parameter collection and incomplete information</td>
747
+ </tr>
748
+ <tr>
749
+ <td>100</td>
750
+ <td>Composite</td>
751
+ <td>BFCL_v3_multi_turn_composite</td>
752
+ <td>Overall robustness in complex scenarios</td>
753
+ </tr>
754
+ </tbody>
755
+ </table>
756
+ </div>
757
+ </div>
758
+
759
+ <!-- Features Grid Section -->
760
+ <div class="features-grid">
761
+ <div class="feature-card">
762
+ <div class="feature-icon">
763
+ <svg width="24" height="24" fill="none" stroke="var(--accent-blue)" stroke-width="2" viewBox="0 0 24 24">
764
+ <path d="M22 12h-4l-3 9L9 3l-3 9H2"/>
765
+ </svg>
766
+ </div>
767
+ <h3 class="feature-title">Make Better Decisions</h3>
768
+ <ul class="feature-list">
769
+ <li>Cost-effectiveness analysis</li>
770
+ <li>Business impact metrics</li>
771
+ <li>Vendor strategy insights</li>
772
+ </ul>
773
+ </div>
774
+
775
+ <div class="feature-card">
776
+ <div class="feature-icon">
777
+ <svg width="24" height="24" fill="none" stroke="var(--accent-purple)" stroke-width="2" viewBox="0 0 24 24">
778
+ <path d="M21 16V8a2 2 0 0 0-1-1.73l-7-4a2 2 0 0 0-2 0l-7 4A2 2 0 0 0 3 8v8a2 2 0 0 0 1 1.73l7 4a2 2 0 0 0 2 0l7-4A2 2 0 0 0 21 16z"/>
779
+ </svg>
780
+ </div>
781
+ <h3 class="feature-title">360° Domain Evaluation</h3>
782
+ <ul class="feature-list">
783
+ <li>Cross-domain evaluation</li>
784
+ <li>Real-world use cases</li>
785
+ <li>Edge case evaluation</li>
786
+ </ul>
787
+ </div>
788
+
789
+ <div class="feature-card">
790
+ <div class="feature-icon">
791
+ <svg width="24" height="24" fill="none" stroke="var(--accent-pink)" stroke-width="2" viewBox="0 0 24 24">
792
+ <path d="M21 2v6h-6M3 12a9 9 0 0 1 15-6.7L21 8M3 12a9 9 0 0 0 15 6.7L21 16M21 22v-6h-6"/>
793
+ </svg>
794
+ </div>
795
+ <h3 class="feature-title">Updated Periodically</h3>
796
+ <ul class="feature-list">
797
+ <li>12 private models evaluated</li>
798
+ <li>5 open source models included</li>
799
+ <li>Monthly model additions</li>
800
+ </ul>
801
+ </div>
802
+ </div>
803
+ """
results.csv CHANGED
@@ -1,5 +1,5 @@
1
  Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
2
- gemini-2.0-flash-exp,Private,Normal,Google,0.075,0.3,0.935,0.94,0.93,0.86,0.95,0.9,0.99,0.95,0.94,0.83,0.91,0.98,0.96,0.98,0.98,0.88,0.975
3
  gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
4
  gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
5
  gemini-1.5-pro,Private,Normal,Google,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
 
1
  Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
2
+ gemini-2.0-flash-exp,Private,Normal,Google,0.1,0.4,0.935,0.94,0.93,0.86,0.95,0.9,0.99,0.95,0.94,0.83,0.91,0.98,0.96,0.98,0.98,0.88,0.975
3
  gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
4
  gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
5
  gemini-1.5-pro,Private,Normal,Google,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
tabs/leaderboard.py CHANGED
@@ -1,6 +1,6 @@
1
  import gradio as gr
2
 
3
- from data_loader import CATEGORIES, DESCRIPTION_HTML
4
  from visualization import (
5
  get_performance_chart,
6
  get_performance_cost_chart,
@@ -9,7 +9,6 @@ from utils import (
9
  get_rank_badge,
10
  get_score_bar,
11
  get_type_badge,
12
- get_output_type_badge,
13
  )
14
 
15
  def filter_leaderboard(df, model_type, category, sort_by):
@@ -181,7 +180,8 @@ def create_leaderboard_tab(df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS):
181
  output = gr.HTML()
182
  plot1 = gr.Plot()
183
  plot2 = gr.Plot()
184
- gr.Markdown(METHODOLOGY)
 
185
 
186
  for input_comp in [model_type, category, sort_by]:
187
  input_comp.change(
 
1
  import gradio as gr
2
 
3
+ from data_loader import CATEGORIES, DESCRIPTION_HTML, CARDS
4
  from visualization import (
5
  get_performance_chart,
6
  get_performance_cost_chart,
 
9
  get_rank_badge,
10
  get_score_bar,
11
  get_type_badge,
 
12
  )
13
 
14
  def filter_leaderboard(df, model_type, category, sort_by):
 
180
  output = gr.HTML()
181
  plot1 = gr.Plot()
182
  plot2 = gr.Plot()
183
+
184
+ gr.HTML(METHODOLOGY)
185
 
186
  for input_comp in [model_type, category, sort_by]:
187
  input_comp.change(