"""Data loading, category definitions, CSS, and static page content for the
agent leaderboard app.

This module was reflowed from a whitespace-mangled source; all string
constants keep their original textual content.
"""

from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd

# Names of all benchmark datasets, derived from the parquet files on disk.
DATASETS = [Path(file).stem for file in glob("datasets/*.parquet")]

# Score-filter ticks 0.0, 0.1, ..., 1.0 (round() removes arange float noise).
SCORES = [round(x, 2) for x in np.arange(0, 1.1, 0.1).tolist()]


def load_data():
    """Load and preprocess the data.

    Reads ``results.csv``, drops rows with missing values, and adds a
    combined "IO Cost" column that blends input and output token cost
    at a 3:1 input:output ratio.

    Returns:
        pd.DataFrame: cleaned results with the extra "IO Cost" column.

    Raises:
        FileNotFoundError: if ``results.csv`` is not present.
    """
    df = pd.read_csv("results.csv").dropna()
    # Add combined I/O cost column with 3:1 ratio
    df["IO Cost"] = (
        df["Input cost per million token"] * 0.75
        + df["Output cost per million token"] * 0.25
    )
    return df


# categories.py
# Maps each display category to the result columns that feed it.
CATEGORIES = {
    "Overall": ["Model Avg"],
    "Overall single turn": ["single turn perf"],
    "Overall multi turn": ["multi turn perf"],
    "Single func call": [
        "xlam_single_tool_single_call",
        "xlam_multiple_tool_single_call",
    ],
    "Multiple func call": [
        "xlam_multiple_tool_multiple_call",
        "xlam_single_tool_multiple_call",
        "BFCL_v3_multi_turn_base_multi_func_call",
    ],
    "Irrelevant query": ["BFCL_v3_irrelevance"],
    "Long context": ["tau_long_context", "BFCL_v3_multi_turn_long_context"],
    "Missing func": ["xlam_tool_miss", "BFCL_v3_multi_turn_miss_func"],
    "Missing params": ["BFCL_v3_multi_turn_miss_param"],
    "Composite": ["BFCL_v3_multi_turn_composite"],
}

# Styles for the chat / metrics / tool-info panels rendered by the app.
chat_css = """
/* Container styles */
.container { display: flex; gap: 1.5rem; height: calc(100vh - 100px); padding: 1rem; }

/* Chat panel styles */
.chat-panel { flex: 2; background: #1a1f2c; border-radius: 1rem; padding: 1rem; overflow-y: auto; max-height: calc(100vh - 120px); }

/* Message styles */
.message { padding: 1.2rem; margin: 0.8rem; border-radius: 1rem; font-family: monospace; box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); }
.system { background: linear-gradient(135deg, #8e44ad, #9b59b6); }
.user { background: linear-gradient(135deg, #2c3e50, #3498db); margin-left: 2rem; }
.assistant { background: linear-gradient(135deg, #27ae60, #2ecc71); margin-right: 2rem; }
.role-badge { display: inline-block; padding: 0.3rem 0.8rem; border-radius: 0.5rem; font-weight: bold; margin-bottom: 0.8rem; font-size: 0.9rem; text-transform: uppercase; letter-spacing: 0.05em; }
.system-role { background-color: #8e44ad; color: white; }
.user-role { background-color: #3498db; color: white; }
.assistant-role { background-color: #27ae60; color: white; }
.content { white-space: pre-wrap; word-break: break-word; color: #f5f6fa; line-height: 1.5; }

/* Metrics panel styles */
.metrics-panel { flex: 1; display: flex; flex-direction: column; gap: 2rem; padding: 1.5rem; background: #1a1f2c; border-radius: 1rem; }
.metric-section { background: #1E293B; padding: 1.5rem; border-radius: 1rem; }
.score-section { text-align: center; }
.score-display { font-size: 3rem; font-weight: bold; color: #4ADE80; line-height: 1; margin: 0.5rem 0; }
.explanation-text { color: #E2E8F0; line-height: 1.6; font-size: 0.95rem; }

/* Tool info panel styles */
.tool-info-panel { background: #1a1f2c; padding: 1.5rem; border-radius: 1rem; color: #f5f6fa; }
.tool-section { margin-bottom: 1.5rem; }
.tool-name { font-size: 1.2rem; color: #4ADE80; font-weight: bold; margin-bottom: 0.5rem; }
.tool-description { color: #E2E8F0; line-height: 1.6; margin-bottom: 1rem; }
.tool-parameters .parameter { margin: 0.5rem 0; padding: 0.5rem; background: rgba(255, 255, 255, 0.05); border-radius: 0.5rem; }
.param-name { color: #63B3ED; font-weight: bold; margin-right: 0.5rem; }
.tool-examples .example { margin: 0.5rem 0; padding: 0.5rem; background: rgba(255, 255, 255, 0.05); border-radius: 0.5rem; font-family: monospace; }

/* Custom scrollbar */
::-webkit-scrollbar { width: 8px; }
::-webkit-scrollbar-track { background: rgba(255, 255, 255, 0.1); border-radius: 4px; }
::-webkit-scrollbar-thumb { background: linear-gradient(45deg, #3498db, #2ecc71); border-radius: 4px; }

/* Title styles */
.title { color: #63B3ED; font-size: 2rem; font-weight: bold; text-align: center; margin-bottom: 1.5rem; padding: 1rem; }

/* Headers */
h3 { color: #63B3ED; margin: 0 0 1rem 0; font-size: 1.1rem; font-weight: 500; letter-spacing: 0.05em; }
"""

# Shared snippet prepended to page headers (currently empty).
COMMON = """
"""

DESCRIPTION_HTML = """
🎯 Purpose Latest Update: Feb 2025

This comprehensive benchmark evaluates language models' ability to effectively utilize tools and functions in complex scenarios.

🔍 What We Evaluate
🔄 Single/Multi-turn Interactions
🧩 Function Composition
⚡ Error Handling
📊 Key Results
✅ Accuracy Performance
💰 Open Vs Closed Source
⚖️ Overall Effectiveness
💡 Use the filters below to explore different aspects of the evaluation and compare model performance across various dimensions.
"""

HEADER_CONTENT = (
    COMMON
    + """
Welcome to the
Agent Leaderboard!
The landscape of AI agents is evolving rapidly, with major tech CEOs predicting 2025 as a pivotal year. We built this leaderboard to answer one simple question:
"How do AI agents perform in real-world agentic scenarios?"
"""
)

CARDS = """
17
Total Models
12 Private
5 Open Source
14
Evaluation Datasets
Cross-Domain Testing
Real-world use cases
TSQ
Evaluation Metric
Tool Selection Quality
GPT-4o Based Judge
"""

METHODOLOGY = """

Methodology

Overview

The Berkeley Function Calling Leaderboard (BFCL) evaluates language models' ability to effectively use tools and maintain coherent multi-turn conversations. Our evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.

Tool Selection Quality (TSQ) Metric

Dataset Structure

Type Samples Category Dataset Name Purpose
Single-Turn 100 + 100 Single Function Call xlam_single_tool_single_call Evaluates basic ability to read documentation and make single function calls
200 + 50 Multiple Function Call xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call Tests parallel execution and result aggregation capabilities
100 Irrelevant Query BFCL_v3_irrelevance Tests ability to recognize when available tools don't match user needs
100 Long Context tau_long_context Assesses handling of extended interactions and complex instructions
Multi-Turn 50 + 30 Single Function Call BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call Tests basic conversational function calling abilities
50 Multiple Function Call BFCL_v3_multi_turn_base_multi_func_call Evaluates handling of multiple function calls in conversation
100 Missing Function BFCL_v3_multi_turn_miss_func Tests graceful handling of unavailable tools
100 Missing Parameters BFCL_v3_multi_turn_miss_param Assesses parameter collection and handling incomplete information
100 Composite BFCL_v3_multi_turn_composite Tests overall robustness in complex scenarios

Make Better Decisions

  • Cost-effectiveness analysis
  • Business impact metrics
  • Vendor strategy insights

360° Domain Evaluation

  • Cross-domain evaluation
  • Real-world use cases
  • Edge case evaluation

Updated Periodically

  • 12 private models evaluated
  • 5 open source models included
  • Monthly model additions
"""