# Hugging Face Spaces page status ("Running on CPU Upgrade"): web-UI text captured with this file, not part of the program.
from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd
# Dataset names offered by the app: the stem of every parquet file under
# ``datasets/``. glob() returns entries in filesystem order, so sort them to
# keep the list stable between runs and machines.
DATASETS = sorted(Path(file).stem for file in glob("datasets/*.parquet"))

# Score thresholds 0.0, 0.1, ..., 1.0. linspace pins the number of points
# (11) instead of relying on a float step reaching the end point; rounding
# removes binary noise such as 0.30000000000000004.
SCORES = [round(x, 2) for x in np.linspace(0.0, 1.0, 11).tolist()]
def load_data(path="results.csv"):
    """Load the results table and add a blended token-cost column.

    Args:
        path: CSV location, or any file-like object accepted by
            ``pandas.read_csv``. Defaults to ``"results.csv"``, the file the
            function has always read, so existing ``load_data()`` calls are
            unaffected.

    Returns:
        A DataFrame in which every row containing a missing value has been
        dropped, with an extra ``"IO Cost"`` column.

    Raises:
        KeyError: if either cost-per-million-token column is absent.
    """
    df = pd.read_csv(path).dropna()

    # Combined I/O cost with a 3:1 input-to-output ratio.
    input_weight, output_weight = 0.75, 0.25
    df["IO Cost"] = (
        df["Input cost per million token"] * input_weight
        + df["Output cost per million token"] * output_weight
    )
    return df
# categories.py
# Maps each display category to the list of column/dataset names grouped
# under it. The three "Overall" entries each point at a single aggregate name.
CATEGORIES = {
    "Overall": ["Model Avg"],
    "Overall single turn": ["single turn perf"],
    "Overall multi turn": ["multi turn perf"],
    "Single func call": [
        "xlam_single_tool_single_call",
        "xlam_multiple_tool_single_call",
    ],
    "Multiple func call": [
        "xlam_multiple_tool_multiple_call",
        "xlam_single_tool_multiple_call",
        "BFCL_v3_multi_turn_base_multi_func_call",
    ],
    "Irrelevant query": ["BFCL_v3_irrelevance"],
    "Long context": ["tau_long_context", "BFCL_v3_multi_turn_long_context"],
    "Missing func": ["xlam_tool_miss", "BFCL_v3_multi_turn_miss_func"],
    "Missing params": ["BFCL_v3_multi_turn_miss_param"],
    "Composite": ["BFCL_v3_multi_turn_composite"],
}
# Stylesheet text for the chat view: the two-column container, per-role
# message bubbles and badges, the metrics panel, the tool-info panel, a custom
# WebKit scrollbar, and title/header typography. It is plain CSS held in a
# string; the markup that uses these class names is not defined in this block.
chat_css = """
/* Container styles */
.container {
    display: flex;
    gap: 1.5rem;
    height: calc(100vh - 100px);
    padding: 1rem;
}
/* Chat panel styles */
.chat-panel {
    flex: 2;
    background: #1a1f2c;
    border-radius: 1rem;
    padding: 1rem;
    overflow-y: auto;
    max-height: calc(100vh - 120px);
}
/* Message styles */
.message {
    padding: 1.2rem;
    margin: 0.8rem;
    border-radius: 1rem;
    font-family: monospace;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}
.system {
    background: linear-gradient(135deg, #8e44ad, #9b59b6);
}
.user {
    background: linear-gradient(135deg, #2c3e50, #3498db);
    margin-left: 2rem;
}
.assistant {
    background: linear-gradient(135deg, #27ae60, #2ecc71);
    margin-right: 2rem;
}
.role-badge {
    display: inline-block;
    padding: 0.3rem 0.8rem;
    border-radius: 0.5rem;
    font-weight: bold;
    margin-bottom: 0.8rem;
    font-size: 0.9rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
}
.system-role {
    background-color: #8e44ad;
    color: white;
}
.user-role {
    background-color: #3498db;
    color: white;
}
.assistant-role {
    background-color: #27ae60;
    color: white;
}
.content {
    white-space: pre-wrap;
    word-break: break-word;
    color: #f5f6fa;
    line-height: 1.5;
}
/* Metrics panel styles */
.metrics-panel {
    flex: 1;
    display: flex;
    flex-direction: column;
    gap: 2rem;
    padding: 1.5rem;
    background: #1a1f2c;
    border-radius: 1rem;
}
.metric-section {
    background: #1E293B;
    padding: 1.5rem;
    border-radius: 1rem;
}
.score-section {
    text-align: center;
}
.score-display {
    font-size: 3rem;
    font-weight: bold;
    color: #4ADE80;
    line-height: 1;
    margin: 0.5rem 0;
}
.explanation-text {
    color: #E2E8F0;
    line-height: 1.6;
    font-size: 0.95rem;
}
/* Tool info panel styles */
.tool-info-panel {
    background: #1a1f2c;
    padding: 1.5rem;
    border-radius: 1rem;
    color: #f5f6fa;
}
.tool-section {
    margin-bottom: 1.5rem;
}
.tool-name {
    font-size: 1.2rem;
    color: #4ADE80;
    font-weight: bold;
    margin-bottom: 0.5rem;
}
.tool-description {
    color: #E2E8F0;
    line-height: 1.6;
    margin-bottom: 1rem;
}
.tool-parameters .parameter {
    margin: 0.5rem 0;
    padding: 0.5rem;
    background: rgba(255, 255, 255, 0.05);
    border-radius: 0.5rem;
}
.param-name {
    color: #63B3ED;
    font-weight: bold;
    margin-right: 0.5rem;
}
.tool-examples .example {
    margin: 0.5rem 0;
    padding: 0.5rem;
    background: rgba(255, 255, 255, 0.05);
    border-radius: 0.5rem;
    font-family: monospace;
}
/* Custom scrollbar */
::-webkit-scrollbar {
    width: 8px;
}
::-webkit-scrollbar-track {
    background: rgba(255, 255, 255, 0.1);
    border-radius: 4px;
}
::-webkit-scrollbar-thumb {
    background: linear-gradient(45deg, #3498db, #2ecc71);
    border-radius: 4px;
}
/* Title styles */
.title {
    color: #63B3ED;
    font-size: 2rem;
    font-weight: bold;
    text-align: center;
    margin-bottom: 1.5rem;
    padding: 1rem;
}
/* Headers */
h3 {
    color: #63B3ED;
    margin: 0 0 1rem 0;
    font-size: 1.1rem;
    font-weight: 500;
    letter-spacing: 0.05em;
}
"""
# Shared <style> block that declares the theme's CSS custom properties on
# :root, once for dark and once for light colour-scheme preference. Note that
# the variables are declared only inside the two media queries.
COMMON = """
<style>
    @media (prefers-color-scheme: dark) {
        :root {
            --bg-primary: #0B0B19;
            --bg-secondary: rgba(19, 19, 37, 0.4);
            --bg-hover: rgba(30, 30, 45, 0.95);
            --text-primary: #ffffff;
            --text-secondary: #e2e8f0;
            --text-tertiary: #e2e8f0;
            --border-color: rgba(31, 41, 55, 0.5);
            --border-hover: rgba(79, 70, 229, 0.4);
            --card-bg: rgba(17, 17, 27, 0.4);
            --accent-color: #ffffff;
            --accent-bg: rgba(79, 70, 229, 0.1);
            --blue-gradient: linear-gradient(45deg, #60A5FA, #3B82F6);
            --purple-gradient: linear-gradient(45deg, #A78BFA, #8B5CF6);
            --pink-gradient: linear-gradient(45deg, #F472B6, #EC4899);
            --shadow-color: rgba(0, 0, 0, 0.2);
        }
    }
    @media (prefers-color-scheme: light) {
        :root {
            --bg-primary: #ffffff;
            --bg-secondary: rgba(243, 244, 246, 0.4);
            --bg-hover: rgba(229, 231, 235, 0.95);
            --text-primary: #1F2937;
            --text-secondary: #4B5563;
            --text-tertiary: #6B7280;
            --border-color: rgba(209, 213, 219, 0.5);
            --border-hover: rgba(79, 70, 229, 0.4);
            --card-bg: rgba(249, 250, 251, 0.4);
            --accent-color: #4F46E5;
            --accent-bg: rgba(79, 70, 229, 0.1);
            --blue-gradient: linear-gradient(45deg, #3B82F6, #2563EB);
            --purple-gradient: linear-gradient(45deg, #8B5CF6, #EF43CD);
            --pink-gradient: linear-gradient(45deg, #EC4899, #DB2777);
            --shadow-color: rgba(0, 0, 0, 0.1);
        }
    }
</style>
"""
# Static HTML for the description card: purpose blurb with an update badge,
# a "What We Evaluate" grid, a "Key Results" grid and a usage hint.
# Styling is inline and driven by CSS custom properties. --accent-blue and
# --accent-purple are not declared by COMMON, so the badge gradient carries
# explicit fallback colours; without them the gradient is invalid whenever
# those variables are absent and the white badge text loses its background.
DESCRIPTION_HTML = """
<div style="
    background: var(--bg-secondary, rgba(30, 30, 45, 0.95));
    border-radius: 12px;
    padding: 24px;
    margin: 16px 0;
">
    <div style="
        display: flex;
        flex-direction: column;
        gap: 16px;
    ">
        <div style="
            color: var(--text-primary);
            font-size: 1.1rem;
            font-weight: 500;
            display: flex;
            align-items: center;
            gap: 8px;
        ">
            🎯 Purpose
            <span style="
                background: linear-gradient(to right, var(--accent-blue, #60A5FA), var(--accent-purple, #A78BFA));
                color: white;
                padding: 4px 12px;
                border-radius: 100px;
                font-size: 0.9rem;
            ">Latest Update: Feb 2025</span>
        </div>
        <p style="
            color: var(--text-secondary);
            margin: 0;
            line-height: 1.6;
        ">
            This comprehensive benchmark evaluates language models' ability to effectively utilize tools and functions in complex scenarios.
        </p>
        <div style="
            color: var(--text-primary);
            font-size: 1.1rem;
            font-weight: 500;
            margin-top: 8px;
        ">
            🔍 What We Evaluate
        </div>
        <div style="
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
            gap: 16px;
            color: var(--text-secondary);
        ">
            <div style="display: flex; gap: 8px; align-items: center;">
                🔄 Single/Multi-turn Interactions
            </div>
            <div style="display: flex; gap: 8px; align-items: center;">
                🧩 Function Composition
            </div>
            <div style="display: flex; gap: 8px; align-items: center;">
                ⚡ Error Handling
            </div>
        </div>
        <div style="
            color: var(--text-primary);
            font-size: 1.1rem;
            font-weight: 500;
            margin-top: 8px;
        ">
            📊 Key Results
        </div>
        <div style="
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
            gap: 16px;
            color: var(--text-secondary);
        ">
            <div style="display: flex; gap: 8px; align-items: center;">
                ✅ Accuracy Performance
            </div>
            <div style="display: flex; gap: 8px; align-items: center;">
                💰 Open Vs Closed Source
            </div>
            <div style="display: flex; gap: 8px; align-items: center;">
                ⚖️ Overall Effectiveness
            </div>
        </div>
        <div style="
            border-left: 4px solid var(--accent-color, #4F46E5);
            padding-left: 12px;
            margin-top: 8px;
            color: var(--text-secondary);
            font-style: italic;
        ">
            💡 Use the filters below to explore different aspects of the evaluation and compare model performance across various dimensions.
        </div>
    </div>
</div>
"""
# Page header: the shared theme variables from COMMON followed by the header
# stylesheet and the title / description / action-button markup.
# The stylesheet also defines the .metrics-grid / .metric-* classes.
# Gradient text rules carry the standard `background-clip: text` next to the
# -webkit- prefixed form so they do not depend on the vendor prefix alone.
# The three action links currently point at "#".
HEADER_CONTENT = (
    COMMON
    + """
<style>
    .header-wrapper {
        background: var(--bg-primary);
        padding: 4rem 2rem;
        border-radius: 16px;
        margin-bottom: 0;
        transition: all 0.3s ease;
    }
    .header-content {
        max-width: 72rem;
        margin: 0 auto;
    }
    .title-section {
        text-align: center;
        margin-bottom: 4rem;
    }
    .title-gradient {
        font-size: 5rem;
        font-weight: 800;
        line-height: 1.1;
        background: var(--purple-gradient);
        -webkit-background-clip: text;
        background-clip: text;
        -webkit-text-fill-color: transparent;
        margin-bottom: 0.5rem;
    }
    .subtitle-white {
        font-size: 5rem;
        font-weight: 800;
        line-height: 1.1;
        color: var(--text-primary);
        margin-bottom: 3rem;
        transition: color 0.3s ease;
    }
    .description {
        color: var(--text-secondary);
        font-size: 1.25rem;
        line-height: 1.75;
        max-width: 800px;
        margin: 0 auto;
        text-align: center;
        transition: color 0.3s ease;
    }
    .highlight-question {
        background: var(--blue-gradient);
        -webkit-background-clip: text;
        background-clip: text;
        -webkit-text-fill-color: transparent;
        display: block;
        margin-top: 1rem;
        font-size: 1.5rem;
        font-weight: 500;
    }
    .metrics-grid {
        display: grid;
        grid-template-columns: repeat(3, 1fr);
        gap: 1.5rem;
        margin-top: 4rem;
    }
    .metric-card {
        background: var(--bg-secondary);
        border: 1px solid var(--border-color);
        border-radius: 1rem;
        padding: 2rem;
        transition: all 0.3s ease;
        align-items: center;
    }
    .metric-card:hover {
        transform: translateY(-5px);
        border-color: var(--border-hover);
        box-shadow: 0 4px 20px var(--shadow-color);
    }
    .metric-number {
        font-size: 4rem;
        font-weight: 800;
        margin-bottom: 1rem;
    }
    .metric-blue {
        background: var(--blue-gradient);
        -webkit-background-clip: text;
        background-clip: text;
        -webkit-text-fill-color: transparent;
    }
    .metric-purple {
        background: var(--purple-gradient);
        -webkit-background-clip: text;
        background-clip: text;
        -webkit-text-fill-color: transparent;
    }
    .metric-pink {
        background: var(--pink-gradient);
        -webkit-background-clip: text;
        background-clip: text;
        -webkit-text-fill-color: transparent;
    }
    .metric-label {
        color: var(--text-secondary);
        font-size: 1.5rem;
        margin-bottom: 1.5rem;
        transition: color 0.3s ease;
    }
    .metric-detail {
        font-size: 1.125rem;
        line-height: 1.75;
        margin-top: 0.5rem;
        transition: color 0.3s ease;
    }
    .metric-detail.primary {
        color: var(--accent-color);
    }
    .metric-detail.secondary {
        color: var(--text-secondary);
    }
    .actions {
        display: flex;
        gap: 1rem;
        justify-content: center;
        margin-top: 3rem;
    }
    .action-button {
        display: flex;
        align-items: center;
        gap: 0.5rem;
        padding: 0.75rem 1.5rem;
        background: var(--bg-secondary);
        border: 1px solid var(--border-color);
        border-radius: 100px;
        color: var(--text-primary) !important;
        text-decoration: none !important;
        font-size: 0.95rem;
        transition: all 0.3s ease;
    }
    .action-button:hover {
        transform: translateY(-2px);
        border-color: var(--accent-color);
        background: var(--accent-bg);
    }
    @media (max-width: 768px) {
        .title-gradient, .subtitle-white {
            font-size: 3rem;
        }
        .metrics-grid {
            grid-template-columns: 1fr;
        }
    }
</style>
<div class="header-wrapper">
    <div class="header-content">
        <div class="title-section">
            <div class="subtitle-white">Welcome to the</div>
            <div class="title-gradient">Agent Leaderboard!</div>
            <div class="description">
                The landscape of AI agents is evolving rapidly, with major tech CEOs predicting 2025 as a pivotal year.
                We built this leaderboard to answer one simple question:
                <div class="highlight-question">
                    "How do AI agents perform in real-world agentic scenarios?"
                </div>
            </div>
        </div>
        <div class="actions">
            <a href="#" class="action-button">
                <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                    <path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
                    <line x1="8" y1="12" x2="16" y2="12"/>
                </svg>
                Blog
            </a>
            <a href="#" class="action-button">
                <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                    <path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
                </svg>
                GitHub
            </a>
            <a href="#" class="action-button">
                <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                    <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
                    <polyline points="7 10 12 15 17 10"/>
                    <line x1="12" y1="15" x2="12" y2="3"/>
                </svg>
                Dataset
            </a>
        </div>
    </div>
</div>
"""
)
# Three summary cards (model count, dataset count, evaluation metric) laid
# out in a .metrics-grid. The figures are hard-coded in the markup, and the
# class names are styled by a stylesheet defined outside this block.
CARDS = """ <div class="metrics-grid">
    <div class="metric-card">
        <div class="metric-number metric-blue">17</div>
        <div class="metric-label">Total Models</div>
        <div class="metric-detail primary">12 Private</div>
        <div class="metric-detail primary">5 Open Source</div>
    </div>
    <div class="metric-card">
        <div class="metric-number metric-purple">14</div>
        <div class="metric-label">Evaluation Datasets</div>
        <div class="metric-detail primary">Cross-Domain Testing</div>
        <div class="metric-detail primary">Real-world use cases</div>
    </div>
    <div class="metric-card">
        <div class="metric-number metric-pink">TSQ</div>
        <div class="metric-label">Evaluation Metric</div>
        <div class="metric-detail primary">Tool Selection Quality</div>
        <div class="metric-detail primary">GPT-4o Based Judge</div>
    </div>
</div>"""
# Methodology page fragment: its own theme variables (a different set from
# the header's, including --accent-blue/-purple/-pink), styles for the
# features grid and the dataset table, then the overview text, the TSQ bullet
# list, the dataset table and three feature cards.
# Two "[... CSS remains the same]" placeholders survive in the stylesheet;
# the rules they stand for (e.g. for .methodology-*, .feature-*) are not
# present in this string. Both are kept as CSS comments so the <style> block
# stays valid. The outer .methodology-section <div> is closed at the end of
# the fragment.
METHODOLOGY = """
<style>
    @media (prefers-color-scheme: dark) {
        :root {
            --bg-primary: #0B0B19;
            --bg-secondary: rgba(19, 19, 37, 0.4);
            --bg-tertiary: rgba(30, 30, 45, 0.95);
            --text-primary: #ffffff;
            --text-secondary: #94A3B8;
            --text-tertiary: #E2E8F0;
            --border-primary: rgba(31, 41, 55, 0.5);
            --border-hover: rgba(79, 70, 229, 0.4);
            --accent-blue: #60A5FA;
            --accent-purple: #A78BFA;
            --accent-pink: #F472B6;
            --card-hover-bg: rgba(79, 70, 229, 0.1);
            --shadow-color: rgba(79, 70, 229, 0.1);
        }
    }
    @media (prefers-color-scheme: light) {
        :root {
            --bg-primary: #ffffff;
            --bg-secondary: rgba(243, 244, 246, 0.4);
            --bg-tertiary: rgba(249, 250, 251, 0.95);
            --text-primary: #111827;
            --text-secondary: #4B5563;
            --text-tertiary: #6B7280;
            --border-primary: rgba(209, 213, 219, 0.5);
            --border-hover: rgba(79, 70, 229, 0.4);
            --accent-blue: #3B82F6;
            --accent-purple: #8B5CF6;
            --accent-pink: #EC4899;
            --card-hover-bg: rgba(243, 244, 246, 0.8);
            --shadow-color: rgba(0, 0, 0, 0.1);
        }
    }
    /* [Previous CSS remains the same until features-grid] */
    /* Features Grid Section */
    .features-grid {
        display: grid;
        grid-template-columns: repeat(3, 1fr);
        gap: 1.5rem;
        width: 100%;
        padding: 2rem 0;
    }
    .dataset-table {
        width: 100%;
        border-collapse: separate;
        border-spacing: 0;
        margin: 2rem 0;
        background: var(--bg-tertiary);
        border-radius: 1rem;
        overflow: hidden;
        box-shadow: 0 4px 20px var(--shadow-color);
    }
    .dataset-table thead {
        background: linear-gradient(90deg, var(--accent-blue), var(--accent-purple));
    }
    .dataset-table th {
        padding: 1.25rem 1rem;
        text-align: left;
        color: white;
        font-weight: 600;
        font-size: 1rem;
    }
    .dataset-table td {
        padding: 1rem;
        border-bottom: 1px solid var(--border-primary);
        color: var(--text-secondary);
        transition: all 0.2s ease;
    }
    .dataset-table tbody tr:hover td {
        background: var(--card-hover-bg);
        color: var(--text-primary);
    }
    .dataset-table td[rowspan] {
        background: var(--bg-secondary);
        color: var(--accent-blue);
        font-weight: 600;
        border-right: 1px solid var(--border-primary);
    }
    .purpose-cell {
        max-width: 300px;
        line-height: 1.5;
    }
    .category-cell {
        color: var(--accent-purple);
        font-weight: 500;
    }
    .dataset-name {
        font-family: monospace;
        color: var(--accent-pink);
        font-size: 0.9rem;
    }
    /* [Rest of the CSS remains the same] */
</style>
<!-- Methodology Section -->
<div class="methodology-section">
    <h1 class="methodology-title">Methodology</h1>
    <h2 class="methodology-subtitle">Overview</h2>
    <p class="methodology-text">
        The Berkeley Function Calling Leaderboard (BFCL) evaluates language models' ability to effectively use tools
        and maintain coherent multi-turn conversations. Our evaluation focuses on both basic functionality and edge
        cases that challenge real-world applicability.
    </p>
    <h2 class="methodology-subtitle">Tool Selection Quality (TSQ) Metric</h2>
    <ul class="metric-list">
        <li>Correctly identify when tools are needed</li>
        <li>Select the appropriate tool for the task</li>
        <li>Handle cases where no suitable tool exists</li>
        <li>Maintain context across multiple interactions</li>
        <li>Consider cost-effectiveness of tool usage</li>
        <li>Optimize for minimal necessary tool calls</li>
    </ul>
    <h2 class="methodology-subtitle">Dataset Structure</h2>
    <div class="table-container">
        <table class="dataset-table">
            <thead>
                <tr>
                    <th>Type</th>
                    <th>Samples</th>
                    <th>Category</th>
                    <th>Dataset Name</th>
                    <th>Purpose</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td rowspan="4">Single-Turn</td>
                    <td>100 + 100</td>
                    <td class="category-cell">Single Function Call</td>
                    <td class="dataset-name">xlam_single_tool_single_call</td>
                    <td class="purpose-cell">Evaluates basic ability to read documentation and make single function calls</td>
                </tr>
                <tr>
                    <td>200 + 50</td>
                    <td class="category-cell">Multiple Function Call</td>
                    <td class="dataset-name">xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call</td>
                    <td class="purpose-cell">Tests parallel execution and result aggregation capabilities</td>
                </tr>
                <tr>
                    <td>100</td>
                    <td class="category-cell">Irrelevant Query</td>
                    <td class="dataset-name">BFCL_v3_irrelevance</td>
                    <td class="purpose-cell">Tests ability to recognize when available tools don't match user needs</td>
                </tr>
                <tr>
                    <td>100</td>
                    <td class="category-cell">Long Context</td>
                    <td class="dataset-name">tau_long_context</td>
                    <td class="purpose-cell">Assesses handling of extended interactions and complex instructions</td>
                </tr>
                <tr>
                    <td rowspan="5">Multi-Turn</td>
                    <td>50 + 30</td>
                    <td class="category-cell">Single Function Call</td>
                    <td class="dataset-name">BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call</td>
                    <td class="purpose-cell">Tests basic conversational function calling abilities</td>
                </tr>
                <tr>
                    <td>50</td>
                    <td class="category-cell">Multiple Function Call</td>
                    <td class="dataset-name">BFCL_v3_multi_turn_base_multi_func_call</td>
                    <td class="purpose-cell">Evaluates handling of multiple function calls in conversation</td>
                </tr>
                <tr>
                    <td>100</td>
                    <td class="category-cell">Missing Function</td>
                    <td class="dataset-name">BFCL_v3_multi_turn_miss_func</td>
                    <td class="purpose-cell">Tests graceful handling of unavailable tools</td>
                </tr>
                <tr>
                    <td>100</td>
                    <td class="category-cell">Missing Parameters</td>
                    <td class="dataset-name">BFCL_v3_multi_turn_miss_param</td>
                    <td class="purpose-cell">Assesses parameter collection and handling incomplete information</td>
                </tr>
                <tr>
                    <td>100</td>
                    <td class="category-cell">Composite</td>
                    <td class="dataset-name">BFCL_v3_multi_turn_composite</td>
                    <td class="purpose-cell">Tests overall robustness in complex scenarios</td>
                </tr>
            </tbody>
        </table>
    </div>
    <!-- Features Grid Section -->
    <div class="features-grid">
        <div class="feature-card">
            <div class="feature-icon">
                <svg width="24" height="24" fill="none" stroke="var(--accent-blue)" stroke-width="2" viewBox="0 0 24 24">
                    <path d="M22 12h-4l-3 9L9 3l-3 9H2"/>
                </svg>
            </div>
            <h3 class="feature-title">Make Better Decisions</h3>
            <ul class="feature-list">
                <li>Cost-effectiveness analysis</li>
                <li>Business impact metrics</li>
                <li>Vendor strategy insights</li>
            </ul>
        </div>
        <div class="feature-card">
            <div class="feature-icon">
                <svg width="24" height="24" fill="none" stroke="var(--accent-purple)" stroke-width="2" viewBox="0 0 24 24">
                    <path d="M21 16V8a2 2 0 0 0-1-1.73l-7-4a2 2 0 0 0-2 0l-7 4A2 2 0 0 0 3 8v8a2 2 0 0 0 1 1.73l7 4a2 2 0 0 0 2 0l7-4A2 2 0 0 0 21 16z"/>
                </svg>
            </div>
            <h3 class="feature-title">360° Domain Evaluation</h3>
            <ul class="feature-list">
                <li>Cross-domain evaluation</li>
                <li>Real-world use cases</li>
                <li>Edge case evaluation</li>
            </ul>
        </div>
        <div class="feature-card">
            <div class="feature-icon">
                <svg width="24" height="24" fill="none" stroke="var(--accent-pink)" stroke-width="2" viewBox="0 0 24 24">
                    <path d="M21 2v6h-6M3 12a9 9 0 0 1 15-6.7L21 8M3 12a9 9 0 0 0 15 6.7L21 16M21 22v-6h-6"/>
                </svg>
            </div>
            <h3 class="feature-title">Updated Periodically</h3>
            <ul class="feature-list">
                <li>12 private models evaluated</li>
                <li>5 open source models included</li>
                <li>Monthly model additions</li>
            </ul>
        </div>
    </div>
</div>
"""