Spaces:

galileo-ai
/

agent-leaderboard

Running on CPU Upgrade

agent-leaderboard / data_loader.py

Pratik Bhavsar

improved dataset table

91da2cc 5 months ago

25.8 kB

	import pandas as pd
	from glob import glob
	import numpy as np
	from pathlib import Path


	# Dataset names are the stems of the parquet files under ./datasets
	# (relative to the current working directory). glob() yields paths in
	# arbitrary, OS-dependent order, so sort them to keep the list stable
	# across machines and runs.
	DATASETS = sorted(Path(file).stem for file in glob("datasets/*.parquet"))

	# Score values 0.0, 0.1, ..., 1.0 (11 points). linspace fixes the number of
	# points explicitly; arange with a fractional step decides whether the end
	# point is included based on floating-point rounding.
	SCORES = [round(x, 2) for x in np.linspace(0, 1, 11).tolist()]

	def load_data(path="results.csv"):
	    """Load the results table and add a blended input/output cost column.

	    Args:
	        path: CSV file to read. Defaults to ``"results.csv"`` (resolved
	            against the current working directory), which is the file the
	            function always read before this parameter existed.

	    Returns:
	        A DataFrame with every CSV row that has no missing value in any
	        column, plus an ``"IO Cost"`` column.

	    Raises:
	        KeyError: If either cost-per-million-token column is absent.
	    """
	    # dropna() removes a row if ANY of its columns is missing.
	    df = pd.read_csv(path).dropna()

	    # Blend the two prices 3:1 (75% input, 25% output) into a single figure.
	    df["IO Cost"] = (
	        df["Input cost per million token"] * 0.75
	        + df["Output cost per million token"] * 0.25
	    )
	    return df


	# categories.py
	# Maps each display category to the list of names grouped under it.
	# NOTE(review): the names look like metric / dataset column names of the
	# results table - confirm against the code that consumes this mapping.
	CATEGORIES = {
	    # Aggregates
	    "Overall": ["Model Avg"],
	    "Overall single turn": ["single turn perf"],
	    "Overall multi turn": ["multi turn perf"],
	    # Function-calling groups
	    "Single func call": ["xlam_single_tool_single_call", "xlam_multiple_tool_single_call"],
	    "Multiple func call": [
	        "xlam_multiple_tool_multiple_call",
	        "xlam_single_tool_multiple_call",
	        "BFCL_v3_multi_turn_base_multi_func_call",
	    ],
	    # Edge-case groups
	    "Irrelevant query": ["BFCL_v3_irrelevance"],
	    "Long context": [
	        "tau_long_context",
	        "BFCL_v3_multi_turn_long_context",
	    ],
	    "Missing func": [
	        "xlam_tool_miss",
	        "BFCL_v3_multi_turn_miss_func",
	    ],
	    "Missing params": ["BFCL_v3_multi_turn_miss_param"],
	    "Composite": ["BFCL_v3_multi_turn_composite"],
	}


	# Stylesheet (plain CSS text, no <style> wrapper) for a chat-style view:
	# flex container, chat panel, per-role message bubbles and badges
	# (system / user / assistant), metrics panel with score display, tool info
	# panel, WebKit scrollbar, title and h3 headers. Colors are hard-coded
	# hex values rather than the CSS variables used by the strings further down.
	# NOTE(review): the consumer of this string is not in this file - confirm
	# where it is injected.
	chat_css = """
	/* Container styles */
	.container {
	display: flex;
	gap: 1.5rem;
	height: calc(100vh - 100px);
	padding: 1rem;
	}

	/* Chat panel styles */
	.chat-panel {
	flex: 2;
	background: #1a1f2c;
	border-radius: 1rem;
	padding: 1rem;
	overflow-y: auto;
	max-height: calc(100vh - 120px);
	}

	/* Message styles */
	.message {
	padding: 1.2rem;
	margin: 0.8rem;
	border-radius: 1rem;
	font-family: monospace;
	box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
	}

	.system {
	background: linear-gradient(135deg, #8e44ad, #9b59b6);
	}

	.user {
	background: linear-gradient(135deg, #2c3e50, #3498db);
	margin-left: 2rem;
	}

	.assistant {
	background: linear-gradient(135deg, #27ae60, #2ecc71);
	margin-right: 2rem;
	}

	.role-badge {
	display: inline-block;
	padding: 0.3rem 0.8rem;
	border-radius: 0.5rem;
	font-weight: bold;
	margin-bottom: 0.8rem;
	font-size: 0.9rem;
	text-transform: uppercase;
	letter-spacing: 0.05em;
	}

	.system-role {
	background-color: #8e44ad;
	color: white;
	}

	.user-role {
	background-color: #3498db;
	color: white;
	}

	.assistant-role {
	background-color: #27ae60;
	color: white;
	}

	.content {
	white-space: pre-wrap;
	word-break: break-word;
	color: #f5f6fa;
	line-height: 1.5;
	}

	/* Metrics panel styles */
	.metrics-panel {
	flex: 1;
	display: flex;
	flex-direction: column;
	gap: 2rem;
	padding: 1.5rem;
	background: #1a1f2c;
	border-radius: 1rem;
	}

	.metric-section {
	background: #1E293B;
	padding: 1.5rem;
	border-radius: 1rem;
	}

	.score-section {
	text-align: center;
	}

	.score-display {
	font-size: 3rem;
	font-weight: bold;
	color: #4ADE80;
	line-height: 1;
	margin: 0.5rem 0;
	}

	.explanation-text {
	color: #E2E8F0;
	line-height: 1.6;
	font-size: 0.95rem;
	}

	/* Tool info panel styles */
	.tool-info-panel {
	background: #1a1f2c;
	padding: 1.5rem;
	border-radius: 1rem;
	color: #f5f6fa;
	}

	.tool-section {
	margin-bottom: 1.5rem;
	}

	.tool-name {
	font-size: 1.2rem;
	color: #4ADE80;
	font-weight: bold;
	margin-bottom: 0.5rem;
	}

	.tool-description {
	color: #E2E8F0;
	line-height: 1.6;
	margin-bottom: 1rem;
	}

	.tool-parameters .parameter {
	margin: 0.5rem 0;
	padding: 0.5rem;
	background: rgba(255, 255, 255, 0.05);
	border-radius: 0.5rem;
	}

	.param-name {
	color: #63B3ED;
	font-weight: bold;
	margin-right: 0.5rem;
	}

	.tool-examples .example {
	margin: 0.5rem 0;
	padding: 0.5rem;
	background: rgba(255, 255, 255, 0.05);
	border-radius: 0.5rem;
	font-family: monospace;
	}

	/* Custom scrollbar */
	::-webkit-scrollbar {
	width: 8px;
	}

	::-webkit-scrollbar-track {
	background: rgba(255, 255, 255, 0.1);
	border-radius: 4px;
	}

	::-webkit-scrollbar-thumb {
	background: linear-gradient(45deg, #3498db, #2ecc71);
	border-radius: 4px;
	}

	/* Title styles */
	.title {
	color: #63B3ED;
	font-size: 2rem;
	font-weight: bold;
	text-align: center;
	margin-bottom: 1.5rem;
	padding: 1rem;
	}


	/* Headers */
	h3 {
	color: #63B3ED;
	margin: 0 0 1rem 0;
	font-size: 1.1rem;
	font-weight: 500;
	letter-spacing: 0.05em;
	}
	"""

	# Shared <style> block that declares theme CSS custom properties on :root,
	# once for `prefers-color-scheme: dark` and once for `light` (backgrounds,
	# text, borders, accent color and the blue/purple/pink gradients).
	# HEADER_CONTENT prepends this string to its own markup.
	# It does not declare --accent-blue / --accent-purple / --accent-pink.
	COMMON = """
	<style>
	@media (prefers-color-scheme: dark) {
	:root {
	--bg-primary: #0B0B19;
	--bg-secondary: rgba(19, 19, 37, 0.4);
	--bg-hover: rgba(30, 30, 45, 0.95);
	--text-primary: #ffffff;
	--text-secondary: #e2e8f0;
	--text-tertiary: #e2e8f0;
	--border-color: rgba(31, 41, 55, 0.5);
	--border-hover: rgba(79, 70, 229, 0.4);
	--card-bg: rgba(17, 17, 27, 0.4);
	--accent-color: #ffffff;
	--accent-bg: rgba(79, 70, 229, 0.1);
	--blue-gradient: linear-gradient(45deg, #60A5FA, #3B82F6);
	--purple-gradient: linear-gradient(45deg, #A78BFA, #8B5CF6);
	--pink-gradient: linear-gradient(45deg, #F472B6, #EC4899);
	--shadow-color: rgba(0, 0, 0, 0.2);
	}
	}

	@media (prefers-color-scheme: light) {
	:root {
	--bg-primary: #ffffff;
	--bg-secondary: rgba(243, 244, 246, 0.4);
	--bg-hover: rgba(229, 231, 235, 0.95);
	--text-primary: #1F2937;
	--text-secondary: #4B5563;
	--text-tertiary: #6B7280;
	--border-color: rgba(209, 213, 219, 0.5);
	--border-hover: rgba(79, 70, 229, 0.4);
	--card-bg: rgba(249, 250, 251, 0.4);
	--accent-color: #4F46E5;
	--accent-bg: rgba(79, 70, 229, 0.1);
	--blue-gradient: linear-gradient(45deg, #3B82F6, #2563EB);
	--purple-gradient: linear-gradient(45deg, #8B5CF6, #EF43CD);
	--pink-gradient: linear-gradient(45deg, #EC4899, #DB2777);
	--shadow-color: rgba(0, 0, 0, 0.1);
	}
	}
	</style>
	"""

	# Inline-styled description card: a "Purpose" heading with an update badge,
	# a summary paragraph, the "What We Evaluate" and "Key Results" grids, and
	# a closing hint about the filters.
	# The badge gradient reads --accent-blue / --accent-purple, which COMMON does
	# not declare, so both carry literal fallback colors here; without them the
	# whole `background` declaration is invalid whenever those variables are
	# not defined on the page, leaving white badge text with no background.
	DESCRIPTION_HTML = """
	<div style="
	background: var(--bg-secondary, rgba(30, 30, 45, 0.95));
	border-radius: 12px;
	padding: 24px;
	margin: 16px 0;
	">
	<div style="
	display: flex;
	flex-direction: column;
	gap: 16px;
	">
	<div style="
	color: var(--text-primary);
	font-size: 1.1rem;
	font-weight: 500;
	display: flex;
	align-items: center;
	gap: 8px;
	">
	🎯 Purpose
	<span style="
	background: linear-gradient(to right, var(--accent-blue, #60A5FA), var(--accent-purple, #A78BFA));
	color: white;
	padding: 4px 12px;
	border-radius: 100px;
	font-size: 0.9rem;
	">Latest Update: Feb 2025</span>
	</div>
	<p style="
	color: var(--text-secondary);
	margin: 0;
	line-height: 1.6;
	">
	This comprehensive benchmark evaluates language models' ability to effectively utilize tools and functions in complex scenarios.
	</p>

	<div style="
	color: var(--text-primary);
	font-size: 1.1rem;
	font-weight: 500;
	margin-top: 8px;
	">
	🔍 What We Evaluate
	</div>
	<div style="
	display: grid;
	grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
	gap: 16px;
	color: var(--text-secondary);
	">
	<div style="display: flex; gap: 8px; align-items: center;">
	🔄 Single/Multi-turn Interactions
	</div>
	<div style="display: flex; gap: 8px; align-items: center;">
	🧩 Function Composition
	</div>
	<div style="display: flex; gap: 8px; align-items: center;">
	⚡ Error Handling
	</div>
	</div>

	<div style="
	color: var(--text-primary);
	font-size: 1.1rem;
	font-weight: 500;
	margin-top: 8px;
	">
	📊 Key Results
	</div>
	<div style="
	display: grid;
	grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
	gap: 16px;
	color: var(--text-secondary);
	">
	<div style="display: flex; gap: 8px; align-items: center;">
	✅ Accuracy Performance
	</div>
	<div style="display: flex; gap: 8px; align-items: center;">
	💰 Open Vs Closed Source
	</div>
	<div style="display: flex; gap: 8px; align-items: center;">
	⚖️ Overall Effectiveness
	</div>
	</div>

	<div style="
	border-left: 4px solid var(--accent-color, #4F46E5);
	padding-left: 12px;
	margin-top: 8px;
	color: var(--text-secondary);
	font-style: italic;
	">
	💡 Use the filters below to explore different aspects of the evaluation and compare model performance across various dimensions.
	</div>
	</div>
	</div>
	"""


	# Hero header markup: the COMMON theme variables, followed by a <style>
	# block for the header / title / metric-card / action-button classes and the
	# HTML for the title, tagline and three action links (Blog, GitHub, Dataset).
	# The .metrics-grid and .metric-* rules declared here have no matching
	# elements in this string; the CARDS string below uses those class names.
	# NOTE(review): all three action links point to href="#" - they look like
	# placeholders; confirm the real URLs.
	HEADER_CONTENT = (
	COMMON
	+ """
	<style>

	.header-wrapper {
	background: var(--bg-primary);
	padding: 4rem 2rem;
	border-radius: 16px;
	margin-bottom: 0;
	transition: all 0.3s ease;
	}

	.header-content {
	max-width: 72rem;
	margin: 0 auto;
	}

	.title-section {
	text-align: center;
	margin-bottom: 4rem;
	}

	.title-gradient {
	font-size: 5rem;
	font-weight: 800;
	line-height: 1.1;
	background: var(--purple-gradient);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	margin-bottom: 0.5rem;
	}

	.subtitle-white {
	font-size: 5rem;
	font-weight: 800;
	line-height: 1.1;
	color: var(--text-primary);
	margin-bottom: 3rem;
	transition: color 0.3s ease;
	}

	.description {
	color: var(--text-secondary);
	font-size: 1.25rem;
	line-height: 1.75;
	max-width: 800px;
	margin: 0 auto;
	text-align: center;
	transition: color 0.3s ease;
	}

	.highlight-question {
	background: var(--blue-gradient);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	display: block;
	margin-top: 1rem;
	font-size: 1.5rem;
	font-weight: 500;
	}

	.metrics-grid {
	display: grid;
	grid-template-columns: repeat(3, 1fr);
	gap: 1.5rem;
	margin-top: 4rem;
	}

	.metric-card {
	background: var(--bg-secondary);
	border: 1px solid var(--border-color);
	border-radius: 1rem;
	padding: 2rem;
	transition: all 0.3s ease;
	align-items: center;
	}

	.metric-card:hover {
	transform: translateY(-5px);
	border-color: var(--border-hover);
	box-shadow: 0 4px 20px var(--shadow-color);
	}

	.metric-number {
	font-size: 4rem;
	font-weight: 800;
	margin-bottom: 1rem;
	}

	.metric-blue {
	background: var(--blue-gradient);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	}

	.metric-purple {
	background: var(--purple-gradient);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	}

	.metric-pink {
	background: var(--pink-gradient);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	}

	.metric-label {
	color: var(--text-secondary);
	font-size: 1.5rem;
	margin-bottom: 1.5rem;
	transition: color 0.3s ease;
	}

	.metric-detail {
	font-size: 1.125rem;
	line-height: 1.75;
	margin-top: 0.5rem;
	transition: color 0.3s ease;
	}

	.metric-detail.primary {
	color: var(--accent-color);
	}

	.metric-detail.secondary {
	color: var(--text-secondary);
	}

	.actions {
	display: flex;
	gap: 1rem;
	justify-content: center;
	margin-top: 3rem;
	}

	.action-button {
	display: flex;
	align-items: center;
	gap: 0.5rem;
	padding: 0.75rem 1.5rem;
	background: var(--bg-secondary);
	border: 1px solid var(--border-color);
	border-radius: 100px;
	color: var(--text-primary) !important;
	text-decoration: none !important;
	font-size: 0.95rem;
	transition: all 0.3s ease;
	}

	.action-button:hover {
	transform: translateY(-2px);
	border-color: var(--accent-color);
	background: var(--accent-bg);
	}

	@media (max-width: 768px) {
	.title-gradient, .subtitle-white {
	font-size: 3rem;
	}
	.metrics-grid {
	grid-template-columns: 1fr;
	}
	}
	</style>

	<div class="header-wrapper">
	<div class="header-content">
	<div class="title-section">
	<div class="subtitle-white">Welcome to the</div>
	<div class="title-gradient">Agent Leaderboard!</div>

	<div class="description">
	The landscape of AI agents is evolving rapidly, with major tech CEOs predicting 2025 as a pivotal year.
	We built this leaderboard to answer one simple question:
	<div class="highlight-question">
	"How do AI agents perform in real-world agentic scenarios?"
	</div>
	</div>
	</div>

	<div class="actions">
	<a href="#" class="action-button">
	<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
	<path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
	<line x1="8" y1="12" x2="16" y2="12"/>
	</svg>
	Blog
	</a>
	<a href="#" class="action-button">
	<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
	<path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
	</svg>
	GitHub
	</a>
	<a href="#" class="action-button">
	<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
	<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
	<polyline points="7 10 12 15 17 10"/>
	<line x1="12" y1="15" x2="12" y2="3"/>
	</svg>
	Dataset
	</a>
	</div>
	</div>
	</div>
	"""
	)

	# Three summary cards (total models, evaluation datasets, evaluation metric)
	# inside a .metrics-grid container. The class names used here are styled by
	# the <style> block embedded in HEADER_CONTENT.
	# NOTE(review): the counts (17 / 12 / 5 / 14) are static text in this string
	# and are not derived from the loaded data - update them by hand.
	CARDS = """ <div class="metrics-grid">
	<div class="metric-card">
	<div class="metric-number metric-blue">17</div>
	<div class="metric-label">Total Models</div>
	<div class="metric-detail primary">12 Private</div>
	<div class="metric-detail primary">5 Open Source</div>
	</div>

	<div class="metric-card">
	<div class="metric-number metric-purple">14</div>
	<div class="metric-label">Evaluation Datasets</div>
	<div class="metric-detail primary">Cross-Domain Testing</div>
	<div class="metric-detail primary">Real-world use cases</div>
	</div>

	<div class="metric-card">
	<div class="metric-number metric-pink">TSQ</div>
	<div class="metric-label">Evaluation Metric</div>
	<div class="metric-detail primary">Tool Selection Quality</div>
	<div class="metric-detail primary">GPT-4o Based Judge</div>
	</div>
	</div>"""

	# Methodology page: a <style> block (its own dark/light :root variables,
	# including --accent-blue / --accent-purple / --accent-pink, plus the
	# features-grid and dataset-table rules) followed by the overview text, the
	# TSQ metric list, the dataset structure table and three feature cards.
	#
	# Two markup defects of the earlier version are fixed here:
	#   * a bare "[Rest of the CSS remains the same]" placeholder line sat inside
	#     <style> as invalid CSS and has been removed;
	#   * <div class="methodology-section"> was never closed; the closing tag is
	#     now emitted at the end of the string.
	# NOTE(review): the remaining "[Previous CSS remains the same ...]" comment
	# suggests rules for .methodology-* and .feature-* classes were elided at
	# some point - none are declared in this string; confirm whether they
	# need to be restored.
	METHODOLOGY = """
	<style>
	@media (prefers-color-scheme: dark) {
	:root {
	--bg-primary: #0B0B19;
	--bg-secondary: rgba(19, 19, 37, 0.4);
	--bg-tertiary: rgba(30, 30, 45, 0.95);
	--text-primary: #ffffff;
	--text-secondary: #94A3B8;
	--text-tertiary: #E2E8F0;
	--border-primary: rgba(31, 41, 55, 0.5);
	--border-hover: rgba(79, 70, 229, 0.4);
	--accent-blue: #60A5FA;
	--accent-purple: #A78BFA;
	--accent-pink: #F472B6;
	--card-hover-bg: rgba(79, 70, 229, 0.1);
	--shadow-color: rgba(79, 70, 229, 0.1);
	}
	}

	@media (prefers-color-scheme: light) {
	:root {
	--bg-primary: #ffffff;
	--bg-secondary: rgba(243, 244, 246, 0.4);
	--bg-tertiary: rgba(249, 250, 251, 0.95);
	--text-primary: #111827;
	--text-secondary: #4B5563;
	--text-tertiary: #6B7280;
	--border-primary: rgba(209, 213, 219, 0.5);
	--border-hover: rgba(79, 70, 229, 0.4);
	--accent-blue: #3B82F6;
	--accent-purple: #8B5CF6;
	--accent-pink: #EC4899;
	--card-hover-bg: rgba(243, 244, 246, 0.8);
	--shadow-color: rgba(0, 0, 0, 0.1);
	}
	}

	/* [Previous CSS remains the same until features-grid] */

	/* Features Grid Section */
	.features-grid {
	display: grid;
	grid-template-columns: repeat(3, 1fr);
	gap: 1.5rem;
	width: 100%;
	padding: 2rem 0;
	}

	.dataset-table {
	width: 100%;
	border-collapse: separate;
	border-spacing: 0;
	margin: 2rem 0;
	background: var(--bg-tertiary);
	border-radius: 1rem;
	overflow: hidden;
	box-shadow: 0 4px 20px var(--shadow-color);
	}

	.dataset-table thead {
	background: linear-gradient(90deg, var(--accent-blue), var(--accent-purple));
	}

	.dataset-table th {
	padding: 1.25rem 1rem;
	text-align: left;
	color: white;
	font-weight: 600;
	font-size: 1rem;
	}

	.dataset-table td {
	padding: 1rem;
	border-bottom: 1px solid var(--border-primary);
	color: var(--text-secondary);
	transition: all 0.2s ease;
	}

	.dataset-table tbody tr:hover td {
	background: var(--card-hover-bg);
	color: var(--text-primary);
	}

	.dataset-table td[rowspan] {
	background: var(--bg-secondary);
	color: var(--accent-blue);
	font-weight: 600;
	border-right: 1px solid var(--border-primary);
	}

	.purpose-cell {
	max-width: 300px;
	line-height: 1.5;
	}

	.category-cell {
	color: var(--accent-purple);
	font-weight: 500;
	}

	.dataset-name {
	font-family: monospace;
	color: var(--accent-pink);
	font-size: 0.9rem;
	}

	</style>
	<!-- Methodology Section -->
	<div class="methodology-section">
	<h1 class="methodology-title">Methodology</h1>

	<h2 class="methodology-subtitle">Overview</h2>
	<p class="methodology-text">
	The Berkeley Function Calling Leaderboard (BFCL) evaluates language models' ability to effectively use tools
	and maintain coherent multi-turn conversations. Our evaluation focuses on both basic functionality and edge
	cases that challenge real-world applicability.
	</p>

	<h2 class="methodology-subtitle">Tool Selection Quality (TSQ) Metric</h2>
	<ul class="metric-list">
	<li>Correctly identify when tools are needed</li>
	<li>Select the appropriate tool for the task</li>
	<li>Handle cases where no suitable tool exists</li>
	<li>Maintain context across multiple interactions</li>
	<li>Consider cost-effectiveness of tool usage</li>
	<li>Optimize for minimal necessary tool calls</li>
	</ul>

	<h2 class="methodology-subtitle">Dataset Structure</h2>
	<div class="table-container">
	<table class="dataset-table">
	<thead>
	<tr>
	<th>Type</th>
	<th>Samples</th>
	<th>Category</th>
	<th>Dataset Name</th>
	<th>Purpose</th>
	</tr>
	</thead>
	<tbody>
	<tr>
	<td rowspan="4">Single-Turn</td>
	<td>100 + 100</td>
	<td class="category-cell">Single Function Call</td>
	<td class="dataset-name">xlam_single_tool_single_call</td>
	<td class="purpose-cell">Evaluates basic ability to read documentation and make single function calls</td>
	</tr>
	<tr>
	<td>200 + 50</td>
	<td class="category-cell">Multiple Function Call</td>
	<td class="dataset-name">xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call</td>
	<td class="purpose-cell">Tests parallel execution and result aggregation capabilities</td>
	</tr>
	<tr>
	<td>100</td>
	<td class="category-cell">Irrelevant Query</td>
	<td class="dataset-name">BFCL_v3_irrelevance</td>
	<td class="purpose-cell">Tests ability to recognize when available tools don't match user needs</td>
	</tr>
	<tr>
	<td>100</td>
	<td class="category-cell">Long Context</td>
	<td class="dataset-name">tau_long_context</td>
	<td class="purpose-cell">Assesses handling of extended interactions and complex instructions</td>
	</tr>
	<tr>
	<td rowspan="5">Multi-Turn</td>
	<td>50 + 30</td>
	<td class="category-cell">Single Function Call</td>
	<td class="dataset-name">BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call</td>
	<td class="purpose-cell">Tests basic conversational function calling abilities</td>
	</tr>
	<tr>
	<td>50</td>
	<td class="category-cell">Multiple Function Call</td>
	<td class="dataset-name">BFCL_v3_multi_turn_base_multi_func_call</td>
	<td class="purpose-cell">Evaluates handling of multiple function calls in conversation</td>
	</tr>
	<tr>
	<td>100</td>
	<td class="category-cell">Missing Function</td>
	<td class="dataset-name">BFCL_v3_multi_turn_miss_func</td>
	<td class="purpose-cell">Tests graceful handling of unavailable tools</td>
	</tr>
	<tr>
	<td>100</td>
	<td class="category-cell">Missing Parameters</td>
	<td class="dataset-name">BFCL_v3_multi_turn_miss_param</td>
	<td class="purpose-cell">Assesses parameter collection and handling incomplete information</td>
	</tr>
	<tr>
	<td>100</td>
	<td class="category-cell">Composite</td>
	<td class="dataset-name">BFCL_v3_multi_turn_composite</td>
	<td class="purpose-cell">Tests overall robustness in complex scenarios</td>
	</tr>
	</tbody>
	</table>
	</div>

	<!-- Features Grid Section -->
	<div class="features-grid">
	<div class="feature-card">
	<div class="feature-icon">
	<svg width="24" height="24" fill="none" stroke="var(--accent-blue)" stroke-width="2" viewBox="0 0 24 24">
	<path d="M22 12h-4l-3 9L9 3l-3 9H2"/>
	</svg>
	</div>
	<h3 class="feature-title">Make Better Decisions</h3>
	<ul class="feature-list">
	<li>Cost-effectiveness analysis</li>
	<li>Business impact metrics</li>
	<li>Vendor strategy insights</li>
	</ul>
	</div>

	<div class="feature-card">
	<div class="feature-icon">
	<svg width="24" height="24" fill="none" stroke="var(--accent-purple)" stroke-width="2" viewBox="0 0 24 24">
	<path d="M21 16V8a2 2 0 0 0-1-1.73l-7-4a2 2 0 0 0-2 0l-7 4A2 2 0 0 0 3 8v8a2 2 0 0 0 1 1.73l7 4a2 2 0 0 0 2 0l7-4A2 2 0 0 0 21 16z"/>
	</svg>
	</div>
	<h3 class="feature-title">360° Domain Evaluation</h3>
	<ul class="feature-list">
	<li>Cross-domain evaluation</li>
	<li>Real-world use cases</li>
	<li>Edge case evaluation</li>
	</ul>
	</div>

	<div class="feature-card">
	<div class="feature-icon">
	<svg width="24" height="24" fill="none" stroke="var(--accent-pink)" stroke-width="2" viewBox="0 0 24 24">
	<path d="M21 2v6h-6M3 12a9 9 0 0 1 15-6.7L21 8M3 12a9 9 0 0 0 15 6.7L21 16M21 22v-6h-6"/>
	</svg>
	</div>
	<h3 class="feature-title">Updated Periodically</h3>
	<ul class="feature-list">
	<li>12 private models evaluated</li>
	<li>5 open source models included</li>
	<li>Monthly model additions</li>
	</ul>
	</div>
	</div>
	</div>
	"""