Spaces:

galileo-ai
/

agent-leaderboard

Running on CPU Upgrade

App Files Files Community

Pratik Bhavsar committed 17 days ago

Commit

b8ddec2

1 Parent(s): a64af65

improved looks

Browse files

Files changed (3) hide show

data_loader.py +452 -287
results.csv +1 -1
tabs/leaderboard.py +3 -3

data_loader.py CHANGED Viewed

@@ -40,48 +40,6 @@ CATEGORIES = {
     "Composite": ["BFCL_v3_multi_turn_composite"],
 }
-METHODOLOGY = """# Methodology
-                ## Overview
-                The Agent Leaderboard evaluates language models' ability to effectively use tools and maintain coherent multi-turn conversations.
-                The evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.
-                ## Tool Selection Quality Metric
-                Models are evaluated on their ability to:
-                - Correctly identify when tools are needed
-                - Select the appropriate tool for the task
-                - Handle cases where no suitable tool exists
-                - Maintain context across multiple interactions
-                ## Dataset Structure
-                | Type | Samples | Category | Dataset Name | Purpose |
-                |------|---------|-----------|--------------|----------|
-                | Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
-                | | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
-                | | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
-                | | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
-                | Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
-                | | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
-                | | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
-                | | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
-                | | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
-                """
-INSIGHTS = """
-                # Key Insights from Agent Leaderboard
-                | Category | Finding | Implications |
-                |----------|---------|--------------|
-                | Performance Leader | Gemini-2.0-flash dominates with excellent performance at a fraction of typical costs | Demonstrates that top-tier performance is achievable without premium pricing |
-                | Cost vs Performance | Top 3 models span a 200x price difference yet show only 6% performance gap | Challenges traditional pricing assumptions in the market and suggests potential overpricing at the high end |
-                | Open Source Models | Qwen-72b matches premium models in safety and context handling at lower cost | Signals growing maturity in open-source models and potential for broader adoption |
-                | Safety Features | While irrelevance detection is widely solved, tool miss detection remains a challenge | Highlights uneven development in safety features and areas needing focused improvement |
-                | Edge Case Handling | Models still struggle with maintaining context in complex scenarios | Indicates need for architectural improvements in handling sophisticated interactions |
-                | Architecture Impact | Models show clear trade-offs between context handling and parallel execution | Suggests need for specialized models or hybrid approaches for different use cases |
-                **Note:** Findings based on comprehensive evaluation across multiple tasks and scenarios.
-                """
 chat_css = """
 /* Container styles */
@@ -275,265 +233,48 @@ h3 {
 }
 """
-# Updated header and cards with theme awareness
-HEADER_CONTENT = """
 <style>
     @media (prefers-color-scheme: dark) {
         :root {
-            --bg-primary: rgb(17, 17, 27);
-            --bg-secondary: rgba(30, 30, 45, 0.95);
-            --bg-hover: rgba(40, 40, 55, 0.95);
             --text-primary: #ffffff;
-            --text-secondary: #94a3b8;
             --text-tertiary: #e2e8f0;
-            --border-color: rgba(255, 255, 255, 0.1);
-            --border-hover: rgba(255, 255, 255, 0.2);
-            --card-bg: rgba(17, 17, 27, 0.6);
-            --accent-color: #4F46E5;
             --accent-bg: rgba(79, 70, 229, 0.1);
         }
     }
     @media (prefers-color-scheme: light) {
         :root {
-            --bg-primary: rgb(255, 255, 255);
-            --bg-secondary: rgba(243, 244, 246, 0.95);
             --bg-hover: rgba(229, 231, 235, 0.95);
-            --text-primary: #000000;
-            --text-secondary: #4b5563;
-            --text-tertiary: #1f2937;
-            --border-color: rgba(0, 0, 0, 0.1);
-            --border-hover: rgba(0, 0, 0, 0.2);
-            --card-bg: rgba(249, 250, 251, 0.6);
             --accent-color: #4F46E5;
             --accent-bg: rgba(79, 70, 229, 0.1);
         }
     }
-    .header-wrapper {
-        padding: 3rem 2rem;
-        background: var(--bg-primary);
-        border-radius: 16px;
-        display: flex;
-        flex-direction: column;
-        align-items: center;
-        text-align: center;
-    }
-    .header-wrapper a {
-        color: var(--text-primary) !important;
-        text-decoration: none !important;
-    }
-    .description {
-        color: var(--text-primary);
-        font-size: 1.1rem;
-        line-height: 1.6;
-        max-width: 800px;
-        margin: 0 auto 2rem;
-        text-align: center;
-    }
-    .actions {
-        display: flex;
-        gap: 1rem;
-        justify-content: center;
-        margin-bottom: 2rem;
-        color: var(--text-primary);
-    }
-    .action-button {
-        display: flex;
-        align-items: center;
-        gap: 0.5rem;
-        padding: 0.75rem 1.5rem;
-        background: var(--bg-secondary);
-        border: 1px solid var(--border-color);
-        border-radius: 100px;
-        color: var(--text-primary) !important;
-        text-decoration: none !important;
-        font-size: 0.95rem;
-        transition: all 0.2s ease;
-    }
-    .action-button:hover {
-        background: var(--bg-hover);
-        border-color: var(--border-hover);
-        color: var(--text-primary) !important;
-    }
-    .update-info {
-        color: var(--text-secondary);
-        font-size: 0.9rem;
-        margin-bottom: 3rem;
-    }
-    .features-grid {
-        display: grid;
-        grid-template-columns: repeat(3, 1fr);
-        gap: 1.5rem;
-        width: 100%;
-        max-width: 1200px;
-    }
-    .feature-card {
-        background: var(--card-bg);
-        border: 1px solid var(--border-color);
-        border-radius: 16px;
-        padding: 2rem;
-        text-align: left;
-    }
-    .feature-icon {
-        background: var(--accent-bg);
-        width: 40px;
-        height: 40px;
-        border-radius: 12px;
-        display: flex;
-        align-items: center;
-        justify-content: center;
-        margin-bottom: 1.5rem;
-    }
-    .feature-title {
-        color: var(--text-primary);
-        font-size: 1.25rem;
-        font-weight: 600;
-        margin-bottom: 1rem;
-    }
-    .feature-description {
-        color: var(--text-secondary);
-        font-size: 0.95rem;
-        margin-bottom: 1.5rem;
-    }
-    .feature-list {
-        list-style: none;
-        padding: 0;
-        margin: 0;
-        display: flex;
-        flex-direction: column;
-        gap: 0.75rem;
-    }
-    .feature-list li {
-        color: var(--text-tertiary);
-        font-size: 0.95rem;
-        display: flex;
-        align-items: center;
-        gap: 0.5rem;
-    }
-    .feature-list li::before {
-        content: '';
-        width: 6px;
-        height: 6px;
-        background: var(--accent-color);
-        border-radius: 50%;
-        flex-shrink: 0;
-    }
-    /* Force all links to match theme */
-    .header-wrapper a:link,
-    .header-wrapper a:visited,
-    .header-wrapper a:hover,
-    .header-wrapper a:active {
-        color: var(--text-primary) !important;
-    }
-    /* Title specific styles */
-    .main-title {
-        color: var(--text-primary);
-        font-size: 48px;
-        font-weight: 700;
-        margin: 40px 0;
-        text-align: center;
-    }
-    .subtitle {
-        color: var(--text-secondary);
-        margin-bottom: 2rem;
-    }
 </style>
-<div class="header-wrapper">
-    <h1 class="main-title">Agent Leaderboard</h1>
-    <h2 class="subtitle">Comprehensive multi-benchmark evaluation for tool calling</h2>
-    <div class="actions">
-        <a href="#" class="action-button">
-            <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
-                <path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
-                <line x1="8" y1="12" x2="16" y2="12"/>
-            </svg>
-            Blog
-        </a>
-        <a href="#" class="action-button">
-            <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
-                <path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
-            </svg>
-            GitHub
-        </a>
-        <a href="#" class="action-button">
-            <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
-                <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
-                <polyline points="7 10 12 15 17 10"/>
-                <line x1="12" y1="15" x2="12" y2="3"/>
-            </svg>
-            Dataset
-        </a>
-    </div>
-"""
-CARDS = """
-    <div class="features-grid">
-        <div class="feature-card">
-            <div class="feature-icon">
-                <svg width="24" height="24" fill="none" stroke="var(--accent-color)" stroke-width="2" viewBox="0 0 24 24">
-                    <path d="M22 12h-4l-3 9L9 3l-3 9H2"/>
-                </svg>
-            </div>
-            <h3 class="feature-title">Make Better Decisions</h3>
-            <ul class="feature-list">
-                <li>Cost-effectiveness analysis</li>
-                <li>Business impact metrics</li>
-                <li>Vendor strategy insights</li>
-            </ul>
-        </div>
-        <div class="feature-card">
-            <div class="feature-icon">
-                <svg width="24" height="24" fill="none" stroke="var(--accent-color)" stroke-width="2" viewBox="0 0 24 24">
-                    <path d="M21 16V8a2 2 0 0 0-1-1.73l-7-4a2 2 0 0 0-2 0l-7 4A2 2 0 0 0 3 8v8a2 2 0 0 0 1 1.73l7 4a2 2 0 0 0 2 0l7-4A2 2 0 0 0 21 16z"/>
-                </svg>
-            </div>
-            <h3 class="feature-title">360° Domain Evaluation</h3>
-            <ul class="feature-list">
-                <li>Cross-domain evaluation</li>
-                <li>Real-world use cases</li>
-                <li>Edge case evaluation</li>
-            </ul>
-        </div>
-        <div class="feature-card">
-            <div class="feature-icon">
-                <svg width="24" height="24" fill="none" stroke="var(--accent-color)" stroke-width="2" viewBox="0 0 24 24">
-                    <path d="M21 2v6h-6M3 12a9 9 0 0 1 15-6.7L21 8M3 12a9 9 0 0 0 15 6.7L21 16M21 22v-6h-6"/>
-                </svg>
-            </div>
-            <h3 class="feature-title">Updated Periodically</h3>
-            <ul class="feature-list">
-                <li>11 private models evaluated</li>
-                <li>5 open source models included</li>
-                <li>Monthly model additions</li>
-            </ul>
-        </div>
-    </div>
-</div>
 """
 DESCRIPTION_HTML = """
@@ -558,7 +299,7 @@ DESCRIPTION_HTML = """
         ">
             🎯 Purpose
             <span style="
-                background: var(--accent-color, #4F46E5);
                 color: white;
                 padding: 4px 12px;
                 border-radius: 100px;
@@ -570,8 +311,7 @@ DESCRIPTION_HTML = """
             margin: 0;
             line-height: 1.6;
         ">
-            Welcome to the AI Agent Tool Calling Leaderboard! This comprehensive benchmark evaluates
-            language models' ability to effectively utilize tools and functions in complex scenarios.
         </p>
         <div style="
@@ -636,3 +376,428 @@ DESCRIPTION_HTML = """
     </div>
 </div>
 """

     "Composite": ["BFCL_v3_multi_turn_composite"],
 }
 chat_css = """
 /* Container styles */
 }
 """
+COMMON = """
 <style>
     @media (prefers-color-scheme: dark) {
         :root {
+            --bg-primary: #0B0B19;
+            --bg-secondary: rgba(19, 19, 37, 0.4);
+            --bg-hover: rgba(30, 30, 45, 0.95);
             --text-primary: #ffffff;
+            --text-secondary: #e2e8f0;
             --text-tertiary: #e2e8f0;
+            --border-color: rgba(31, 41, 55, 0.5);
+            --border-hover: rgba(79, 70, 229, 0.4);
+            --card-bg: rgba(17, 17, 27, 0.4);
+            --accent-color: #ffffff;
             --accent-bg: rgba(79, 70, 229, 0.1);
+            --blue-gradient: linear-gradient(45deg, #60A5FA, #3B82F6);
+            --purple-gradient: linear-gradient(45deg, #A78BFA, #8B5CF6);
+            --pink-gradient: linear-gradient(45deg, #F472B6, #EC4899);
+            --shadow-color: rgba(0, 0, 0, 0.2);
         }
     }
     @media (prefers-color-scheme: light) {
         :root {
+            --bg-primary: #ffffff;
+            --bg-secondary: rgba(243, 244, 246, 0.4);
             --bg-hover: rgba(229, 231, 235, 0.95);
+            --text-primary: #1F2937;
+            --text-secondary: #4B5563;
+            --text-tertiary: #6B7280;
+            --border-color: rgba(209, 213, 219, 0.5);
+            --border-hover: rgba(79, 70, 229, 0.4);
+            --card-bg: rgba(249, 250, 251, 0.4);
             --accent-color: #4F46E5;
             --accent-bg: rgba(79, 70, 229, 0.1);
+            --blue-gradient: linear-gradient(45deg, #3B82F6, #2563EB);
+            --purple-gradient: linear-gradient(45deg, #8B5CF6, #EF43CD);
+            --pink-gradient: linear-gradient(45deg, #EC4899, #DB2777);
+            --shadow-color: rgba(0, 0, 0, 0.1);
         }
     }
 </style>
 """
 DESCRIPTION_HTML = """
         ">
             🎯 Purpose
             <span style="
+                background: linear-gradient(to right, var(--accent-blue), var(--accent-purple));
                 color: white;
                 padding: 4px 12px;
                 border-radius: 100px;
             margin: 0;
             line-height: 1.6;
         ">
+            This comprehensive benchmark evaluates language models' ability to effectively utilize tools and functions in complex scenarios.
         </p>
         <div style="
     </div>
 </div>
 """
+HEADER_CONTENT = (
+    COMMON
+    + """
+<style>
+    .header-wrapper {
+        background: var(--bg-primary);
+        padding: 4rem 2rem;
+        border-radius: 16px;
+        margin-bottom: 0;
+        transition: all 0.3s ease;
+    }
+    .header-content {
+        max-width: 72rem;
+        margin: 0 auto;
+    }
+    .title-section {
+        text-align: center;
+        margin-bottom: 4rem;
+    }
+    .title-gradient {
+        font-size: 5rem;
+        font-weight: 800;
+        line-height: 1.1;
+        background: var(--purple-gradient);
+        -webkit-background-clip: text;
+        -webkit-text-fill-color: transparent;
+        margin-bottom: 0.5rem;
+    }
+    .subtitle-white {
+        font-size: 5rem;
+        font-weight: 800;
+        line-height: 1.1;
+        color: var(--text-primary);
+        margin-bottom: 3rem;
+        transition: color 0.3s ease;
+    }
+    .description {
+        color: var(--text-secondary);
+        font-size: 1.25rem;
+        line-height: 1.75;
+        max-width: 800px;
+        margin: 0 auto;
+        text-align: center;
+        transition: color 0.3s ease;
+    }
+    .highlight-question {
+        background: var(--blue-gradient);
+        -webkit-background-clip: text;
+        -webkit-text-fill-color: transparent;
+        display: block;
+        margin-top: 1rem;
+        font-size: 1.5rem;
+        font-weight: 500;
+    }
+    .metrics-grid {
+        display: grid;
+        grid-template-columns: repeat(3, 1fr);
+        gap: 1.5rem;
+        margin-top: 4rem;
+    }
+    .metric-card {
+        background: var(--bg-secondary);
+        border: 1px solid var(--border-color);
+        border-radius: 1rem;
+        padding: 2rem;
+        transition: all 0.3s ease;
+    }
+    .metric-card:hover {
+        transform: translateY(-5px);
+        border-color: var(--border-hover);
+        box-shadow: 0 4px 20px var(--shadow-color);
+    }
+    .metric-number {
+        font-size: 4rem;
+        font-weight: 800;
+        margin-bottom: 1rem;
+    }
+    .metric-blue {
+        background: var(--blue-gradient);
+        -webkit-background-clip: text;
+        -webkit-text-fill-color: transparent;
+    }
+    .metric-purple {
+        background: var(--purple-gradient);
+        -webkit-background-clip: text;
+        -webkit-text-fill-color: transparent;
+    }
+    .metric-pink {
+        background: var(--pink-gradient);
+        -webkit-background-clip: text;
+        -webkit-text-fill-color: transparent;
+    }
+    .metric-label {
+        color: var(--text-secondary);
+        font-size: 1.5rem;
+        margin-bottom: 1.5rem;
+        transition: color 0.3s ease;
+    }
+    .metric-detail {
+        font-size: 1.125rem;
+        line-height: 1.75;
+        margin-top: 0.5rem;
+        transition: color 0.3s ease;
+    }
+    .metric-detail.primary {
+        color: var(--accent-color);
+    }
+    .metric-detail.secondary {
+        color: var(--text-secondary);
+    }
+    .actions {
+        display: flex;
+        gap: 1rem;
+        justify-content: center;
+        margin-top: 3rem;
+    }
+    .action-button {
+        display: flex;
+        align-items: center;
+        gap: 0.5rem;
+        padding: 0.75rem 1.5rem;
+        background: var(--bg-secondary);
+        border: 1px solid var(--border-color);
+        border-radius: 100px;
+        color: var(--text-primary) !important;
+        text-decoration: none !important;
+        font-size: 0.95rem;
+        transition: all 0.3s ease;
+    }
+    .action-button:hover {
+        transform: translateY(-2px);
+        border-color: var(--accent-color);
+        background: var(--accent-bg);
+    }
+    @media (max-width: 768px) {
+        .title-gradient, .subtitle-white {
+            font-size: 3rem;
+        }
+        .metrics-grid {
+            grid-template-columns: 1fr;
+        }
+    }
+</style>
+<div class="header-wrapper">
+    <div class="header-content">
+        <div class="title-section">
+            <div class="subtitle-white">Welcome to the</div>
+            <div class="title-gradient">Agent Leaderboard!</div>
+            <div class="description">
+                The landscape of AI agents is evolving rapidly, with major tech CEOs predicting 2025 as a pivotal year.
+                We built this leaderboard to answer one simple question:
+                <div class="highlight-question">
+                    "How do AI agents perform in real-world agentic scenarios?"
+                </div>
+            </div>
+        </div>
+        <div class="actions">
+            <a href="#" class="action-button">
+                <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+                    <path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
+                    <line x1="8" y1="12" x2="16" y2="12"/>
+                </svg>
+                Blog
+            </a>
+            <a href="#" class="action-button">
+                <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+                    <path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
+                </svg>
+                GitHub
+            </a>
+            <a href="#" class="action-button">
+                <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+                    <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
+                    <polyline points="7 10 12 15 17 10"/>
+                    <line x1="12" y1="15" x2="12" y2="3"/>
+                </svg>
+                Dataset
+            </a>
+        </div>
+    </div>
+</div>
+"""
+)
+CARDS = """        <div class="metrics-grid">
+            <div class="metric-card">
+                <div class="metric-number metric-blue">17</div>
+                <div class="metric-label">Total Models</div>
+                <div class="metric-detail primary">12 Private</div>
+                <div class="metric-detail primary">5 Open Source</div>
+            </div>
+            <div class="metric-card">
+                <div class="metric-number metric-purple">14</div>
+                <div class="metric-label">Evaluation Datasets</div>
+                <div class="metric-detail primary">Cross-Domain Testing</div>
+                <div class="metric-detail primary">Real-world use cases</div>
+            </div>
+            <div class="metric-card">
+                <div class="metric-number metric-pink">TSQ</div>
+                <div class="metric-label">Evaluation Metric</div>
+                <div class="metric-detail primary">Tool Selection Quality</div>
+                <div class="metric-detail primary">GPT-4o Based Judge</div>
+            </div>
+        </div>"""
+METHODOLOGY = """
+<style>
+    @media (prefers-color-scheme: dark) {
+        :root {
+            --bg-primary: #0B0B19;
+            --bg-secondary: rgba(19, 19, 37, 0.4);
+            --bg-tertiary: rgba(30, 30, 45, 0.95);
+            --text-primary: #ffffff;
+            --text-secondary: #94A3B8;
+            --text-tertiary: #E2E8F0;
+            --border-primary: rgba(31, 41, 55, 0.5);
+            --border-hover: rgba(79, 70, 229, 0.4);
+            --accent-blue: #60A5FA;
+            --accent-purple: #A78BFA;
+            --accent-pink: #F472B6;
+            --card-hover-bg: rgba(79, 70, 229, 0.1);
+            --shadow-color: rgba(79, 70, 229, 0.1);
+        }
+    }
+    @media (prefers-color-scheme: light) {
+        :root {
+            --bg-primary: #ffffff;
+            --bg-secondary: rgba(243, 244, 246, 0.4);
+            --bg-tertiary: rgba(249, 250, 251, 0.95);
+            --text-primary: #111827;
+            --text-secondary: #4B5563;
+            --text-tertiary: #6B7280;
+            --border-primary: rgba(209, 213, 219, 0.5);
+            --border-hover: rgba(79, 70, 229, 0.4);
+            --accent-blue: #3B82F6;
+            --accent-purple: #8B5CF6;
+            --accent-pink: #EC4899;
+            --card-hover-bg: rgba(243, 244, 246, 0.8);
+            --shadow-color: rgba(0, 0, 0, 0.1);
+        }
+    }
+    /* [Previous CSS remains the same until features-grid] */
+    /* Features Grid Section */
+    .features-grid {
+        display: grid;
+        grid-template-columns: repeat(3, 1fr);
+        gap: 1.5rem;
+        width: 100%;
+        padding: 2rem 0;
+    }
+    /* [Rest of the CSS remains the same] */
+</style>
+<!-- Methodology Section -->
+<div class="methodology-section">
+    <h1 class="methodology-title">Methodology</h1>
+    <h2 class="methodology-subtitle">Overview</h2>
+    <p class="methodology-text">
+        The Agent Leaderboard evaluates language models' ability to effectively use tools
+        and maintain coherent multi-turn conversations. Our evaluation focuses on both basic functionality and edge
+        cases that challenge real-world applicability.
+    </p>
+    <h2 class="methodology-subtitle">Tool Selection Quality (TSQ) Metric</h2>
+    <ul class="metric-list">
+        <li>Correctly identify when tools are needed</li>
+        <li>Select the appropriate tool for the task</li>
+        <li>Handle cases where no suitable tool exists</li>
+        <li>Maintain context across multiple interactions</li>
+        <li>Consider cost-effectiveness of tool usage</li>
+        <li>Optimize for minimal necessary tool calls</li>
+    </ul>
+    <h2 class="methodology-subtitle">Dataset Structure</h2>
+    <div class="table-container">
+        <table class="dataset-table">
+            <thead>
+                <tr>
+                    <th>Type</th>
+                    <th>Samples</th>
+                    <th>Category</th>
+                    <th>Dataset Name</th>
+                    <th>Purpose</th>
+                </tr>
+            </thead>
+            <tbody>
+                <tr>
+                    <td rowspan="4">Single-Turn</td>
+                    <td>200</td>
+                    <td>Single Function Call</td>
+                    <td>xlam_single_tool_single_call</td>
+                    <td>Basic ability to read documentation and make single function calls</td>
+                </tr>
+                <tr>
+                    <td>250</td>
+                    <td>Multiple Function Call</td>
+                    <td>xlam_multiple_tool_multiple_call</td>
+                    <td>Parallel execution and result aggregation capabilities</td>
+                </tr>
+                <tr>
+                    <td>100</td>
+                    <td>Irrelevant Query</td>
+                    <td>BFCL_v3_irrelevance</td>
+                    <td>Recognition of tool mismatches with user needs</td>
+                </tr>
+                <tr>
+                    <td>100</td>
+                    <td>Long Context</td>
+                    <td>tau_long_context</td>
+                    <td>Extended interactions and complex instructions</td>
+                </tr>
+                <tr>
+                    <td rowspan="5">Multi-Turn</td>
+                    <td>80</td>
+                    <td>Single Function Call</td>
+                    <td>BFCL_v3_multi_turn_base_single_func_call</td>
+                    <td>Conversational function calling abilities</td>
+                </tr>
+                <tr>
+                    <td>50</td>
+                    <td>Multiple Function Call</td>
+                    <td>BFCL_v3_multi_turn_base_multi_func_call</td>
+                    <td>Multiple function calls in conversation</td>
+                </tr>
+                <tr>
+                    <td>100</td>
+                    <td>Missing Function</td>
+                    <td>BFCL_v3_multi_turn_miss_func</td>
+                    <td>Graceful handling of unavailable tools</td>
+                </tr>
+                <tr>
+                    <td>100</td>
+                    <td>Missing Parameters</td>
+                    <td>BFCL_v3_multi_turn_miss_param</td>
+                    <td>Parameter collection and incomplete information</td>
+                </tr>
+                <tr>
+                    <td>100</td>
+                    <td>Composite</td>
+                    <td>BFCL_v3_multi_turn_composite</td>
+                    <td>Overall robustness in complex scenarios</td>
+                </tr>
+            </tbody>
+        </table>
+    </div>
+</div>
+<!-- Features Grid Section -->
+<div class="features-grid">
+    <div class="feature-card">
+        <div class="feature-icon">
+            <svg width="24" height="24" fill="none" stroke="var(--accent-blue)" stroke-width="2" viewBox="0 0 24 24">
+                <path d="M22 12h-4l-3 9L9 3l-3 9H2"/>
+            </svg>
+        </div>
+        <h3 class="feature-title">Make Better Decisions</h3>
+        <ul class="feature-list">
+            <li>Cost-effectiveness analysis</li>
+            <li>Business impact metrics</li>
+            <li>Vendor strategy insights</li>
+        </ul>
+    </div>
+    <div class="feature-card">
+        <div class="feature-icon">
+            <svg width="24" height="24" fill="none" stroke="var(--accent-purple)" stroke-width="2" viewBox="0 0 24 24">
+                <path d="M21 16V8a2 2 0 0 0-1-1.73l-7-4a2 2 0 0 0-2 0l-7 4A2 2 0 0 0 3 8v8a2 2 0 0 0 1 1.73l7 4a2 2 0 0 0 2 0l7-4A2 2 0 0 0 21 16z"/>
+            </svg>
+        </div>
+        <h3 class="feature-title">360° Domain Evaluation</h3>
+        <ul class="feature-list">
+            <li>Cross-domain evaluation</li>
+            <li>Real-world use cases</li>
+            <li>Edge case evaluation</li>
+        </ul>
+    </div>
+    <div class="feature-card">
+        <div class="feature-icon">
+            <svg width="24" height="24" fill="none" stroke="var(--accent-pink)" stroke-width="2" viewBox="0 0 24 24">
+                <path d="M21 2v6h-6M3 12a9 9 0 0 1 15-6.7L21 8M3 12a9 9 0 0 0 15 6.7L21 16M21 22v-6h-6"/>
+            </svg>
+        </div>
+        <h3 class="feature-title">Updated Periodically</h3>
+        <ul class="feature-list">
+            <li>12 private models evaluated</li>
+            <li>5 open source models included</li>
+            <li>Monthly model additions</li>
+        </ul>
+    </div>
+</div>
+"""

results.csv CHANGED Viewed

@@ -1,5 +1,5 @@
 Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
-gemini-2.0-flash-exp,Private,Normal,Google,0.075,0.3,0.935,0.94,0.93,0.86,0.95,0.9,0.99,0.95,0.94,0.83,0.91,0.98,0.96,0.98,0.98,0.88,0.975
 gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
 gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
 gemini-1.5-pro,Private,Normal,Google,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925

 Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
+gemini-2.0-flash-exp,Private,Normal,Google,0.1,0.4,0.935,0.94,0.93,0.86,0.95,0.9,0.99,0.95,0.94,0.83,0.91,0.98,0.96,0.98,0.98,0.88,0.975
 gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
 gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
 gemini-1.5-pro,Private,Normal,Google,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925

tabs/leaderboard.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import gradio as gr
-from data_loader import CATEGORIES, DESCRIPTION_HTML
 from visualization import (
     get_performance_chart,
     get_performance_cost_chart,
@@ -9,7 +9,6 @@ from utils import (
     get_rank_badge,
     get_score_bar,
     get_type_badge,
-    get_output_type_badge,
 )
 def filter_leaderboard(df, model_type, category, sort_by):
@@ -181,7 +180,8 @@ def create_leaderboard_tab(df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS):
         output = gr.HTML()
         plot1 = gr.Plot()
         plot2 = gr.Plot()
-        gr.Markdown(METHODOLOGY)
         for input_comp in [model_type, category, sort_by]:
             input_comp.change(

 import gradio as gr
+from data_loader import CATEGORIES, DESCRIPTION_HTML, CARDS
 from visualization import (
     get_performance_chart,
     get_performance_cost_chart,
     get_rank_badge,
     get_score_bar,
     get_type_badge,
 )
 def filter_leaderboard(df, model_type, category, sort_by):
         output = gr.HTML()
         plot1 = gr.Plot()
         plot2 = gr.Plot()
+        gr.HTML(METHODOLOGY)
         for input_comp in [model_type, category, sort_by]:
             input_comp.change(