Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Pratik Bhavsar
committed on
Commit
·
b9405c8
1
Parent(s):
0ef09d3
added more info
Browse files- app.py +1 -1
- data_loader.py +355 -24
- results.csv +2 -2
- tabs/leaderboard.py +1 -1
- tabs/model_comparison.py +1 -1
- visualization.py +7 -2
app.py
CHANGED
@@ -34,7 +34,7 @@ def create_app():
|
|
34 |
df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS
|
35 |
)
|
36 |
|
37 |
-
mc_info, mc_plot = create_model_comparison_tab(df, HEADER_CONTENT
|
38 |
|
39 |
# exp_outputs = create_exploration_tab(
|
40 |
# df, MODELS, DATASETS, SCORES, HEADER_CONTENT
|
|
|
34 |
df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS
|
35 |
)
|
36 |
|
37 |
+
mc_info, mc_plot = create_model_comparison_tab(df, HEADER_CONTENT)
|
38 |
|
39 |
# exp_outputs = create_exploration_tab(
|
40 |
# df, MODELS, DATASETS, SCORES, HEADER_CONTENT
|
data_loader.py
CHANGED
@@ -538,8 +538,8 @@ HEADER_CONTENT = (
|
|
538 |
<div class="header-wrapper">
|
539 |
<div class="header-content">
|
540 |
<div class="title-section">
|
541 |
-
|
542 |
-
<div class="title-gradient">Agent Leaderboard
|
543 |
|
544 |
<div class="description">
|
545 |
The landscape of AI agents is evolving rapidly, with major tech CEOs predicting 2025 as a pivotal year.
|
@@ -589,7 +589,7 @@ CARDS = """ <div class="metrics-grid">
|
|
589 |
<div class="metric-card">
|
590 |
<div class="metric-number metric-purple">14</div>
|
591 |
<div class="metric-label">Evaluation Datasets</div>
|
592 |
-
<div class="metric-detail primary">
|
593 |
<div class="metric-detail primary">Real-world use cases</div>
|
594 |
</div>
|
595 |
|
@@ -708,20 +708,20 @@ METHODOLOGY = """
|
|
708 |
font-size: 0.9rem;
|
709 |
}
|
710 |
|
711 |
-
|
712 |
-
|
713 |
-
|
714 |
-
|
715 |
-
|
|
|
716 |
|
717 |
-
|
718 |
-
|
719 |
-
|
720 |
-
|
721 |
-
|
722 |
-
|
723 |
|
724 |
-
<style>
|
725 |
.key-insights thead tr {
|
726 |
background: linear-gradient(90deg, #60A5FA, #818CF8);
|
727 |
}
|
@@ -739,10 +739,235 @@ METHODOLOGY = """
|
|
739 |
padding: 1rem;
|
740 |
border-bottom: 1px solid rgba(31, 41, 55, 0.5);
|
741 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
742 |
</style>
|
743 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
744 |
<div class="methodology-section">
|
745 |
-
<h1 class="
|
746 |
<div class="table-container">
|
747 |
<table class="dataset-table key-insights">
|
748 |
<thead>
|
@@ -754,11 +979,11 @@ METHODOLOGY = """
|
|
754 |
<tbody>
|
755 |
<tr>
|
756 |
<td>Performance Champion</td>
|
757 |
-
<td>Gemini-2.0-flash dominates with 0.
|
758 |
</tr>
|
759 |
<tr>
|
760 |
<td>Price-Performance Paradox</td>
|
761 |
-
<td>Top 3 models span
|
762 |
</tr>
|
763 |
<tr>
|
764 |
<td>Open Vs Closed Source</td>
|
@@ -770,7 +995,7 @@ METHODOLOGY = """
|
|
770 |
</tr>
|
771 |
<tr>
|
772 |
<td>Tool Miss Detection</td>
|
773 |
-
<td>
|
774 |
</tr>
|
775 |
<tr>
|
776 |
<td>Architecture Trade-offs</td>
|
@@ -780,7 +1005,7 @@ METHODOLOGY = """
|
|
780 |
</table>
|
781 |
</div>
|
782 |
|
783 |
-
<
|
784 |
<div class="table-container">
|
785 |
<table class="dataset-table key-insights">
|
786 |
<thead>
|
@@ -818,7 +1043,92 @@ METHODOLOGY = """
|
|
818 |
</table>
|
819 |
</div>
|
820 |
|
821 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
822 |
<div class="table-container">
|
823 |
<table class="dataset-table">
|
824 |
<thead>
|
@@ -832,10 +1142,10 @@ METHODOLOGY = """
|
|
832 |
</thead>
|
833 |
<tbody>
|
834 |
<tr>
|
835 |
-
<td rowspan="
|
836 |
<td>100 + 100</td>
|
837 |
<td class="category-cell">Single Function Call</td>
|
838 |
-
<td class="dataset-name">xlam_single_tool_single_call</td>
|
839 |
<td class="purpose-cell">Evaluates basic ability to read documentation and make single function calls</td>
|
840 |
</tr>
|
841 |
<tr>
|
@@ -856,6 +1166,12 @@ METHODOLOGY = """
|
|
856 |
<td class="dataset-name">tau_long_context</td>
|
857 |
<td class="purpose-cell">Assesses handling of extended interactions and complex instructions</td>
|
858 |
</tr>
|
|
|
|
|
|
|
|
|
|
|
|
|
859 |
<tr>
|
860 |
<td rowspan="5">Multi-Turn</td>
|
861 |
<td>50 + 30</td>
|
@@ -890,6 +1206,16 @@ METHODOLOGY = """
|
|
890 |
</tbody>
|
891 |
</table>
|
892 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
893 |
|
894 |
<!-- Features Grid Section -->
|
895 |
<div class="features-grid">
|
@@ -915,11 +1241,12 @@ METHODOLOGY = """
|
|
915 |
</div>
|
916 |
<h3 class="feature-title">360° Domain Evaluation</h3>
|
917 |
<ul class="feature-list">
|
918 |
-
<li>
|
919 |
<li>Real-world use cases</li>
|
920 |
<li>Edge case evaluation</li>
|
921 |
</ul>
|
922 |
</div>
|
|
|
923 |
|
924 |
<div class="feature-card">
|
925 |
<div class="feature-icon">
|
@@ -935,4 +1262,8 @@ METHODOLOGY = """
|
|
935 |
</ul>
|
936 |
</div>
|
937 |
|
|
|
|
|
|
|
|
|
938 |
"""
|
|
|
538 |
<div class="header-wrapper">
|
539 |
<div class="header-content">
|
540 |
<div class="title-section">
|
541 |
+
|
542 |
+
<div class="title-gradient">Agent Leaderboard</div>
|
543 |
|
544 |
<div class="description">
|
545 |
The landscape of AI agents is evolving rapidly, with major tech CEOs predicting 2025 as a pivotal year.
|
|
|
589 |
<div class="metric-card">
|
590 |
<div class="metric-number metric-purple">14</div>
|
591 |
<div class="metric-label">Evaluation Datasets</div>
|
592 |
+
<div class="metric-detail primary">Multi-Domain Testing</div>
|
593 |
<div class="metric-detail primary">Real-world use cases</div>
|
594 |
</div>
|
595 |
|
|
|
708 |
font-size: 0.9rem;
|
709 |
}
|
710 |
|
711 |
+
.code-intro {
|
712 |
+
color: var(--text-secondary);
|
713 |
+
font-size: 1.1rem;
|
714 |
+
margin-bottom: 1.5rem;
|
715 |
+
line-height: 1.6;
|
716 |
+
}
|
717 |
|
718 |
+
.section-divider {
|
719 |
+
margin: 1rem 0;
|
720 |
+
border-top: 1px solid var(--border-color);
|
721 |
+
opacity: 0.3;
|
722 |
+
}
|
723 |
+
|
724 |
|
|
|
725 |
.key-insights thead tr {
|
726 |
background: linear-gradient(90deg, #60A5FA, #818CF8);
|
727 |
}
|
|
|
739 |
padding: 1rem;
|
740 |
border-bottom: 1px solid rgba(31, 41, 55, 0.5);
|
741 |
}
|
742 |
+
|
743 |
+
.highlight {
|
744 |
+
color: var(--accent-blue);
|
745 |
+
font-weight: 600;
|
746 |
+
display: inline-flex;
|
747 |
+
align-items: center;
|
748 |
+
}
|
749 |
+
|
750 |
+
.highlight::after {
|
751 |
+
content: ":";
|
752 |
+
margin-right: 0.5rem; /* Adds space after the colon */
|
753 |
+
}
|
754 |
+
|
755 |
+
@media (prefers-color-scheme: dark) {
|
756 |
+
:root {
|
757 |
+
--bg-primary: #0B0B19;
|
758 |
+
--bg-secondary: rgba(19, 19, 37, 0.4);
|
759 |
+
--text-primary: #ffffff;
|
760 |
+
--text-secondary: #94A3B8;
|
761 |
+
--border-color: rgba(31, 41, 55, 0.5);
|
762 |
+
--accent-blue: #60A5FA;
|
763 |
+
--accent-purple: #A78BFA;
|
764 |
+
--code-bg: #1E1E2E;
|
765 |
+
--code-line-highlight: rgba(96, 165, 250, 0.1);
|
766 |
+
--bullet-color: #60A5FA;
|
767 |
+
--table-header: #1a1b1e;
|
768 |
+
--table-border: #2d2e32;
|
769 |
+
--table-hover: #2d2e32;
|
770 |
+
}
|
771 |
+
}
|
772 |
+
|
773 |
+
@media (prefers-color-scheme: light) {
|
774 |
+
:root {
|
775 |
+
--bg-primary: #ffffff;
|
776 |
+
--bg-secondary: rgba(243, 244, 246, 0.4);
|
777 |
+
--text-primary: #111827;
|
778 |
+
--text-secondary: #4B5563;
|
779 |
+
--border-color: rgba(209, 213, 219, 0.5);
|
780 |
+
--accent-blue: #3B82F6;
|
781 |
+
--accent-purple: #8B5CF6;
|
782 |
+
--code-bg: #F8FAFC;
|
783 |
+
--code-line-highlight: rgba(59, 130, 246, 0.1);
|
784 |
+
--bullet-color: #3B82F6;
|
785 |
+
--table-header: #F8FAFC;
|
786 |
+
--table-border: #E5E7EB;
|
787 |
+
--table-hover: #F3F4F6;
|
788 |
+
}
|
789 |
+
}
|
790 |
+
|
791 |
+
.methodology-content {
|
792 |
+
max-width: 1200px;
|
793 |
+
margin: 0 auto;
|
794 |
+
padding: 2rem;
|
795 |
+
color: var(--text-secondary);
|
796 |
+
line-height: 1.7;
|
797 |
+
font-size: 1rem;
|
798 |
+
}
|
799 |
+
|
800 |
+
.section-title {
|
801 |
+
font-size: 2.5rem;
|
802 |
+
font-weight: 700;
|
803 |
+
margin: 3rem 0 1.5rem;
|
804 |
+
color: var(--text-primary);
|
805 |
+
background: linear-gradient(to right, var(--accent-blue), var(--accent-purple));
|
806 |
+
-webkit-background-clip: text;
|
807 |
+
-webkit-text-fill-color: transparent;
|
808 |
+
letter-spacing: -0.02em;
|
809 |
+
}
|
810 |
+
|
811 |
+
.subsection-title {
|
812 |
+
font-size: 1.75rem;
|
813 |
+
font-weight: 600;
|
814 |
+
margin: 2rem 0 1rem;
|
815 |
+
color: var(--text-primary);
|
816 |
+
letter-spacing: -0.01em;
|
817 |
+
}
|
818 |
+
|
819 |
+
.content-block {
|
820 |
+
background: var(--bg-secondary);
|
821 |
+
border: 1px solid var(--border-color);
|
822 |
+
border-radius: 12px;
|
823 |
+
padding: 1.5rem;
|
824 |
+
margin: 1.5rem 0;
|
825 |
+
}
|
826 |
+
|
827 |
+
.methodology-list {
|
828 |
+
list-style: none !important; /* Force remove default bullets */
|
829 |
+
padding: 0;
|
830 |
+
margin: 1rem 0;
|
831 |
+
}
|
832 |
+
|
833 |
+
.methodology-list li {
|
834 |
+
padding-left: 2rem;
|
835 |
+
position: relative;
|
836 |
+
margin: 1rem 0;
|
837 |
+
color: var(--text-secondary);
|
838 |
+
display: flex; /* Add flex display */
|
839 |
+
align-items: flex-start; /* Align items to top */
|
840 |
+
}
|
841 |
+
|
842 |
+
.methodology-list li::before {
|
843 |
+
content: '';
|
844 |
+
position: absolute;
|
845 |
+
left: 0;
|
846 |
+
top: 0.75rem;
|
847 |
+
width: 8px;
|
848 |
+
height: 8px;
|
849 |
+
background: var(--bullet-color);
|
850 |
+
border-radius: 50%;
|
851 |
+
box-shadow: 0 0 0 2px rgba(96, 165, 250, 0.2);
|
852 |
+
flex-shrink: 0; /* Prevent bullet from shrinking */
|
853 |
+
}
|
854 |
+
|
855 |
+
/* Additional fix for nested list items */
|
856 |
+
.methodology-list li > * {
|
857 |
+
list-style: none !important;
|
858 |
+
margin-left: 0;
|
859 |
+
padding-left: 0;
|
860 |
+
}
|
861 |
+
|
862 |
+
.code-block {
|
863 |
+
background: var(--code-bg);
|
864 |
+
border-radius: 12px;
|
865 |
+
padding: 1.5rem;
|
866 |
+
margin: 1.5rem 0;
|
867 |
+
font-family: 'SF Mono', 'Menlo', monospace;
|
868 |
+
font-size: 0.9rem;
|
869 |
+
overflow-x: auto;
|
870 |
+
border: 1px solid var(--border-color);
|
871 |
+
}
|
872 |
+
|
873 |
+
.code-block pre {
|
874 |
+
margin: 0;
|
875 |
+
padding: 0;
|
876 |
+
}
|
877 |
+
|
878 |
+
.highlight {
|
879 |
+
color: var(--accent-blue);
|
880 |
+
font-weight: 600;
|
881 |
+
}
|
882 |
+
|
883 |
+
/* Dataset Table Styling */
|
884 |
+
.dataset-table {
|
885 |
+
width: 100%;
|
886 |
+
border-collapse: collapse;
|
887 |
+
margin: 1.5rem 0;
|
888 |
+
background: var(--bg-secondary);
|
889 |
+
border-radius: 12px;
|
890 |
+
overflow: hidden;
|
891 |
+
}
|
892 |
+
|
893 |
+
.dataset-table th {
|
894 |
+
background: var(--table-header);
|
895 |
+
padding: 1rem;
|
896 |
+
text-align: left;
|
897 |
+
font-weight: 600;
|
898 |
+
color: var(--text-primary);
|
899 |
+
border-bottom: 2px solid var(--table-border);
|
900 |
+
}
|
901 |
+
|
902 |
+
.dataset-table td {
|
903 |
+
padding: 1rem;
|
904 |
+
border-bottom: 1px solid var(--table-border);
|
905 |
+
color: var(--text-secondary);
|
906 |
+
}
|
907 |
+
|
908 |
+
.dataset-table tbody tr:hover {
|
909 |
+
background: var(--table-hover);
|
910 |
+
}
|
911 |
+
|
912 |
+
.dataset-table td:first-child {
|
913 |
+
font-weight: 500;
|
914 |
+
}
|
915 |
+
|
916 |
</style>
|
917 |
|
918 |
+
<!-- Methodology Section -->
|
919 |
+
<h1 class="section-title">Methodology</h1>
|
920 |
+
<p>Our evaluation process follows a systematic approach to ensure comprehensive and fair assessment of AI agents. We evaluate language models' ability to effectively use tools
|
921 |
+
in single and multi-turn conversations. Our evaluation focuses on both basic functionality and edge
|
922 |
+
cases that challenge real-world applicability.</p>
|
923 |
+
|
924 |
+
<ul class="methodology-list">
|
925 |
+
<li><span class="highlight">Model Selection</span>We begin by curating a diverse set of leading language models, including both proprietary and open-source implementations.</li>
|
926 |
+
<li><span class="highlight">Agent Configuration</span>Each model is configured as an agent using a standardized system prompt and given access to a consistent set of tools.</li>
|
927 |
+
<li><span class="highlight">Metric Definition</span> <a href="https://docs.galileo.ai/galileo/gen-ai-studio-products/galileo-guardrail-metrics/tool-selection-quality#tool-selection-quality">Tool Selection Quality (TSQ)</a></li>
|
928 |
+
<li><span class="highlight">Scoring System</span>The final performance score is calculated as an equally weighted average across all datasets.</li>
|
929 |
+
<li><span class="highlight">Dataset Curation</span>We strategically sampled from established benchmarking datasets. See later section for more info.</li>
|
930 |
+
<div class="methodology-section">
|
931 |
+
<div class="table-container">
|
932 |
+
<table class="dataset-table">
|
933 |
+
<thead>
|
934 |
+
<tr>
|
935 |
+
<th>Dataset</th>
|
936 |
+
<th>Domains</th>
|
937 |
+
<th>Link</th>
|
938 |
+
</tr>
|
939 |
+
</thead>
|
940 |
+
<tbody>
|
941 |
+
<tr>
|
942 |
+
<td>BFCL</td>
|
943 |
+
<td>Mathematics, Entertainment, Education, and Academic Domains</td>
|
944 |
+
<td><a href="https://gorilla.cs.berkeley.edu/leaderboard.html">View Dataset</a></td>
|
945 |
+
</tr>
|
946 |
+
<tr>
|
947 |
+
<td>τ-bench</td>
|
948 |
+
<td>Retail and Airline Industry Scenarios</td>
|
949 |
+
<td><a href="https://github.com/sierra-research/tau-bench">View Dataset</a></td>
|
950 |
+
</tr>
|
951 |
+
<tr>
|
952 |
+
<td>xLAM</td>
|
953 |
+
<td>Cross-domain Data Generation (21 Domains)</td>
|
954 |
+
<td><a href="https://www.salesforce.com/blog/xlam-large-action-models/">View Dataset</a></td>
|
955 |
+
</tr>
|
956 |
+
<tr>
|
957 |
+
<td>ToolACE</td>
|
958 |
+
<td>API Interactions across 390 Domains</td>
|
959 |
+
<td><a href="https://arxiv.org/abs/2409.00920">View Dataset</a></td>
|
960 |
+
</tr>
|
961 |
+
</tbody>
|
962 |
+
</table>
|
963 |
+
</div>
|
964 |
+
</div>
|
965 |
+
|
966 |
+
|
967 |
+
</ul>
|
968 |
+
|
969 |
<div class="methodology-section">
|
970 |
+
<h1 class="section-title">Key Insights</h1>
|
971 |
<div class="table-container">
|
972 |
<table class="dataset-table key-insights">
|
973 |
<thead>
|
|
|
979 |
<tbody>
|
980 |
<tr>
|
981 |
<td>Performance Champion</td>
|
982 |
+
<td>Gemini-2.0-flash dominates with 0.938 score at a very affordable cost, excelling in both complex tasks and safety features.</td>
|
983 |
</tr>
|
984 |
<tr>
|
985 |
<td>Price-Performance Paradox</td>
|
986 |
+
<td>Top 3 models span 10x price difference yet only 4% performance gap, challenging pricing assumptions</td>
|
987 |
</tr>
|
988 |
<tr>
|
989 |
<td>Open Vs Closed Source</td>
|
|
|
995 |
</tr>
|
996 |
<tr>
|
997 |
<td>Tool Miss Detection</td>
|
998 |
+
<td>Low dataset averages of 0.60 (tool_miss) and 0.73 (miss_func) reveal fundamental challenges in handling edge cases and maintaining context, even as models excel at basic tasks</td>
|
999 |
</tr>
|
1000 |
<tr>
|
1001 |
<td>Architecture Trade-offs</td>
|
|
|
1005 |
</table>
|
1006 |
</div>
|
1007 |
|
1008 |
+
<h1 class="section-title">Development Implications</h1>
|
1009 |
<div class="table-container">
|
1010 |
<table class="dataset-table key-insights">
|
1011 |
<thead>
|
|
|
1043 |
</table>
|
1044 |
</div>
|
1045 |
|
1046 |
+
|
1047 |
+
<div></div>
|
1048 |
+
<h1 class="section-title">How Do We Measure Agent's Performance?</h1>
|
1049 |
+
<div>
|
1050 |
+
<p>The complexity of tool calling extends far beyond simple API invocations. We developed the Tool Selection Quality metric to assess agents' tool call performance, evaluating tool selection accuracy and effectiveness of parameter usage.</p>
|
1051 |
+
|
1052 |
+
<div class="section-divider"></div>
|
1053 |
+
<h2 class="subsection-title">Scenario Recognition</h2>
|
1054 |
+
<div class="explanation-block">
|
1055 |
+
<p>When an agent encounters a query, it must first determine if tool usage is warranted. Information may already exist in the conversation history, making tool calls redundant. Alternatively, available tools might be insufficient or irrelevant to the task, requiring the agent to acknowledge limitations rather than force inappropriate tool usage.</p>
|
1056 |
+
</div>
|
1057 |
+
|
1058 |
+
<div class="section-divider"></div>
|
1059 |
+
<h2 class="subsection-title">Tool Selection Dynamics</h2>
|
1060 |
+
<div class="explanation-block">
|
1061 |
+
<p>Tool selection isn't binary—it involves both precision and recall. An agent might correctly identify one necessary tool while missing others (recall issue) or select appropriate tools alongside unnecessary ones (precision issue). While suboptimal, these scenarios represent different severity levels of selection errors.</p>
|
1062 |
+
</div>
|
1063 |
+
|
1064 |
+
<div class="section-divider"></div>
|
1065 |
+
<h2 class="subsection-title">Parameter Handling</h2>
|
1066 |
+
<div class="explanation-block">
|
1067 |
+
<p>Even with correct tool selection, argument handling introduces additional complexity. Agents must:</p>
|
1068 |
+
<ul class="methodology-list">
|
1069 |
+
<li>Provide all required parameters with correct naming</li>
|
1070 |
+
<li>Handle optional parameters appropriately</li>
|
1071 |
+
<li>Maintain parameter value accuracy</li>
|
1072 |
+
<li>Format arguments according to tool specifications</li>
|
1073 |
+
</ul>
|
1074 |
+
</div>
|
1075 |
+
|
1076 |
+
<div class="section-divider"></div>
|
1077 |
+
<h2 class="subsection-title">Sequential Decision Making</h2>
|
1078 |
+
<div class="explanation-block">
|
1079 |
+
<p>Multi-step tasks require agents to:</p>
|
1080 |
+
<ul class="methodology-list">
|
1081 |
+
<li>Determine optimal tool calling sequence</li>
|
1082 |
+
<li>Handle interdependencies between tool calls</li>
|
1083 |
+
<li>Maintain context across multiple operations</li>
|
1084 |
+
<li>Adapt to partial results or failures</li>
|
1085 |
+
</ul>
|
1086 |
+
</div>
|
1087 |
+
|
1088 |
+
<div class="section-divider"></div>
|
1089 |
+
<h2 class="subsection-title">Example code</h2>
|
1090 |
+
<p class="code-intro">Below is an example of the code used to evaluate an LLM on a dataset.</p>
|
1091 |
+
<div class="code-block">
|
1092 |
+
|
1093 |
+
<pre>
|
1094 |
+
import promptquality as pq
|
1095 |
+
|
1096 |
+
df = pd.read_parquet(file_path)
|
1097 |
+
|
1098 |
+
chainpoll_tool_selection_scorer = pq.CustomizedChainPollScorer(
|
1099 |
+
scorer_name=pq.CustomizedScorerName.tool_selection_quality,
|
1100 |
+
model_alias=pq.Models.gpt_4o,
|
1101 |
+
)
|
1102 |
+
|
1103 |
+
evaluate_handler = pq.GalileoPromptCallback(
|
1104 |
+
project_name=project_name,
|
1105 |
+
run_name=run_name,
|
1106 |
+
scorers=[chainpoll_tool_selection_scorer],
|
1107 |
+
)
|
1108 |
+
|
1109 |
+
llm = llm_handler.get_llm(model, temperature=0.0, max_tokens=4000)
|
1110 |
+
system_msg = {
|
1111 |
+
"role": "system",
|
1112 |
+
"content": 'Your job is to use the given tools to answer the query of human. If there is no relevant tool then reply with "I cannot answer the question with given tools". If tool is available but sufficient information is not available, then ask human to get the same. You can call as many tools as you want. Use multiple tools if needed. If the tools need to be called in a sequence then just call the first tool.',
|
1113 |
+
}
|
1114 |
+
|
1115 |
+
for row in df.itertuples():
|
1116 |
+
chain = llm.bind_tools(tools)
|
1117 |
+
outputs.append(
|
1118 |
+
chain.invoke(
|
1119 |
+
[system_msg, *row.conversation],
|
1120 |
+
config=dict(callbacks=[evaluate_handler])
|
1121 |
+
)
|
1122 |
+
)
|
1123 |
+
|
1124 |
+
evaluate_handler.finish()
|
1125 |
+
|
1126 |
+
</pre>
|
1127 |
+
</div>
|
1128 |
+
</div>
|
1129 |
+
|
1130 |
+
|
1131 |
+
<h1 class="section-title">Dataset Structure</h1>
|
1132 |
<div class="table-container">
|
1133 |
<table class="dataset-table">
|
1134 |
<thead>
|
|
|
1142 |
</thead>
|
1143 |
<tbody>
|
1144 |
<tr>
|
1145 |
+
<td rowspan="5">Single-Turn</td>
|
1146 |
<td>100 + 100</td>
|
1147 |
<td class="category-cell">Single Function Call</td>
|
1148 |
+
<td class="dataset-name">xlam_single_tool_single_call, xlam_multiple_tool_single_call</td>
|
1149 |
<td class="purpose-cell">Evaluates basic ability to read documentation and make single function calls</td>
|
1150 |
</tr>
|
1151 |
<tr>
|
|
|
1166 |
<td class="dataset-name">tau_long_context</td>
|
1167 |
<td class="purpose-cell">Assesses handling of extended interactions and complex instructions</td>
|
1168 |
</tr>
|
1169 |
+
<tr>
|
1170 |
+
<td>100</td>
|
1171 |
+
<td class="category-cell">Missing Function</td>
|
1172 |
+
<td class="dataset-name">xlam_tool_miss</td>
|
1173 |
+
<td class="purpose-cell">Tests graceful handling of unavailable tools</td>
|
1174 |
+
</tr>
|
1175 |
<tr>
|
1176 |
<td rowspan="5">Multi-Turn</td>
|
1177 |
<td>50 + 30</td>
|
|
|
1206 |
</tbody>
|
1207 |
</table>
|
1208 |
</div>
|
1209 |
+
|
1210 |
+
<div class="section-divider"></div>
|
1211 |
+
<h2 class="section-title">Citation</h2>
|
1212 |
+
<div class="bibtex-citation" style="font-family: monospace; white-space: pre; padding: 1em; background-color: rgba(128, 128, 128, 0.1); border: 1px solid rgba(128, 128, 128, 0.2); border-radius: 4px; color: currentColor;">@misc{agent-leaderboard,
|
1213 |
+
author = {Pratik Bhavsar},
|
1214 |
+
title = {Agent Leaderboard},
|
1215 |
+
year = {2025},
|
1216 |
+
publisher = {Galileo.ai},
|
1217 |
+
howpublished = {\\url{https://huggingface.co/spaces/galileo-ai/agent-leaderboard}}
|
1218 |
+
}</div>
|
1219 |
|
1220 |
<!-- Features Grid Section -->
|
1221 |
<div class="features-grid">
|
|
|
1241 |
</div>
|
1242 |
<h3 class="feature-title">360° Domain Evaluation</h3>
|
1243 |
<ul class="feature-list">
|
1244 |
+
<li>Multi-domain evaluation</li>
|
1245 |
<li>Real-world use cases</li>
|
1246 |
<li>Edge case evaluation</li>
|
1247 |
</ul>
|
1248 |
</div>
|
1249 |
+
|
1250 |
|
1251 |
<div class="feature-card">
|
1252 |
<div class="feature-icon">
|
|
|
1262 |
</ul>
|
1263 |
</div>
|
1264 |
|
1265 |
+
</div>
|
1266 |
+
|
1267 |
+
|
1268 |
+
|
1269 |
"""
|
results.csv
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
|
2 |
-
gemini-2.0-flash-
|
3 |
gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
|
4 |
gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
|
5 |
gemini-1.5-pro,Private,Normal,Google,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
|
@@ -16,4 +16,4 @@ mistral-small-2409,Private,Normal,Mistral,0.2,0.6,0.750,0.82,0.68,0.7,0.77,0.72,
|
|
16 |
ministral-8b-2410,Private,Normal,Mistral,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
|
17 |
Meta-Llama-3.1-8B-Instruct-Turbo,Open source,Normal,Meta,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
|
18 |
open-mistral-nemo-2407,Open source,Normal,Mistral,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
|
19 |
-
,,,,,,,0.83,0.79,0.81,0.78,0.76,0.
|
|
|
1 |
Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
|
2 |
+
gemini-2.0-flash-001,Private,Normal,Google,0.15,0.6,0.938,0.95,0.93,0.91,0.94,0.9,0.96,0.92,0.95,0.89,0.91,0.98,0.93,0.97,0.98,0.93,0.965
|
3 |
gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
|
4 |
gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
|
5 |
gemini-1.5-pro,Private,Normal,Google,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
|
|
|
16 |
ministral-8b-2410,Private,Normal,Mistral,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
|
17 |
Meta-Llama-3.1-8B-Instruct-Turbo,Open source,Normal,Meta,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
|
18 |
open-mistral-nemo-2407,Open source,Normal,Mistral,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
|
19 |
+
,,,,,,,0.83,0.79,0.81,0.78,0.76,0.88,0.80,0.96,0.60,0.81,0.82,0.81,0.92,0.85,0.73,0.80
|
tabs/leaderboard.py
CHANGED
@@ -126,7 +126,7 @@ def filter_leaderboard(df, model_type, category, sort_by):
|
|
126 |
<th>Type</th>
|
127 |
<th>Vendor</th>
|
128 |
<th>Cost (I/O)</th>
|
129 |
-
<th>Category Score</th>
|
130 |
</tr>
|
131 |
</thead>
|
132 |
<tbody>
|
|
|
126 |
<th>Type</th>
|
127 |
<th>Vendor</th>
|
128 |
<th>Cost (I/O)</th>
|
129 |
+
<th>Avg Category Score (TSQ)</th>
|
130 |
</tr>
|
131 |
</thead>
|
132 |
<tbody>
|
tabs/model_comparison.py
CHANGED
@@ -47,7 +47,7 @@ def compare_models(df, model_names=None):
|
|
47 |
return info_html, radar_chart
|
48 |
|
49 |
|
50 |
-
def create_model_comparison_tab(df, HEADER_CONTENT
|
51 |
with gr.Tab("Model Comparison"):
|
52 |
gr.HTML(HEADER_CONTENT)
|
53 |
with gr.Column():
|
|
|
47 |
return info_html, radar_chart
|
48 |
|
49 |
|
50 |
+
def create_model_comparison_tab(df, HEADER_CONTENT):
|
51 |
with gr.Tab("Model Comparison"):
|
52 |
gr.HTML(HEADER_CONTENT)
|
53 |
with gr.Column():
|
visualization.py
CHANGED
@@ -39,7 +39,12 @@ def get_performance_chart(df, category_name="Overall"):
|
|
39 |
fontweight="bold",
|
40 |
color=colors["text"],
|
41 |
)
|
42 |
-
ax.set_xlabel(
|
|
|
|
|
|
|
|
|
|
|
43 |
ax.set_xlim(0.0, 1.0)
|
44 |
|
45 |
ax.set_yticks(np.arange(len(df_sorted)))
|
@@ -222,7 +227,7 @@ def get_performance_cost_chart(df, category_name="Overall"):
|
|
222 |
|
223 |
ax.set_title(
|
224 |
f"Performance vs. Cost - {category_name}",
|
225 |
-
fontsize=
|
226 |
pad=15,
|
227 |
fontweight="bold",
|
228 |
color=colors["text"],
|
|
|
39 |
fontweight="bold",
|
40 |
color=colors["text"],
|
41 |
)
|
42 |
+
ax.set_xlabel(
|
43 |
+
"Average Score (Tool Selection Quality)",
|
44 |
+
fontsize=14,
|
45 |
+
labelpad=10,
|
46 |
+
color=colors["text"],
|
47 |
+
)
|
48 |
ax.set_xlim(0.0, 1.0)
|
49 |
|
50 |
ax.set_yticks(np.arange(len(df_sorted)))
|
|
|
227 |
|
228 |
ax.set_title(
|
229 |
f"Performance vs. Cost - {category_name}",
|
230 |
+
fontsize=14,
|
231 |
pad=15,
|
232 |
fontweight="bold",
|
233 |
color=colors["text"],
|