Pratik Bhavsar committed on
Commit
b9405c8
·
1 Parent(s): 0ef09d3

added more info

Files changed (6)
  1. app.py +1 -1
  2. data_loader.py +355 -24
  3. results.csv +2 -2
  4. tabs/leaderboard.py +1 -1
  5. tabs/model_comparison.py +1 -1
  6. visualization.py +7 -2
app.py CHANGED
@@ -34,7 +34,7 @@ def create_app():
34
  df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS
35
  )
36
 
37
- mc_info, mc_plot = create_model_comparison_tab(df, HEADER_CONTENT, CARDS)
38
 
39
  # exp_outputs = create_exploration_tab(
40
  # df, MODELS, DATASETS, SCORES, HEADER_CONTENT
 
34
  df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS
35
  )
36
 
37
+ mc_info, mc_plot = create_model_comparison_tab(df, HEADER_CONTENT)
38
 
39
  # exp_outputs = create_exploration_tab(
40
  # df, MODELS, DATASETS, SCORES, HEADER_CONTENT
data_loader.py CHANGED
@@ -538,8 +538,8 @@ HEADER_CONTENT = (
538
  <div class="header-wrapper">
539
  <div class="header-content">
540
  <div class="title-section">
541
- <div class="subtitle-white">Welcome to the</div>
542
- <div class="title-gradient">Agent Leaderboard!</div>
543
 
544
  <div class="description">
545
  The landscape of AI agents is evolving rapidly, with major tech CEOs predicting 2025 as a pivotal year.
@@ -589,7 +589,7 @@ CARDS = """ <div class="metrics-grid">
589
  <div class="metric-card">
590
  <div class="metric-number metric-purple">14</div>
591
  <div class="metric-label">Evaluation Datasets</div>
592
- <div class="metric-detail primary">Cross-Domain Testing</div>
593
  <div class="metric-detail primary">Real-world use cases</div>
594
  </div>
595
 
@@ -708,20 +708,20 @@ METHODOLOGY = """
708
  font-size: 0.9rem;
709
  }
710
 
711
- [Rest of the CSS remains the same]
712
- </style>
713
- <!-- Methodology Section -->
714
- <div class="methodology-section">
715
- <h1 class="methodology-title">Methodology</h1>
 
716
 
717
- <h2 class="methodology-subtitle">Overview</h2>
718
- <p class="methodology-text">
719
- We evaluate language models' ability to effectively use tools
720
- in single and multi-turn conversations. Our evaluation focuses on both basic functionality and edge
721
- cases that challenge real-world applicability.
722
- </p>
723
 
724
- <style>
725
  .key-insights thead tr {
726
  background: linear-gradient(90deg, #60A5FA, #818CF8);
727
  }
@@ -739,10 +739,235 @@ METHODOLOGY = """
739
  padding: 1rem;
740
  border-bottom: 1px solid rgba(31, 41, 55, 0.5);
741
  }
742
  </style>
743

744
  <div class="methodology-section">
745
- <h1 class="methodology-subtitle">Key Insights</h1>
746
  <div class="table-container">
747
  <table class="dataset-table key-insights">
748
  <thead>
@@ -754,11 +979,11 @@ METHODOLOGY = """
754
  <tbody>
755
  <tr>
756
  <td>Performance Champion</td>
757
- <td>Gemini-2.0-flash dominates with 0.935 score at just $0.075 per million tokens, excelling in both complex tasks (0.95) and safety features (0.98)</td>
758
  </tr>
759
  <tr>
760
  <td>Price-Performance Paradox</td>
761
- <td>Top 3 models span 20x price difference yet only 3% performance gap, challenging pricing assumptions</td>
762
  </tr>
763
  <tr>
764
  <td>Open Vs Closed Source</td>
@@ -770,7 +995,7 @@ METHODOLOGY = """
770
  </tr>
771
  <tr>
772
  <td>Tool Miss Detection</td>
773
- <td>Dataset averages of 0.59 and 0.78 reveal fundamental challenges in handling edge cases and maintaining context, even as models excel at basic tasks</td>
774
  </tr>
775
  <tr>
776
  <td>Architecture Trade-offs</td>
@@ -780,7 +1005,7 @@ METHODOLOGY = """
780
  </table>
781
  </div>
782
 
783
- <h2 class="methodology-subtitle">Development Implications</h2>
784
  <div class="table-container">
785
  <table class="dataset-table key-insights">
786
  <thead>
@@ -818,7 +1043,92 @@ METHODOLOGY = """
818
  </table>
819
  </div>
820
 
821
- <h2 class="methodology-subtitle">Dataset Structure</h2>
822
  <div class="table-container">
823
  <table class="dataset-table">
824
  <thead>
@@ -832,10 +1142,10 @@ METHODOLOGY = """
832
  </thead>
833
  <tbody>
834
  <tr>
835
- <td rowspan="4">Single-Turn</td>
836
  <td>100 + 100</td>
837
  <td class="category-cell">Single Function Call</td>
838
- <td class="dataset-name">xlam_single_tool_single_call</td>
839
  <td class="purpose-cell">Evaluates basic ability to read documentation and make single function calls</td>
840
  </tr>
841
  <tr>
@@ -856,6 +1166,12 @@ METHODOLOGY = """
856
  <td class="dataset-name">tau_long_context</td>
857
  <td class="purpose-cell">Assesses handling of extended interactions and complex instructions</td>
858
  </tr>
859
  <tr>
860
  <td rowspan="5">Multi-Turn</td>
861
  <td>50 + 30</td>
@@ -890,6 +1206,16 @@ METHODOLOGY = """
890
  </tbody>
891
  </table>
892
  </div>
893
 
894
  <!-- Features Grid Section -->
895
  <div class="features-grid">
@@ -915,11 +1241,12 @@ METHODOLOGY = """
915
  </div>
916
  <h3 class="feature-title">360° Domain Evaluation</h3>
917
  <ul class="feature-list">
918
- <li>Cross-domain evaluation</li>
919
  <li>Real-world use cases</li>
920
  <li>Edge case evaluation</li>
921
  </ul>
922
  </div>
 
923
 
924
  <div class="feature-card">
925
  <div class="feature-icon">
@@ -935,4 +1262,8 @@ METHODOLOGY = """
935
  </ul>
936
  </div>
937

938
  """
 
538
  <div class="header-wrapper">
539
  <div class="header-content">
540
  <div class="title-section">
541
+
542
+ <div class="title-gradient">Agent Leaderboard</div>
543
 
544
  <div class="description">
545
  The landscape of AI agents is evolving rapidly, with major tech CEOs predicting 2025 as a pivotal year.
 
589
  <div class="metric-card">
590
  <div class="metric-number metric-purple">14</div>
591
  <div class="metric-label">Evaluation Datasets</div>
592
+ <div class="metric-detail primary">Multi-Domain Testing</div>
593
  <div class="metric-detail primary">Real-world use cases</div>
594
  </div>
595
 
 
708
  font-size: 0.9rem;
709
  }
710
 
711
+ .code-intro {
712
+ color: var(--text-secondary);
713
+ font-size: 1.1rem;
714
+ margin-bottom: 1.5rem;
715
+ line-height: 1.6;
716
+ }
717
 
718
+ .section-divider {
719
+ margin: 1rem 0;
720
+ border-top: 1px solid var(--border-color);
721
+ opacity: 0.3;
722
+ }
723
+
724
 
 
725
  .key-insights thead tr {
726
  background: linear-gradient(90deg, #60A5FA, #818CF8);
727
  }
 
739
  padding: 1rem;
740
  border-bottom: 1px solid rgba(31, 41, 55, 0.5);
741
  }
742
+
743
+ .highlight {
744
+ color: var(--accent-blue);
745
+ font-weight: 600;
746
+ display: inline-flex;
747
+ align-items: center;
748
+ }
749
+
750
+ .highlight::after {
751
+ content: ":";
752
+ margin-right: 0.5rem; /* Adds space after the colon */
753
+ }
754
+
755
+ @media (prefers-color-scheme: dark) {
756
+ :root {
757
+ --bg-primary: #0B0B19;
758
+ --bg-secondary: rgba(19, 19, 37, 0.4);
759
+ --text-primary: #ffffff;
760
+ --text-secondary: #94A3B8;
761
+ --border-color: rgba(31, 41, 55, 0.5);
762
+ --accent-blue: #60A5FA;
763
+ --accent-purple: #A78BFA;
764
+ --code-bg: #1E1E2E;
765
+ --code-line-highlight: rgba(96, 165, 250, 0.1);
766
+ --bullet-color: #60A5FA;
767
+ --table-header: #1a1b1e;
768
+ --table-border: #2d2e32;
769
+ --table-hover: #2d2e32;
770
+ }
771
+ }
772
+
773
+ @media (prefers-color-scheme: light) {
774
+ :root {
775
+ --bg-primary: #ffffff;
776
+ --bg-secondary: rgba(243, 244, 246, 0.4);
777
+ --text-primary: #111827;
778
+ --text-secondary: #4B5563;
779
+ --border-color: rgba(209, 213, 219, 0.5);
780
+ --accent-blue: #3B82F6;
781
+ --accent-purple: #8B5CF6;
782
+ --code-bg: #F8FAFC;
783
+ --code-line-highlight: rgba(59, 130, 246, 0.1);
784
+ --bullet-color: #3B82F6;
785
+ --table-header: #F8FAFC;
786
+ --table-border: #E5E7EB;
787
+ --table-hover: #F3F4F6;
788
+ }
789
+ }
790
+
791
+ .methodology-content {
792
+ max-width: 1200px;
793
+ margin: 0 auto;
794
+ padding: 2rem;
795
+ color: var(--text-secondary);
796
+ line-height: 1.7;
797
+ font-size: 1rem;
798
+ }
799
+
800
+ .section-title {
801
+ font-size: 2.5rem;
802
+ font-weight: 700;
803
+ margin: 3rem 0 1.5rem;
804
+ color: var(--text-primary);
805
+ background: linear-gradient(to right, var(--accent-blue), var(--accent-purple));
806
+ -webkit-background-clip: text;
807
+ -webkit-text-fill-color: transparent;
808
+ letter-spacing: -0.02em;
809
+ }
810
+
811
+ .subsection-title {
812
+ font-size: 1.75rem;
813
+ font-weight: 600;
814
+ margin: 2rem 0 1rem;
815
+ color: var(--text-primary);
816
+ letter-spacing: -0.01em;
817
+ }
818
+
819
+ .content-block {
820
+ background: var(--bg-secondary);
821
+ border: 1px solid var(--border-color);
822
+ border-radius: 12px;
823
+ padding: 1.5rem;
824
+ margin: 1.5rem 0;
825
+ }
826
+
827
+ .methodology-list {
828
+ list-style: none !important; /* Force remove default bullets */
829
+ padding: 0;
830
+ margin: 1rem 0;
831
+ }
832
+
833
+ .methodology-list li {
834
+ padding-left: 2rem;
835
+ position: relative;
836
+ margin: 1rem 0;
837
+ color: var(--text-secondary);
838
+ display: flex; /* Add flex display */
839
+ align-items: flex-start; /* Align items to top */
840
+ }
841
+
842
+ .methodology-list li::before {
843
+ content: '';
844
+ position: absolute;
845
+ left: 0;
846
+ top: 0.75rem;
847
+ width: 8px;
848
+ height: 8px;
849
+ background: var(--bullet-color);
850
+ border-radius: 50%;
851
+ box-shadow: 0 0 0 2px rgba(96, 165, 250, 0.2);
852
+ flex-shrink: 0; /* Prevent bullet from shrinking */
853
+ }
854
+
855
+ /* Additional fix for nested list items */
856
+ .methodology-list li > * {
857
+ list-style: none !important;
858
+ margin-left: 0;
859
+ padding-left: 0;
860
+ }
861
+
862
+ .code-block {
863
+ background: var(--code-bg);
864
+ border-radius: 12px;
865
+ padding: 1.5rem;
866
+ margin: 1.5rem 0;
867
+ font-family: 'SF Mono', 'Menlo', monospace;
868
+ font-size: 0.9rem;
869
+ overflow-x: auto;
870
+ border: 1px solid var(--border-color);
871
+ }
872
+
873
+ .code-block pre {
874
+ margin: 0;
875
+ padding: 0;
876
+ }
877
+
878
+ .highlight {
879
+ color: var(--accent-blue);
880
+ font-weight: 600;
881
+ }
882
+
883
+ /* Dataset Table Styling */
884
+ .dataset-table {
885
+ width: 100%;
886
+ border-collapse: collapse;
887
+ margin: 1.5rem 0;
888
+ background: var(--bg-secondary);
889
+ border-radius: 12px;
890
+ overflow: hidden;
891
+ }
892
+
893
+ .dataset-table th {
894
+ background: var(--table-header);
895
+ padding: 1rem;
896
+ text-align: left;
897
+ font-weight: 600;
898
+ color: var(--text-primary);
899
+ border-bottom: 2px solid var(--table-border);
900
+ }
901
+
902
+ .dataset-table td {
903
+ padding: 1rem;
904
+ border-bottom: 1px solid var(--table-border);
905
+ color: var(--text-secondary);
906
+ }
907
+
908
+ .dataset-table tbody tr:hover {
909
+ background: var(--table-hover);
910
+ }
911
+
912
+ .dataset-table td:first-child {
913
+ font-weight: 500;
914
+ }
915
+
916
  </style>
917
 
918
+ <!-- Methodology Section -->
919
+ <h1 class="section-title">Methodology</h1>
920
+ <p>Our evaluation follows a systematic approach to ensure a comprehensive and fair assessment of AI agents. We measure language models' ability to use tools
921
+ effectively in single- and multi-turn conversations, covering both basic functionality and edge
922
+ cases that challenge real-world applicability.</p>
923
+
924
+ <ul class="methodology-list">
925
+ <li><span class="highlight">Model Selection</span>We begin by curating a diverse set of leading language models, including both proprietary and open-source implementations.</li>
926
+ <li><span class="highlight">Agent Configuration</span>Each model is configured as an agent using a standardized system prompt and given access to a consistent set of tools.</li>
927
+ <li><span class="highlight">Metric Definition</span>Every interaction is scored with <a href="https://docs.galileo.ai/galileo/gen-ai-studio-products/galileo-guardrail-metrics/tool-selection-quality#tool-selection-quality">Tool Selection Quality (TSQ)</a>.</li>
928
+ <li><span class="highlight">Scoring System</span>The final performance score is an equally weighted average across all datasets; a sketch of this step follows the dataset table below.</li>
929
+ <li><span class="highlight">Dataset Curation</span>We strategically sampled from established benchmarking datasets, summarized in the table below; the Dataset Structure section gives the per-dataset breakdown.</li>
+ </ul>
930
+ <div class="methodology-section">
931
+ <div class="table-container">
932
+ <table class="dataset-table">
933
+ <thead>
934
+ <tr>
935
+ <th>Dataset</th>
936
+ <th>Domains</th>
937
+ <th>Link</th>
938
+ </tr>
939
+ </thead>
940
+ <tbody>
941
+ <tr>
942
+ <td>BFCL</td>
943
+ <td>Mathematics, Entertainment, Education, and Academic Domains</td>
944
+ <td><a href="https://gorilla.cs.berkeley.edu/leaderboard.html">View Dataset</a></td>
945
+ </tr>
946
+ <tr>
947
+ <td>τ-bench</td>
948
+ <td>Retail and Airline Industry Scenarios</td>
949
+ <td><a href="https://github.com/sierra-research/tau-bench">View Dataset</a></td>
950
+ </tr>
951
+ <tr>
952
+ <td>xLAM</td>
953
+ <td>Cross-domain Data Generation (21 Domains)</td>
954
+ <td><a href="https://www.salesforce.com/blog/xlam-large-action-models/">View Dataset</a></td>
955
+ </tr>
956
+ <tr>
957
+ <td>ToolACE</td>
958
+ <td>API Interactions across 390 Domains</td>
959
+ <td><a href="https://arxiv.org/abs/2409.00920">View Dataset</a></td>
960
+ </tr>
961
+ </tbody>
962
+ </table>
963
+ </div>
964
+ </div>
965
+
966
+
967
968
+
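+
+ <p class="code-intro">A minimal sketch of the equal-weighted scoring step, assuming the column layout of results.csv in this repo; the snippet is illustrative, not the leaderboard's actual aggregation code.</p>
+ <div class="code-block">
+ <pre>
+ import pandas as pd
+
+ df = pd.read_csv("results.csv")
+ # Metadata and aggregate columns come before the per-dataset TSQ scores
+ meta_cols = df.columns[: df.columns.get_loc("multi turn perf") + 1]
+ dataset_cols = [c for c in df.columns if c not in meta_cols]
+
+ # Equally weighted average across all 14 datasets
+ df["Model Avg"] = df[dataset_cols].mean(axis=1).round(3)
+ </pre>
+ </div>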
969
  <div class="methodology-section">
970
+ <h1 class="section-title">Key Insights</h1>
971
  <div class="table-container">
972
  <table class="dataset-table key-insights">
973
  <thead>
 
979
  <tbody>
980
  <tr>
981
  <td>Performance Champion</td>
982
+ <td>Gemini-2.0-flash dominates with a 0.938 score at a very low cost, excelling in both complex tasks and safety features.</td>
983
  </tr>
984
  <tr>
985
  <td>Price-Performance Paradox</td>
986
+ <td>The top 3 models span a 10x price difference yet show only a 4% performance gap, challenging pricing assumptions</td>
987
  </tr>
988
  <tr>
989
  <td>Open Vs Closed Source</td>
 
995
  </tr>
996
  <tr>
997
  <td>Tool Miss Detection</td>
998
+ <td>Low dataset averages of 0.60 (tool_miss) and 0.73 (miss_func) reveal fundamental challenges in handling edge cases and maintaining context, even as models excel at basic tasks</td>
999
  </tr>
1000
  <tr>
1001
  <td>Architecture Trade-offs</td>
 
1005
  </table>
1006
  </div>
1007
 
1008
+ <h1 class="section-title">Development Implications</h1>
1009
  <div class="table-container">
1010
  <table class="dataset-table key-insights">
1011
  <thead>
 
1043
  </table>
1044
  </div>
1045
 
1046
+
1047
+ <div></div>
1048
+ <h1 class="section-title">How Do We Measure Agent Performance?</h1>
1049
+ <div>
1050
+ <p>The complexity of tool calling extends far beyond simple API invocations. We developed the Tool Selection Quality metric to assess agents' tool call performance, evaluating tool selection accuracy and effectiveness of parameter usage.</p>
1051
+
1052
+ <div class="section-divider"></div>
1053
+ <h2 class="subsection-title">Scenario Recognition</h2>
1054
+ <div class="explanation-block">
1055
+ <p>When an agent encounters a query, it must first determine if tool usage is warranted. Information may already exist in the conversation history, making tool calls redundant. Alternatively, available tools might be insufficient or irrelevant to the task, requiring the agent to acknowledge limitations rather than force inappropriate tool usage.</p>
1056
+ </div>
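+
+ <p class="code-intro">A hypothetical sketch of how these outcomes can be told apart, assuming a LangChain-style response object with a tool_calls list and text content; the refusal phrasing matches the system prompt in the example code below.</p>
+ <div class="code-block">
+ <pre>
+ def classify_response(response):
+     # A tool call means the agent judged tool usage warranted
+     if getattr(response, "tool_calls", None):
+         return "tool_call"
+     # An explicit refusal means no relevant tool was available
+     if "cannot answer" in response.content.lower():
+         return "no_relevant_tool"
+     # Otherwise the agent asked the user for missing information
+     return "clarification_request"
+ </pre>
+ </div>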
1057
+
1058
+ <div class="section-divider"></div>
1059
+ <h2 class="subsection-title">Tool Selection Dynamics</h2>
1060
+ <div class="explanation-block">
1061
+ <p>Tool selection isn't binary—it involves both precision and recall. An agent might correctly identify one necessary tool while missing others (recall issue) or select appropriate tools alongside unnecessary ones (precision issue). While suboptimal, these scenarios represent different severity levels of selection errors.</p>
1062
+ </div>
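+
+ <p class="code-intro">A minimal sketch of scoring one turn's tool selection as set precision and recall; expected_tools and called_tools are illustrative names, not fields from the benchmark datasets.</p>
+ <div class="code-block">
+ <pre>
+ def selection_precision_recall(expected_tools, called_tools):
+     expected, called = set(expected_tools), set(called_tools)
+     if not expected and not called:
+         return 1.0, 1.0  # correctly abstained from calling tools
+     hits = len(expected & called)
+     precision = hits / len(called) if called else 0.0
+     recall = hits / len(expected) if expected else 0.0
+     return precision, recall
+
+ # A missed tool lowers recall; an unnecessary tool lowers precision
+ print(selection_precision_recall({"search_flights", "book_flight"},
+                                  {"search_flights", "get_weather"}))
+ </pre>
+ </div>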
1063
+
1064
+ <div class="section-divider"></div>
1065
+ <h2 class="subsection-title">Parameter Handling</h2>
1066
+ <div class="explanation-block">
1067
+ <p>Even with correct tool selection, argument handling introduces additional complexity. Agents must:</p>
1068
+ <ul class="methodology-list">
1069
+ <li>Provide all required parameters with correct naming</li>
1070
+ <li>Handle optional parameters appropriately</li>
1071
+ <li>Maintain parameter value accuracy</li>
1072
+ <li>Format arguments according to tool specifications</li>
1073
+ </ul>
1074
+ </div>
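+
+ <p class="code-intro">A sketch of checking a tool call's arguments against a JSON-schema-style tool definition; the spec shape mirrors common function-calling APIs and is an assumption, not the TSQ implementation.</p>
+ <div class="code-block">
+ <pre>
+ def validate_arguments(tool_spec, args):
+     props = tool_spec["parameters"]["properties"]
+     required = set(tool_spec["parameters"].get("required", []))
+     errors = []
+     # All required parameters must be present with correct names
+     for name in required - set(args):
+         errors.append(f"missing required parameter: {name}")
+     for name, value in args.items():
+         # Unknown names are precision errors against the spec
+         if name not in props:
+             errors.append(f"unknown parameter: {name}")
+         # Values must match the declared type
+         elif props[name].get("type") == "string" and not isinstance(value, str):
+             errors.append(f"bad type for {name}: expected string")
+     return errors
+ </pre>
+ </div>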
1075
+
1076
+ <div class="section-divider"></div>
1077
+ <h2 class="subsection-title">Sequential Decision Making</h2>
1078
+ <div class="explanation-block">
1079
+ <p>Multi-step tasks require agents to:</p>
1080
+ <ul class="methodology-list">
1081
+ <li>Determine optimal tool calling sequence</li>
1082
+ <li>Handle interdependencies between tool calls</li>
1083
+ <li>Maintain context across multiple operations</li>
1084
+ <li>Adapt to partial results or failures</li>
1085
+ </ul>
1086
+ </div>
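+
+ <p class="code-intro">A bare-bones sequential tool loop, assuming a LangChain-style llm_with_tools and a tools dict keyed by name; it illustrates result feedback and failure adaptation, not the harness used for this leaderboard.</p>
+ <div class="code-block">
+ <pre>
+ def run_task(llm_with_tools, tools, messages, max_steps=5):
+     for _ in range(max_steps):
+         response = llm_with_tools.invoke(messages)
+         messages.append(response)
+         if not response.tool_calls:
+             return response  # the agent considers the task complete
+         for call in response.tool_calls:
+             try:
+                 result = tools[call["name"]].invoke(call["args"])
+             except Exception as exc:
+                 result = f"tool failed: {exc}"  # let the agent adapt
+             # Feed each result back so dependent calls can use it
+             messages.append({"role": "tool", "tool_call_id": call["id"],
+                              "content": str(result)})
+     return response
+ </pre>
+ </div>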
1087
+
1088
+ <div class="section-divider"></div>
1089
+ <h2 class="subsection-title">Example Code</h2>
1090
+ <p class="code-intro">Below is an example of evaluating an LLM on one dataset with the TSQ scorer; file_path, tools, model, llm_handler, project_name, and run_name are defined elsewhere in the evaluation setup.</p>
1091
+ <div class="code-block">
1092
+
1093
+ <pre>
+ import pandas as pd
+ import promptquality as pq
+
+ # Load one benchmark dataset (one conversation per row)
+ df = pd.read_parquet(file_path)
+
+ # TSQ scorer: a ChainPoll-based judge powered by GPT-4o
+ chainpoll_tool_selection_scorer = pq.CustomizedChainPollScorer(
+     scorer_name=pq.CustomizedScorerName.tool_selection_quality,
+     model_alias=pq.Models.gpt_4o,
+ )
+
+ # Callback that logs every run to Galileo and applies the scorer
+ evaluate_handler = pq.GalileoPromptCallback(
+     project_name=project_name,
+     run_name=run_name,
+     scorers=[chainpoll_tool_selection_scorer],
+ )
+
+ # Deterministic generation so scores are reproducible
+ llm = llm_handler.get_llm(model, temperature=0.0, max_tokens=4000)
+ system_msg = {
+     "role": "system",
+     "content": 'Your job is to use the given tools to answer the query of human. If there is no relevant tool then reply with "I cannot answer the question with given tools". If tool is available but sufficient information is not available, then ask human to get the same. You can call as many tools as you want. Use multiple tools if needed. If the tools need to be called in a sequence then just call the first tool.',
+ }
+
+ outputs = []
+ for row in df.itertuples():
+     chain = llm.bind_tools(tools)
+     outputs.append(
+         chain.invoke(
+             [system_msg, *row.conversation],
+             config=dict(callbacks=[evaluate_handler]),
+         )
+     )
+
+ evaluate_handler.finish()
+ </pre>
1127
+ </div>
1128
+ </div>
1129
+
1130
+
1131
+ <h1 class="section-title">Dataset Structure</h1>
1132
  <div class="table-container">
1133
  <table class="dataset-table">
1134
  <thead>
 
1142
  </thead>
1143
  <tbody>
1144
  <tr>
1145
+ <td rowspan="5">Single-Turn</td>
1146
  <td>100 + 100</td>
1147
  <td class="category-cell">Single Function Call</td>
1148
+ <td class="dataset-name">xlam_single_tool_single_call, xlam_multiple_tool_single_call</td>
1149
  <td class="purpose-cell">Evaluates basic ability to read documentation and make single function calls</td>
1150
  </tr>
1151
  <tr>
 
1166
  <td class="dataset-name">tau_long_context</td>
1167
  <td class="purpose-cell">Assesses handling of extended interactions and complex instructions</td>
1168
  </tr>
1169
+ <tr>
1170
+ <td>100</td>
1171
+ <td class="category-cell">Missing Function</td>
1172
+ <td class="dataset-name">xlam_tool_miss</td>
1173
+ <td class="purpose-cell">Tests graceful handling of unavailable tools</td>
1174
+ </tr>
1175
  <tr>
1176
  <td rowspan="5">Multi-Turn</td>
1177
  <td>50 + 30</td>
 
1206
  </tbody>
1207
  </table>
1208
  </div>
1209
+
1210
+ <div class="section-divider"></div>
1211
+ <h2 class="section-title">Citation</h2>
1212
+ <div class="bibtex-citation" style="font-family: monospace; white-space: pre; padding: 1em; background-color: rgba(128, 128, 128, 0.1); border: 1px solid rgba(128, 128, 128, 0.2); border-radius: 4px; color: currentColor;">@misc{agent-leaderboard,
1213
+ author = {Pratik Bhavsar},
1214
+ title = {Agent Leaderboard},
1215
+ year = {2025},
1216
+ publisher = {Galileo.ai},
1217
+ howpublished = {\\url{https://huggingface.co/spaces/galileo-ai/agent-leaderboard}}
1218
+ }</div>
1219
 
1220
  <!-- Features Grid Section -->
1221
  <div class="features-grid">
 
1241
  </div>
1242
  <h3 class="feature-title">360° Domain Evaluation</h3>
1243
  <ul class="feature-list">
1244
+ <li>Multi-domain evaluation</li>
1245
  <li>Real-world use cases</li>
1246
  <li>Edge case evaluation</li>
1247
  </ul>
1248
  </div>
1249
+
1250
 
1251
  <div class="feature-card">
1252
  <div class="feature-icon">
 
1262
  </ul>
1263
  </div>
1264
 
1265
+ </div>
1266
+
1267
+
1268
+
1269
  """
results.csv CHANGED
@@ -1,5 +1,5 @@
1
  Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
2
- gemini-2.0-flash-exp,Private,Normal,Google,0.1,0.4,0.935,0.94,0.93,0.86,0.95,0.9,0.99,0.95,0.94,0.83,0.91,0.98,0.96,0.98,0.98,0.88,0.975
3
  gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
4
  gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
5
  gemini-1.5-pro,Private,Normal,Google,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
@@ -16,4 +16,4 @@ mistral-small-2409,Private,Normal,Mistral,0.2,0.6,0.750,0.82,0.68,0.7,0.77,0.72,
16
  ministral-8b-2410,Private,Normal,Mistral,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
17
  Meta-Llama-3.1-8B-Instruct-Turbo,Open source,Normal,Meta,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
18
  open-mistral-nemo-2407,Open source,Normal,Mistral,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
19
- ,,,,,,,0.83,0.79,0.81,0.78,0.76,0.89,0.80,0.96,0.60,0.81,0.82,0.82,0.92,0.85,0.73,0.80
 
1
  Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
2
+ gemini-2.0-flash-001,Private,Normal,Google,0.15,0.6,0.938,0.95,0.93,0.91,0.94,0.9,0.96,0.92,0.95,0.89,0.91,0.98,0.93,0.97,0.98,0.93,0.965
3
  gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
4
  gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
5
  gemini-1.5-pro,Private,Normal,Google,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
 
16
  ministral-8b-2410,Private,Normal,Mistral,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
17
  Meta-Llama-3.1-8B-Instruct-Turbo,Open source,Normal,Meta,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
18
  open-mistral-nemo-2407,Open source,Normal,Mistral,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
19
+ ,,,,,,,0.83,0.79,0.81,0.78,0.76,0.88,0.80,0.96,0.60,0.81,0.82,0.81,0.92,0.85,0.73,0.80
tabs/leaderboard.py CHANGED
@@ -126,7 +126,7 @@ def filter_leaderboard(df, model_type, category, sort_by):
126
  <th>Type</th>
127
  <th>Vendor</th>
128
  <th>Cost (I/O)</th>
129
- <th>Category Score</th>
130
  </tr>
131
  </thead>
132
  <tbody>
 
126
  <th>Type</th>
127
  <th>Vendor</th>
128
  <th>Cost (I/O)</th>
129
+ <th>Avg Category Score (TSQ)</th>
130
  </tr>
131
  </thead>
132
  <tbody>
tabs/model_comparison.py CHANGED
@@ -47,7 +47,7 @@ def compare_models(df, model_names=None):
47
  return info_html, radar_chart
48
 
49
 
50
- def create_model_comparison_tab(df, HEADER_CONTENT, CARDS):
51
  with gr.Tab("Model Comparison"):
52
  gr.HTML(HEADER_CONTENT)
53
  with gr.Column():
 
47
  return info_html, radar_chart
48
 
49
 
50
+ def create_model_comparison_tab(df, HEADER_CONTENT):
51
  with gr.Tab("Model Comparison"):
52
  gr.HTML(HEADER_CONTENT)
53
  with gr.Column():
visualization.py CHANGED
@@ -39,7 +39,12 @@ def get_performance_chart(df, category_name="Overall"):
39
  fontweight="bold",
40
  color=colors["text"],
41
  )
42
- ax.set_xlabel("Average Score", fontsize=14, labelpad=10, color=colors["text"])
 
 
 
 
 
43
  ax.set_xlim(0.0, 1.0)
44
 
45
  ax.set_yticks(np.arange(len(df_sorted)))
@@ -222,7 +227,7 @@ def get_performance_cost_chart(df, category_name="Overall"):
222
 
223
  ax.set_title(
224
  f"Performance vs. Cost - {category_name}",
225
- fontsize=12,
226
  pad=15,
227
  fontweight="bold",
228
  color=colors["text"],
 
39
  fontweight="bold",
40
  color=colors["text"],
41
  )
42
+ ax.set_xlabel(
43
+ "Average Score (Tool Selection Quality)",
44
+ fontsize=14,
45
+ labelpad=10,
46
+ color=colors["text"],
47
+ )
48
  ax.set_xlim(0.0, 1.0)
49
 
50
  ax.set_yticks(np.arange(len(df_sorted)))
 
227
 
228
  ax.set_title(
229
  f"Performance vs. Cost - {category_name}",
230
+ fontsize=14,
231
  pad=15,
232
  fontweight="bold",
233
  color=colors["text"],