Update examples
Files changed:
- src/about.py (+18, -18)
- src/display/css_html_js.py (+6, -0)
src/about.py

@@ -79,20 +79,20 @@ These benchmarks assess fundamental reasoning and knowledge capabilities of models
 
 <div class="benchmark-table-container">
 
-| Benchmark | Description |
+| Benchmark | Description | Domain |
 |--------------------|----------------------------------------------------------------------------------|-----------------------------------------------|
-| **ARC-Easy** / **ARC-Challenge** | Multiple-choice science questions measuring scientific & commonsense reasoning. |
-| **DROP** | Reading comprehension benchmark emphasizing discrete reasoning steps. |
-| **WinoGrande** | Commonsense reasoning challenge focused on co-reference resolution. |
-| **GSM8K** | Grade-school math word problems testing arithmetic & multi-step reasoning. |
-| **HellaSwag** | Commonsense inference task centered on action completion. |
-| **HumanEval** | Evaluates code generation and reasoning in a programming context. |
-| **IFEval** | Specialized benchmark for incremental formal reasoning. |
-| **
-| **
-| **
-| **
-
+| **ARC-Easy** / **ARC-Challenge** | Multiple-choice science questions measuring scientific & commonsense reasoning. | Example |
+| **DROP** | Reading comprehension benchmark emphasizing discrete reasoning steps. | Example |
+| **WinoGrande** | Commonsense reasoning challenge focused on co-reference resolution. | Example |
+| **GSM8K** | Grade-school math word problems testing arithmetic & multi-step reasoning. | Example |
+| **HellaSwag** | Commonsense inference task centered on action completion. | Example |
+| **HumanEval** | Evaluates code generation and reasoning in a programming context. | Example |
+| **IFEval** | Specialized benchmark for incremental formal reasoning. | Example |
+| **IFEval** | Specialized benchmark for incremental formal reasoning. | Example |
+| **MATH** | High school-level math questions requiring detailed solutions. | Example |
+| **MMLU** / **MMLU-Pro** | Multi-subject multiple-choice tests of advanced knowledge. | Example |
+| **GPQA-Diamond** | Question-answering benchmark assessing deeper reasoning & knowledge linking. | Example |
+| **MMMU** (Multi-Choice / Open-Ended) | Multilingual & multi-domain tasks testing structured & open responses. | Example |
 </div>
 
 ### 🚀 Agentic Benchmarks
@@ -103,11 +103,11 @@ These benchmarks go beyond basic reasoning and evaluate more advanced, autonomous
 
 | Benchmark | Description | Key Skills |
 |-----------------------|-----------------------------------------------------------------------------|-------------------------------------------------|
-| **GAIA** | Evaluates autonomous reasoning, planning, problem-solving, & multi-turn interactions. |
-| [**InterCode-CTF**](https://ukgovernmentbeis.github.io/inspect_evals/evals/cybersecurity/in_house_ctf/) | Capture-the-flag challenge focused on code interpretation & debugging. |
-| **GDM-In-House-CTF** | Capture-the-flag challenge testing web application security skills. |
-| **AgentHarm** / **AgentHarm-Benign** | Measures harmfulness of LLM agents (and benign behavior baseline). |
-| **SWE-Bench** | Tests AI agent ability to solve software engineering tasks. |
+| **GAIA** | Evaluates autonomous reasoning, planning, problem-solving, & multi-turn interactions. | Example |
+| [**InterCode-CTF**](https://ukgovernmentbeis.github.io/inspect_evals/evals/cybersecurity/in_house_ctf/) | Capture-the-flag challenge focused on code interpretation & debugging. | Example |
+| **GDM-In-House-CTF** | Capture-the-flag challenge testing web application security skills. | Example |
+| **AgentHarm** / **AgentHarm-Benign** | Measures harmfulness of LLM agents (and benign behavior baseline). | Example |
+| **SWE-Bench** | Tests AI agent ability to solve software engineering tasks. | Example |
 
 </div>
 """
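The tables above are edited inside a module-level string in src/about.py. As a rough illustration of how such a string typically ends up on the page, here is a minimal Gradio sketch; the constant name BENCHMARKS_TEXT, the gr.Blocks layout, and the shortened table are assumptions for illustration, not code from this commit.

```python
# Minimal sketch (assumed names): render the benchmark tables from src/about.py
# with Gradio. BENCHMARKS_TEXT stands in for the string constant edited above.
import gradio as gr

BENCHMARKS_TEXT = """
<div class="benchmark-table-container">

| Benchmark | Description | Domain |
|-----------|-------------|--------|
| **GSM8K** | Grade-school math word problems testing arithmetic & multi-step reasoning. | Example |

</div>
"""

with gr.Blocks() as demo:
    # gr.Markdown renders the pipe table as an HTML <table>, which the
    # benchmark-table-container class lets the app's CSS scope styles to.
    gr.Markdown(BENCHMARKS_TEXT)

if __name__ == "__main__":
    demo.launch()
```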
src/display/css_html_js.py

@@ -104,6 +104,11 @@ custom_css = """
     overflow-x: auto;
 }
 
+.llm-benchmark-tab-table .table-wrap table.table a {
+    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+    color: #ec4899
+}
+
 .llm-benchmark-tab-table .table-wrap table.table tr td,
 .llm-benchmark-tab-table .table-wrap table.table tr th {
     border-bottom: 1px solid var(--border-color-primary);
@@ -165,6 +170,7 @@ custom_css = """
     }
 }
 
+
 """
 
 get_window_url_params = """
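The new rule styles anchor tags inside the leaderboard table: the .table-wrap table.table markup in the selector is what Gradio emits around a Dataframe, and links appear in it when a column uses the markdown datatype. Below is a minimal sketch of how this stylesheet and selector could fit together; the component layout, headers, and example rows are assumptions for illustration, while custom_css and the llm-benchmark-tab-table class come from the file above.

```python
# Minimal sketch (assumed layout): hook the custom CSS from
# src/display/css_html_js.py into a Gradio app. Markdown cells in the
# Dataframe render as <a> tags, which the new
# ".llm-benchmark-tab-table .table-wrap table.table a" rule colors.
import gradio as gr

from src.display.css_html_js import custom_css  # string edited in this commit

# Hypothetical leaderboard rows; the first column uses markdown so the
# benchmark name can be a link.
rows = [
    ["[InterCode-CTF](https://ukgovernmentbeis.github.io/inspect_evals/evals/cybersecurity/in_house_ctf/)", 42.0],
    ["GSM8K", 87.5],
]

with gr.Blocks(css=custom_css) as demo:
    gr.Dataframe(
        value=rows,
        headers=["Benchmark", "Score"],
        datatype=["markdown", "number"],
        # elem_classes must include the class the CSS selector starts with.
        elem_classes=["llm-benchmark-tab-table"],
    )

if __name__ == "__main__":
    demo.launch()
```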