from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
    type: str
    source: str


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # benchmark key in the json file, metric key in the json file, name to display in the leaderboard, task type (base/agentic), source URL
    # base
    task0 = Task("arc_easy", "accuracy", "ARC-Easy", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
    task1 = Task("arc_challenge", "accuracy", "ARC-Challenge", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
    task2 = Task("drop", "mean", "DROP", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/drop")
    task3 = Task("winogrande", "accuracy", "WinoGrande", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/winogrande")
    task4 = Task("gsm8k", "accuracy", "GSM8K", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gsm8k")
    task5 = Task("hellaswag", "accuracy", "HellaSwag", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/hellaswag")
    task6 = Task("humaneval", "mean", "HumanEval", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/humaneval")
    task7 = Task("ifeval", "final_acc", "IFEval", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/ifeval")
    task8 = Task("math", "accuracy", "MATH", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mathematics")
    task9 = Task("mmlu", "accuracy", "MMLU", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu")
    task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu_pro")
    task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa")
    task12 = Task("mmmu_multiple_choice", "accuracy", "MMMU-Multiple-Choice", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu")
    task13 = Task("mmmu_open", "accuracy", "MMMU-Open-Ended", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu")
    # agentic
    task14 = Task("gaia", "accuracy", "GAIA", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")
    task15 = Task("gdm_intercode_ctf", "accuracy", "GDM-InterCode-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf")
    task16 = Task("gdm_in_house_ctf", "accuracy", "GDM-In-House-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/in_house_ctf")
    task17 = Task("agentharm", "avg_score", "AgentHarm", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm")
    task18 = Task("agentharm_benign", "avg_score", "AgentHarm-Benign", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm")
    task19 = Task("swe_bench", "mean", "SWE-Bench", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/swe_bench")


NUM_FEWSHOT = 0  # Change this to match your few-shot setting
# ---------------------------------------------------
# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">State of Evaluation Leaderboard</h1>"""
SINGLE_TURN_TASK_NAMES = ", ".join([f"[{task.value.col_name}]({task.value.source})" for task in Tasks if task.value.type == "base"])
AGENTIC_TASK_NAMES = ", ".join([f"[{task.value.col_name}]({task.value.source})" for task in Tasks if task.value.type == "agentic"])
# What does your leaderboard evaluate?
INTRODUCTION_TEXT = f"""
Powered by **Inspect** and **Inspect Evals**, the **Vector State of Evaluation Leaderboard** presents an objective evaluation of leading frontier models across a comprehensive suite of benchmarks. Go beyond the summary metrics: click through to interactive reporting for each model and benchmark to explore sample-level performance and detailed traces."""
# Which evaluations are you running? how can people reproduce what you have?
ABOUT_TEXT = f"""
## Vector Institute
The **Vector Institute** is dedicated to advancing the fields of artificial intelligence and machine learning through cutting-edge research, collaborative projects, and open-source contributions. Our mission is to drive excellence and innovation in AI, fostering a vibrant community of researchers, developers, and industry partners.
## 🎯 Benchmarks
This leaderboard showcases performance across a comprehensive suite of benchmarks, designed to rigorously evaluate different aspects of AI model capabilities. Let's explore the benchmarks we use:
### Inspect Evals
This leaderboard leverages [Inspect Evals](https://ukgovernmentbeis.github.io/inspect_evals/) to power its evaluations. Inspect Evals is an open-source repository built on the Inspect AI framework. Developed collaboratively by the Vector Institute, Arcadia Impact, and the UK AI Safety Institute, it provides a comprehensive suite of high-quality benchmarks spanning diverse domains such as coding, mathematics, cybersecurity, reasoning, and general knowledge.
#### Transparent and Detailed Insights
All evaluations presented on this leaderboard are run using Inspect Evals. To facilitate in-depth analysis and promote transparency, we provide [Inspect Logs](https://inspect.ai-safety-institute.org.uk/log-viewer.html) for every benchmark run. These logs offer sample- and trace-level reporting, allowing the community to explore the granular details of model performance.
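Beyond the log viewer, a published log can also be explored programmatically. The sketch below is illustrative only (the log path is a placeholder) and assumes a log file downloaded locally plus the `inspect_ai` Python package:
```python
# Minimal sketch, not part of the leaderboard code: load a downloaded Inspect
# log and drill down to sample-level results. The file name is a placeholder.
from inspect_ai.log import read_eval_log

log = read_eval_log("logs/example.eval")
print(log.eval.task, log.eval.model)        # which benchmark / model this run covers
if log.results is not None:
    for score in log.results.scores:        # headline metrics per scorer
        print(score.name, score.metrics)
for sample in log.samples or []:            # per-sample scores and traces
    print(sample.id, sample.scores)
```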
### ⚙️ Base Benchmarks
These benchmarks assess fundamental reasoning and knowledge capabilities of models.
<div class="benchmark-table-container">
| Benchmark | Description | Domain |
|--------------------|----------------------------------------------------------------------------------|-----------------------------------------------|
| **ARC-Easy** / **ARC-Challenge** | Multiple-choice science questions measuring scientific & commonsense reasoning. | Science & commonsense reasoning |
| **DROP** | Reading comprehension benchmark emphasizing discrete reasoning steps. | Reading comprehension |
| **WinoGrande** | Commonsense reasoning challenge focused on co-reference resolution. | Commonsense reasoning |
| **GSM8K** | Grade-school math word problems testing arithmetic & multi-step reasoning. | Mathematics |
| **HellaSwag** | Commonsense inference task centered on action completion. | Commonsense reasoning |
| **HumanEval** | Evaluates code generation and reasoning in a programming context. | Coding |
| **IFEval** | Tests how reliably models follow verifiable formatting and content instructions. | Instruction following |
| **MATH** | Competition-style math problems requiring detailed, multi-step solutions. | Mathematics |
| **MMLU** / **MMLU-Pro** | Multi-subject multiple-choice tests of advanced knowledge. | General knowledge |
| **GPQA-Diamond** | Graduate-level science questions designed to resist shallow lookup and test deep reasoning. | Science |
| **MMMU** (Multiple-Choice / Open-Ended) | Multimodal, multi-discipline college-level problems combining images and text. | Multimodal reasoning |
</div>
### 🚀 Agentic Benchmarks
These benchmarks go beyond basic reasoning and evaluate more advanced, autonomous, or "agentic" capabilities of models, such as planning and interaction.
<div class="benchmark-table-container">
| Benchmark | Description | Key Skills |
|-----------------------|-----------------------------------------------------------------------------|-------------------------------------------------|
| **GAIA** | Evaluates autonomous reasoning, planning, and problem-solving on real-world assistant tasks. | Planning, tool use, web research |
| **GDM-InterCode-CTF** | Capture-the-flag challenges solved by writing and debugging code in an interactive environment. | Cybersecurity, code execution |
| **GDM-In-House-CTF** | Capture-the-flag challenge testing web application security skills. | Web security, exploitation |
| **AgentHarm** / **AgentHarm-Benign** | Measures harmfulness of LLM agents, with a benign-task baseline. | Safety, refusal, tool use |
| **SWE-Bench** | Tests an agent's ability to resolve real-world software engineering issues. | Software engineering, code repair |
</div>
"""
REPRODUCIBILITY_TEXT = """
## 🛠️ Reproducibility
The [Vector State of Evaluation Leaderboard](https://github.com/VectorInstitute/evaluation) repository contains the evaluation scripts needed to reproduce the results presented on the leaderboard.
### Install dependencies
1. Create a Python virtual environment with ```python>=3.10``` and activate it
```bash
python -m venv env
source env/bin/activate
```
2. Install ```inspect_ai```, ```inspect_evals```, and the other dependencies listed in ```requirements.txt```
```bash
python -m pip install -r requirements.txt
```
3. Install any packages required for the models you'd like to evaluate and use as grader models
Note: the ```openai``` package is already included in ```requirements.txt```
```bash
python -m pip install <model_package>
```
### Run Inspect evaluation
1. Update the ```src/evals_cfg/run_cfg.yaml``` file to select which evals to run (base/agentic) and to list all models to be evaluated
2. Run the evaluation:
```bash
python src/run_evals.py
```
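For a quick spot-check of a single benchmark outside the leaderboard configuration, an eval can also be driven directly from the ```inspect_ai``` Python API. The sketch below is illustrative (the benchmark and model names are examples, and credentials for the chosen provider must be configured):
```python
# Minimal sketch: run one registered inspect_evals benchmark directly.
# Assumes inspect_ai and inspect_evals are installed (see requirements.txt)
# and provider credentials (e.g. OPENAI_API_KEY) are set in the environment.
from inspect_ai import eval

# "inspect_evals/gsm8k" is the task's registry name; the model string follows
# Inspect's provider/model convention. limit=10 keeps the spot-check small.
eval("inspect_evals/gsm8k", model="openai/gpt-4o", limit=10)
```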
"""