Merge branch 'main' of hf.co:spaces/vector-institute/llm-eval-leaderboard
Files changed:
- app.py +8 -1
- inspect_log_file_names.json +26 -0
- refactor_eval_results.py +5 -0
- src/about.py +23 -22
- src/display/utils.py +1 -0
- src/leaderboard/read_evals.py +2 -0
- src/populate.py +1 -1
app.py CHANGED

@@ -60,7 +60,14 @@ def init_leaderboard(df, benchmark_type):
     if df is None or df.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
 
-
+    non_task_cols = ["Model"]
+    if benchmark_type == "agentic":
+        # Include agent column
+        non_task_cols.append("Agent")
+    elif benchmark_type == "base":
+        # Drop agent column
+        dataframe = dataframe.drop(columns=["Agent"])
+    AutoEvalColumnSubset = [c for c in fields(AutoEvalColumn) if ((c.name in non_task_cols) or (TASK_NAME_INVERSE_MAP.get(c.name, dict()).get("type", "")==benchmark_type))]
 
     # styler = dataframe.style.apply(bold_max, subset=pd.IndexSlice[:, dataframe.columns[1:]])
     # df.style.set_table_styles([
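For readers skimming the hunk above, here is a minimal, self-contained sketch of the column-selection idea it introduces. The `Col` dataclass, the sample columns, and the toy `TASK_NAME_INVERSE_MAP` below are illustrative stand-ins, not the repo's actual `AutoEvalColumn` fields or task map.

```python
from dataclasses import dataclass

@dataclass
class Col:
    name: str

# Hypothetical columns and task-type map, standing in for the repo's definitions.
ALL_COLS = [Col("Model"), Col("Agent"), Col("MMLU"), Col("GAIA")]
TASK_NAME_INVERSE_MAP = {
    "MMLU": {"type": "base"},
    "GAIA": {"type": "agentic"},
}

def select_columns(benchmark_type: str) -> list[str]:
    # Mirrors the new logic: always keep "Model", keep "Agent" only for the
    # agentic board, then keep task columns whose type matches the board.
    non_task_cols = ["Model"]
    if benchmark_type == "agentic":
        non_task_cols.append("Agent")
    subset = [
        c for c in ALL_COLS
        if c.name in non_task_cols
        or TASK_NAME_INVERSE_MAP.get(c.name, {}).get("type", "") == benchmark_type
    ]
    return [c.name for c in subset]

print(select_columns("base"))     # ['Model', 'MMLU']
print(select_columns("agentic"))  # ['Model', 'Agent', 'GAIA']
```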
inspect_log_file_names.json CHANGED

@@ -37,6 +37,32 @@
         "mmlu": "2024-11-04T16-26-13-05-00_mmlu_QvfQ46qJen2bvxiktHu86H.json",
         "mmmu_multiple_choice": "2025-01-20T23-21-33-05-00_mmmu-multiple-choice_3huWbH3SVWx7NTGwYoKbBD.json"
     },
+    "o3-mini": {
+        "math": "2025-02-06T18-33-30-05-00_math_86Gx8n4BxhpyfaSHmRcCUm.json",
+        "humaneval": "2025-02-06T20-58-48-05-00_humaneval_Dkod7CS9RmbbogYx9aEXtx.json",
+        "mmlu_pro": "2025-02-06T19-49-27-05-00_mmlu-pro_jz9woKfdKt8VMzqNFsy7kY.json",
+        "gpqa_diamond": "2025-02-06T17-57-54-05-00_gpqa-diamond_2znyMtdc7X4LJufxXeXA8Z.json",
+        "winogrande": "2025-02-06T22-50-40-05-00_winogrande_VsTW2uU2Kj66YoNoFfRfUj.json",
+        "gsm8k": "2025-02-06T18-23-05-05-00_gsm8k_d523pJzkcvobxamhhobCRb.json",
+        "arc_challenge": "2025-02-06T17-53-30-05-00_arc-challenge_AYFHec7wmd4jELF2Rgzfya.json",
+        "arc_easy": "2025-02-06T17-45-57-05-00_arc-easy_Nd8NP3K48tvwLVZb8kXDwg.json",
+        "gaia": "2025-02-05T23-21-20+00-00_gaia_hyMq8MzMm6NgAeq3dNqZSU.json",
+        "gdm_intercode_ctf": "2025-02-05T21-43-18+00-00_gdm-intercode-ctf_gdm29C6DuTEsX9qm9ymmrC.json",
+        "gdm_in_house_ctf": "2025-02-05T23-59-08+00-00_gdm-in-house-ctf_2zkAX5nkJoxDnVKpJL9VgW.json",
+        "agentharm_benign": "2025-02-03T18-49-08-08-00_agentharm-benign_Gv94YFpAXaaCJqe3Fc6yr3.json",
+        "agentharm": "2025-02-03T18-17-03-08-00_agentharm_DmN6i5HrgXHNARjsuSewjg.json",
+        "swe_bench": "2025-02-03T06-49-09+00-00_openai-o3-mini.json"
+    },
+    "DeepSeek-R1": {
+        "mmlu_pro": "2025-02-12T11-02-35-05-00_mmlu-pro_BhD89DYN9KM3k4weSDfaQK.json",
+        "humaneval": "2025-02-03T11-45-22-05-00_humaneval_hnkHWYqrb5HxiBt2CWzCnq.json",
+        "math": "2025-02-11T11-38-10-05-00_math_ZYFSqsWsmP5kLRLHEMWULU.json",
+        "gsm8k": "2025-02-02T16-28-05-05-00_gsm8k_YMw6WiZkgTBQ54z5UHtDDX.json",
+        "arc_challenge": "2025-01-30T15-42-39-05-00_arc-challenge_CviW9ro6rKBbctkwJzQstp.json",
+        "winogrande": "2025-02-04T00-25-12-05-00_winogrande_NPgTbtqom2QSPKxeThWrdZ.json",
+        "arc_easy": "2025-01-30T12-48-35-05-00_arc-easy_SvRDfqsHDECQtvNU7rodZH.json",
+        "gpqa_diamond": "2025-02-11T11-37-45-05-00_gpqa-diamond_MwnVeLwyuiEAALr3M5q3dn.json"
+    },
     "o1": {
         "winogrande": "2025-01-20T16-46-06-05-00_winogrande_YUtAdEsForRffqe4Sm3wtR.json",
         "humaneval": "2025-01-17T14-59-12-05-00_humaneval_RRL8GMy9NakTxUHsDVWNng.json",
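This JSON file indexes Inspect eval log files per model and per task. As a rough sketch of how such an index might be consumed, the snippet below loads it and resolves a log path; the `logs/<model>/<file>` layout and the helper names are assumptions for illustration, not code from this repo.

```python
import json
from pathlib import Path

def load_log_index(index_path: str) -> dict:
    """Read the model -> task -> Inspect log file name mapping."""
    with open(index_path) as f:
        return json.load(f)

def log_path_for(index: dict, model: str, task: str, logs_dir: str = "logs") -> Path:
    # Assumes logs are stored under <logs_dir>/<model>/<file>; the actual
    # layout used by the Space may differ.
    return Path(logs_dir) / model / index[model][task]

index = load_log_index("inspect_log_file_names.json")
print(log_path_for(index, "o3-mini", "gsm8k"))
```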
refactor_eval_results.py CHANGED

@@ -42,6 +42,8 @@ MODEL_SHA_MAP = {
     "gpt-4o": "https://openai.com/index/hello-gpt-4o",
     "gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
     "o1": "https://openai.com/o1",
+    "o3-mini": "https://openai.com/index/openai-o3-mini",
+    "DeepSeek-R1": "https://api-docs.deepseek.com/news/news250120"
 }
 
 MODEL_VERSION_MAP = {
@@ -58,6 +60,8 @@ MODEL_VERSION_MAP = {
     "gpt-4o": "GPT-4o-20240806",
     "gpt-4o-mini": "GPT-4o-mini-20240718",
     "o1": "o1-20241217",
+    "o3-mini": "o3-mini-20250131",
+    "DeepSeek-R1": "DeepSeek-R1",
 }
 
 AGENTIC_LOG_MODEL_NAME_MAP = {
@@ -65,6 +69,7 @@ AGENTIC_LOG_MODEL_NAME_MAP = {
     "gemini-1.5-pro": "gemini-1.5-pro-002",
     "gpt-4o": "gpt-4o-2024-08-06",
     "o1": "o1-2024-12-17",
+    "o3-mini": "o3-mini-2025-01-31",
 }
 
 AGENTIC_TASKS = ["gaia", "gdm-intercode-ctf", "gdm-in-house-ctf", "agentharm", "swe-bench"]
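The three tables updated here map a model's leaderboard name to its source URL, its display version string, and the model id that appears in the agentic Inspect logs. A hypothetical helper (not part of refactor_eval_results.py) illustrating how they could be combined, with each map reduced to one entry for brevity:

```python
# Entries copied from the diff above, trimmed to o3-mini only.
MODEL_SHA_MAP = {"o3-mini": "https://openai.com/index/openai-o3-mini"}
MODEL_VERSION_MAP = {"o3-mini": "o3-mini-20250131"}
AGENTIC_LOG_MODEL_NAME_MAP = {"o3-mini": "o3-mini-2025-01-31"}

def model_metadata(name: str) -> dict:
    # Hypothetical helper: bundle the display version, source URL, and the
    # model id used in the agentic Inspect logs (falling back to the name).
    return {
        "display_version": MODEL_VERSION_MAP[name],
        "source_url": MODEL_SHA_MAP[name],
        "agentic_log_name": AGENTIC_LOG_MODEL_NAME_MAP.get(name, name),
    }

print(model_metadata("o3-mini"))
```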
src/about.py CHANGED

@@ -113,32 +113,33 @@ These benchmarks go beyond basic reasoning and evaluate more advanced, autonomou
 """
 
 REPRODUCIBILITY_TEXT = """
-##
-
-### 1) Make sure you can load your model and tokenizer using AutoClasses:
-```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
+## 🛠️ Reproducibility
+The [Vector State of Evaluation Leaderboard Repository](https://github.com/VectorInstitute/evaluation) repository contains the evaluation script to reproduce results presented on the leaderboard.
 
-
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
+### Install dependencies
 
-
-
+1. Create a python virtual env. with ```python>=3.10``` and activate it
+```bash
+python -m venv env
+source env/bin/activate
+```
 
-
-
+2. Install ```inspect_ai```, ```inspect_evals``` and other dependencies based on ```requirements.txt```
+```bash
+python -m pip install -r requirements.txt
+```
 
-
-
+3. Install any packages required for models you'd like to evaluate and use as grader models
+```bash
+python -m pip install <model_package>
+```
+Note: ```openai``` package is already included in ```requirements.txt```
 
-
-
-
-
+### Run Inspect evaluation
+1. Update the ```src/evals_cfg/run_cfg.yaml``` file to select the evals (base/agentic) and include all models to be evaluated
+2. Now run evaluation as follows:
+```bash
+python src/run_evals.py
+```
 """
 
src/display/utils.py CHANGED

@@ -27,6 +27,7 @@ class ColumnContent:
 auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["agent", ColumnContent, ColumnContent("Agent", "markdown", True, never_hidden=True)])
 # Scores
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "markdown", True)])
src/leaderboard/read_evals.py CHANGED

@@ -118,6 +118,8 @@ class EvalResult:
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
             AutoEvalColumn.model.name: make_clickable_model(self.model_version, self.revision),
+            # As of now all models use the basic inspect agent
+            AutoEvalColumn.agent.name: "[Basic Agent](https://inspect.ai-safety-institute.org.uk/agents.html#sec-basic-agent)"
         }
 
         for task in Tasks:
src/populate.py CHANGED

@@ -46,7 +46,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     df = df[cols].round(decimals=2)
 
     # subset for model and benchmark cols
-    df = df[[AutoEvalColumn.model.name] + benchmark_cols]
+    df = df[[AutoEvalColumn.model.name, AutoEvalColumn.agent.name] + benchmark_cols]
 
     # drop rows for which all benchmark cols are empty
     df = df.dropna(subset=benchmark_cols, axis=0, how="all")
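A toy illustration of what this one-line change does inside get_leaderboard_df: the Agent column is now kept next to Model before rows with no benchmark scores are dropped. The model names and scores below are made up for the example.

```python
import pandas as pd

benchmark_cols = ["MMLU", "GSM8K"]
df = pd.DataFrame({
    "Model": ["model-a", "model-b"],        # made-up rows
    "Agent": ["Basic Agent", "Basic Agent"],
    "MMLU": [86.9, None],                   # made-up scores
    "GSM8K": [95.8, None],
    "extra_col": ["x", "y"],
})

# Keep Model + Agent + benchmark columns, then drop rows with no scores at all.
df = df[["Model", "Agent"] + benchmark_cols]
df = df.dropna(subset=benchmark_cols, axis=0, how="all")
print(df)
```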