jwilles committed
Commit 3159db8 · 2 Parent(s): 7564219 176ef94

Merge branch 'main' of hf.co:spaces/vector-institute/llm-eval-leaderboard

app.py CHANGED
@@ -60,7 +60,14 @@ def init_leaderboard(df, benchmark_type):
     if df is None or df.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
 
-    AutoEvalColumnSubset = [c for c in fields(AutoEvalColumn) if ((c.name=="Model") or (TASK_NAME_INVERSE_MAP.get(c.name, dict()).get("type", "")==benchmark_type))]
+    non_task_cols = ["Model"]
+    if benchmark_type == "agentic":
+        # Include agent column
+        non_task_cols.append("Agent")
+    elif benchmark_type == "base":
+        # Drop agent column
+        dataframe = dataframe.drop(columns=["Agent"])
+    AutoEvalColumnSubset = [c for c in fields(AutoEvalColumn) if ((c.name in non_task_cols) or (TASK_NAME_INVERSE_MAP.get(c.name, dict()).get("type", "")==benchmark_type))]
 
     # styler = dataframe.style.apply(bold_max, subset=pd.IndexSlice[:, dataframe.columns[1:]])
     # df.style.set_table_styles([
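The hunk above only changes which columns each leaderboard tab exposes. A self-contained sketch of the selection rule, using hypothetical stand-ins (`Column`, `COLUMNS`, `TASK_TYPE`) rather than the repo's `AutoEvalColumn`, `fields`, and `TASK_NAME_INVERSE_MAP`:

```python
# Sketch of the column-selection rule from the hunk above, with hypothetical
# stand-ins for the repo's AutoEvalColumn / TASK_NAME_INVERSE_MAP structures.
from dataclasses import dataclass


@dataclass(frozen=True)
class Column:
    name: str


COLUMNS = [Column("Model"), Column("Agent"), Column("MMLU"), Column("GAIA")]
TASK_TYPE = {"MMLU": "base", "GAIA": "agentic"}  # stand-in for TASK_NAME_INVERSE_MAP


def visible_columns(benchmark_type: str) -> list[str]:
    # Non-task columns are always shown; "Agent" is added only for agentic benchmarks.
    non_task_cols = ["Model"]
    if benchmark_type == "agentic":
        non_task_cols.append("Agent")
    return [
        c.name
        for c in COLUMNS
        if c.name in non_task_cols or TASK_TYPE.get(c.name, "") == benchmark_type
    ]


print(visible_columns("base"))     # ['Model', 'MMLU']
print(visible_columns("agentic"))  # ['Model', 'Agent', 'GAIA']
```

Under this rule "Agent" surfaces only on the agentic tab, which is what the new `non_task_cols` branch in the commit encodes.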
inspect_log_file_names.json CHANGED
@@ -37,6 +37,32 @@
         "mmlu": "2024-11-04T16-26-13-05-00_mmlu_QvfQ46qJen2bvxiktHu86H.json",
         "mmmu_multiple_choice": "2025-01-20T23-21-33-05-00_mmmu-multiple-choice_3huWbH3SVWx7NTGwYoKbBD.json"
     },
+    "o3-mini": {
+        "math": "2025-02-06T18-33-30-05-00_math_86Gx8n4BxhpyfaSHmRcCUm.json",
+        "humaneval": "2025-02-06T20-58-48-05-00_humaneval_Dkod7CS9RmbbogYx9aEXtx.json",
+        "mmlu_pro": "2025-02-06T19-49-27-05-00_mmlu-pro_jz9woKfdKt8VMzqNFsy7kY.json",
+        "gpqa_diamond": "2025-02-06T17-57-54-05-00_gpqa-diamond_2znyMtdc7X4LJufxXeXA8Z.json",
+        "winogrande": "2025-02-06T22-50-40-05-00_winogrande_VsTW2uU2Kj66YoNoFfRfUj.json",
+        "gsm8k": "2025-02-06T18-23-05-05-00_gsm8k_d523pJzkcvobxamhhobCRb.json",
+        "arc_challenge": "2025-02-06T17-53-30-05-00_arc-challenge_AYFHec7wmd4jELF2Rgzfya.json",
+        "arc_easy": "2025-02-06T17-45-57-05-00_arc-easy_Nd8NP3K48tvwLVZb8kXDwg.json",
+        "gaia": "2025-02-05T23-21-20+00-00_gaia_hyMq8MzMm6NgAeq3dNqZSU.json",
+        "gdm_intercode_ctf": "2025-02-05T21-43-18+00-00_gdm-intercode-ctf_gdm29C6DuTEsX9qm9ymmrC.json",
+        "gdm_in_house_ctf": "2025-02-05T23-59-08+00-00_gdm-in-house-ctf_2zkAX5nkJoxDnVKpJL9VgW.json",
+        "agentharm_benign": "2025-02-03T18-49-08-08-00_agentharm-benign_Gv94YFpAXaaCJqe3Fc6yr3.json",
+        "agentharm": "2025-02-03T18-17-03-08-00_agentharm_DmN6i5HrgXHNARjsuSewjg.json",
+        "swe_bench": "2025-02-03T06-49-09+00-00_openai-o3-mini.json"
+    },
+    "DeepSeek-R1": {
+        "mmlu_pro": "2025-02-12T11-02-35-05-00_mmlu-pro_BhD89DYN9KM3k4weSDfaQK.json",
+        "humaneval": "2025-02-03T11-45-22-05-00_humaneval_hnkHWYqrb5HxiBt2CWzCnq.json",
+        "math": "2025-02-11T11-38-10-05-00_math_ZYFSqsWsmP5kLRLHEMWULU.json",
+        "gsm8k": "2025-02-02T16-28-05-05-00_gsm8k_YMw6WiZkgTBQ54z5UHtDDX.json",
+        "arc_challenge": "2025-01-30T15-42-39-05-00_arc-challenge_CviW9ro6rKBbctkwJzQstp.json",
+        "winogrande": "2025-02-04T00-25-12-05-00_winogrande_NPgTbtqom2QSPKxeThWrdZ.json",
+        "arc_easy": "2025-01-30T12-48-35-05-00_arc-easy_SvRDfqsHDECQtvNU7rodZH.json",
+        "gpqa_diamond": "2025-02-11T11-37-45-05-00_gpqa-diamond_MwnVeLwyuiEAALr3M5q3dn.json"
+    },
     "o1": {
         "winogrande": "2025-01-20T16-46-06-05-00_winogrande_YUtAdEsForRffqe4Sm3wtR.json",
         "humaneval": "2025-01-17T14-59-12-05-00_humaneval_RRL8GMy9NakTxUHsDVWNng.json",
refactor_eval_results.py CHANGED
@@ -42,6 +42,8 @@ MODEL_SHA_MAP = {
     "gpt-4o": "https://openai.com/index/hello-gpt-4o",
     "gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
     "o1": "https://openai.com/o1",
+    "o3-mini": "https://openai.com/index/openai-o3-mini",
+    "DeepSeek-R1": "https://api-docs.deepseek.com/news/news250120"
 }
 
 MODEL_VERSION_MAP = {
@@ -58,6 +60,8 @@ MODEL_VERSION_MAP = {
     "gpt-4o": "GPT-4o-20240806",
     "gpt-4o-mini": "GPT-4o-mini-20240718",
     "o1": "o1-20241217",
+    "o3-mini": "o3-mini-20250131",
+    "DeepSeek-R1": "DeepSeek-R1",
 }
 
 AGENTIC_LOG_MODEL_NAME_MAP = {
@@ -65,6 +69,7 @@ AGENTIC_LOG_MODEL_NAME_MAP = {
     "gemini-1.5-pro": "gemini-1.5-pro-002",
     "gpt-4o": "gpt-4o-2024-08-06",
     "o1": "o1-2024-12-17",
+    "o3-mini": "o3-mini-2025-01-31",
 }
 
 AGENTIC_TASKS = ["gaia", "gdm-intercode-ctf", "gdm-in-house-ctf", "agentharm", "swe-bench"]
src/about.py CHANGED
@@ -113,32 +113,33 @@ These benchmarks go beyond basic reasoning and evaluate more advanced, autonomou
 """
 
 REPRODUCIBILITY_TEXT = """
-## Reproduce and Extend the Leaderboard
-
-### 1) Make sure you can load your model and tokenizer using AutoClasses:
-```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
-
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
-
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
-
-## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+## 🛠️ Reproducibility
+The [Vector State of Evaluation Leaderboard Repository](https://github.com/VectorInstitute/evaluation) contains the evaluation scripts needed to reproduce the results presented on the leaderboard.
+
+### Install dependencies
+
+1. Create a Python virtual environment with ```python>=3.10``` and activate it
+```bash
+python -m venv env
+source env/bin/activate
+```
+
+2. Install ```inspect_ai```, ```inspect_evals``` and other dependencies based on ```requirements.txt```
+```bash
+python -m pip install -r requirements.txt
+```
+
+3. Install any packages required for the models you'd like to evaluate and use as grader models
+```bash
+python -m pip install <model_package>
+```
+Note: the ```openai``` package is already included in ```requirements.txt```
+
+### Run Inspect evaluation
+1. Update the ```src/evals_cfg/run_cfg.yaml``` file to select the evals (base/agentic) and include all models to be evaluated
+2. Run the evaluation as follows:
+```bash
+python src/run_evals.py
+```
 """
 
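For orientation, the committed instructions drive Inspect through `src/run_evals.py` and `run_cfg.yaml`. A minimal sketch of a roughly equivalent direct call using the public `inspect_ai` / `inspect_evals` APIs follows; the task, model string, and `limit` are illustrative choices, not the repo's configuration.

```python
# Sketch: running one of the leaderboard's base benchmarks directly with Inspect,
# bypassing src/run_evals.py. Requires OPENAI_API_KEY in the environment; the task,
# model string, and limit are illustrative, not the leaderboard's settings.
from inspect_ai import eval
from inspect_evals.gsm8k import gsm8k

logs = eval(gsm8k(), model="openai/o3-mini", limit=10)
print(logs[0].status, logs[0].eval.task)
```

Each run writes an Inspect log file; the `inspect_log_file_names.json` change earlier in this commit appears to map each model/task pair to one of those log files so the Space can link to them.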
 
src/display/utils.py CHANGED
@@ -27,6 +27,7 @@ class ColumnContent:
 auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["agent", ColumnContent, ColumnContent("Agent", "markdown", True, never_hidden=True)])
 # Scores
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "markdown", True)])
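In the stock Hugging Face leaderboard template that this Space appears to follow, `auto_eval_column_dict` is later converted into the `AutoEvalColumn` dataclass (a step outside this hunk); that is what makes `AutoEvalColumn.agent.name` resolvable in `read_evals.py` and `populate.py` below. A sketch of that assumed wiring, with a simplified `ColumnContent`:

```python
# Sketch (assumed, not shown in this diff): how [field_name, type, default] entries
# like the new "agent" row typically become attributes of AutoEvalColumn.
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    never_hidden: bool = False


auto_eval_column_dict = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["agent", ColumnContent, ColumnContent("Agent", "markdown", True, never_hidden=True)],
]

AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
print(AutoEvalColumn.agent.name)  # -> "Agent", the display name used by the code below
```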
src/leaderboard/read_evals.py CHANGED
@@ -118,6 +118,8 @@ class EvalResult:
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
             AutoEvalColumn.model.name: make_clickable_model(self.model_version, self.revision),
+            # As of now all models use the basic inspect agent
+            AutoEvalColumn.agent.name: "[Basic Agent](https://inspect.ai-safety-institute.org.uk/agents.html#sec-basic-agent)"
         }
 
         for task in Tasks:
src/populate.py CHANGED
@@ -46,7 +46,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     df = df[cols].round(decimals=2)
 
     # subset for model and benchmark cols
-    df = df[[AutoEvalColumn.model.name] + benchmark_cols]
+    df = df[[AutoEvalColumn.model.name, AutoEvalColumn.agent.name] + benchmark_cols]
 
     # drop rows for which all benchmark cols are empty
     df = df.dropna(subset=benchmark_cols, axis=0, how="all")
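To make the one-line change above concrete, here is a toy pandas sketch of the subset-then-dropna pattern in this hunk; the column names and values are invented for illustration.

```python
# Toy illustration of the subset + dropna behaviour in the hunk above
# (column names and scores are stand-ins; only the pandas pattern matches the diff).
import pandas as pd

df = pd.DataFrame({
    "Model": ["A", "B"],
    "Agent": ["Basic Agent", "Basic Agent"],
    "GAIA": [42.0, None],
    "SWE-Bench": [12.5, None],
})
benchmark_cols = ["GAIA", "SWE-Bench"]

# Keep the model and agent columns plus the benchmark columns.
df = df[["Model", "Agent"] + benchmark_cols]
# Model B has no benchmark scores, so it is dropped even though "Agent" is filled.
df = df.dropna(subset=benchmark_cols, axis=0, how="all")
print(df)
```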