Merge branch 'main' of hf.co:spaces/vector-institute/llm-eval-leaderboard
Files changed:
- app.py +8 -1
- inspect_log_file_names.json +26 -0
- refactor_eval_results.py +5 -0
- src/about.py +23 -22
- src/display/utils.py +1 -0
- src/leaderboard/read_evals.py +2 -0
- src/populate.py +1 -1
app.py CHANGED

@@ -60,7 +60,14 @@ def init_leaderboard(df, benchmark_type):
     if df is None or df.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
 
-
+    non_task_cols = ["Model"]
+    if benchmark_type == "agentic":
+        # Include agent column
+        non_task_cols.append("Agent")
+    elif benchmark_type == "base":
+        # Drop agent column
+        dataframe = dataframe.drop(columns=["Agent"])
+    AutoEvalColumnSubset = [c for c in fields(AutoEvalColumn) if ((c.name in non_task_cols) or (TASK_NAME_INVERSE_MAP.get(c.name, dict()).get("type", "")==benchmark_type))]
 
     # styler = dataframe.style.apply(bold_max, subset=pd.IndexSlice[:, dataframe.columns[1:]])
     # df.style.set_table_styles([
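For readers skimming the hunk above, here is a minimal, self-contained sketch of the column-selection idea it introduces. The `Col` dataclass, the sample columns, and the toy `TASK_NAME_INVERSE_MAP` below are illustrative stand-ins, not the repo's actual `AutoEvalColumn` fields or task map.

```python
from dataclasses import dataclass

@dataclass
class Col:
    name: str

# Hypothetical columns and task-type map, standing in for the repo's definitions.
ALL_COLS = [Col("Model"), Col("Agent"), Col("MMLU"), Col("GAIA")]
TASK_NAME_INVERSE_MAP = {
    "MMLU": {"type": "base"},
    "GAIA": {"type": "agentic"},
}

def select_columns(benchmark_type: str) -> list[str]:
    # Mirrors the new logic: always keep "Model", keep "Agent" only for the
    # agentic board, then keep task columns whose type matches the board.
    non_task_cols = ["Model"]
    if benchmark_type == "agentic":
        non_task_cols.append("Agent")
    subset = [
        c for c in ALL_COLS
        if c.name in non_task_cols
        or TASK_NAME_INVERSE_MAP.get(c.name, {}).get("type", "") == benchmark_type
    ]
    return [c.name for c in subset]

print(select_columns("base"))     # ['Model', 'MMLU']
print(select_columns("agentic"))  # ['Model', 'Agent', 'GAIA']
```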
inspect_log_file_names.json CHANGED

@@ -37,6 +37,32 @@
         "mmlu": "2024-11-04T16-26-13-05-00_mmlu_QvfQ46qJen2bvxiktHu86H.json",
         "mmmu_multiple_choice": "2025-01-20T23-21-33-05-00_mmmu-multiple-choice_3huWbH3SVWx7NTGwYoKbBD.json"
     },
+    "o3-mini": {
+        "math": "2025-02-06T18-33-30-05-00_math_86Gx8n4BxhpyfaSHmRcCUm.json",
+        "humaneval": "2025-02-06T20-58-48-05-00_humaneval_Dkod7CS9RmbbogYx9aEXtx.json",
+        "mmlu_pro": "2025-02-06T19-49-27-05-00_mmlu-pro_jz9woKfdKt8VMzqNFsy7kY.json",
+        "gpqa_diamond": "2025-02-06T17-57-54-05-00_gpqa-diamond_2znyMtdc7X4LJufxXeXA8Z.json",
+        "winogrande": "2025-02-06T22-50-40-05-00_winogrande_VsTW2uU2Kj66YoNoFfRfUj.json",
+        "gsm8k": "2025-02-06T18-23-05-05-00_gsm8k_d523pJzkcvobxamhhobCRb.json",
+        "arc_challenge": "2025-02-06T17-53-30-05-00_arc-challenge_AYFHec7wmd4jELF2Rgzfya.json",
+        "arc_easy": "2025-02-06T17-45-57-05-00_arc-easy_Nd8NP3K48tvwLVZb8kXDwg.json",
+        "gaia": "2025-02-05T23-21-20+00-00_gaia_hyMq8MzMm6NgAeq3dNqZSU.json",
+        "gdm_intercode_ctf": "2025-02-05T21-43-18+00-00_gdm-intercode-ctf_gdm29C6DuTEsX9qm9ymmrC.json",
+        "gdm_in_house_ctf": "2025-02-05T23-59-08+00-00_gdm-in-house-ctf_2zkAX5nkJoxDnVKpJL9VgW.json",
+        "agentharm_benign": "2025-02-03T18-49-08-08-00_agentharm-benign_Gv94YFpAXaaCJqe3Fc6yr3.json",
+        "agentharm": "2025-02-03T18-17-03-08-00_agentharm_DmN6i5HrgXHNARjsuSewjg.json",
+        "swe_bench": "2025-02-03T06-49-09+00-00_openai-o3-mini.json"
+    },
+    "DeepSeek-R1": {
+        "mmlu_pro": "2025-02-12T11-02-35-05-00_mmlu-pro_BhD89DYN9KM3k4weSDfaQK.json",
+        "humaneval": "2025-02-03T11-45-22-05-00_humaneval_hnkHWYqrb5HxiBt2CWzCnq.json",
+        "math": "2025-02-11T11-38-10-05-00_math_ZYFSqsWsmP5kLRLHEMWULU.json",
+        "gsm8k": "2025-02-02T16-28-05-05-00_gsm8k_YMw6WiZkgTBQ54z5UHtDDX.json",
+        "arc_challenge": "2025-01-30T15-42-39-05-00_arc-challenge_CviW9ro6rKBbctkwJzQstp.json",
+        "winogrande": "2025-02-04T00-25-12-05-00_winogrande_NPgTbtqom2QSPKxeThWrdZ.json",
+        "arc_easy": "2025-01-30T12-48-35-05-00_arc-easy_SvRDfqsHDECQtvNU7rodZH.json",
+        "gpqa_diamond": "2025-02-11T11-37-45-05-00_gpqa-diamond_MwnVeLwyuiEAALr3M5q3dn.json"
+    },
     "o1": {
         "winogrande": "2025-01-20T16-46-06-05-00_winogrande_YUtAdEsForRffqe4Sm3wtR.json",
         "humaneval": "2025-01-17T14-59-12-05-00_humaneval_RRL8GMy9NakTxUHsDVWNng.json",
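This JSON file indexes Inspect eval log files per model and per task. As a rough sketch of how such an index might be consumed, the snippet below loads it and resolves a log path; the `logs/<model>/<file>` layout and the helper names are assumptions for illustration, not code from this repo.

```python
import json
from pathlib import Path

def load_log_index(index_path: str) -> dict:
    """Read the model -> task -> Inspect log file name mapping."""
    with open(index_path) as f:
        return json.load(f)

def log_path_for(index: dict, model: str, task: str, logs_dir: str = "logs") -> Path:
    # Assumes logs are stored under <logs_dir>/<model>/<file>; the actual
    # layout used by the Space may differ.
    return Path(logs_dir) / model / index[model][task]

index = load_log_index("inspect_log_file_names.json")
print(log_path_for(index, "o3-mini", "gsm8k"))
```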
refactor_eval_results.py CHANGED

@@ -42,6 +42,8 @@ MODEL_SHA_MAP = {
     "gpt-4o": "https://openai.com/index/hello-gpt-4o",
     "gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
     "o1": "https://openai.com/o1",
+    "o3-mini": "https://openai.com/index/openai-o3-mini",
+    "DeepSeek-R1": "https://api-docs.deepseek.com/news/news250120"
 }
 
 MODEL_VERSION_MAP = {
@@ -58,6 +60,8 @@ MODEL_VERSION_MAP = {
     "gpt-4o": "GPT-4o-20240806",
     "gpt-4o-mini": "GPT-4o-mini-20240718",
     "o1": "o1-20241217",
+    "o3-mini": "o3-mini-20250131",
+    "DeepSeek-R1": "DeepSeek-R1",
 }
 
 AGENTIC_LOG_MODEL_NAME_MAP = {
@@ -65,6 +69,7 @@ AGENTIC_LOG_MODEL_NAME_MAP = {
     "gemini-1.5-pro": "gemini-1.5-pro-002",
     "gpt-4o": "gpt-4o-2024-08-06",
     "o1": "o1-2024-12-17",
+    "o3-mini": "o3-mini-2025-01-31",
 }
 
 AGENTIC_TASKS = ["gaia", "gdm-intercode-ctf", "gdm-in-house-ctf", "agentharm", "swe-bench"]
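The three tables updated here map a model's leaderboard name to its source URL, its display version string, and the model id that appears in the agentic Inspect logs. A hypothetical helper (not part of refactor_eval_results.py) illustrating how they could be combined, with each map reduced to one entry for brevity:

```python
# Entries copied from the diff above, trimmed to o3-mini only.
MODEL_SHA_MAP = {"o3-mini": "https://openai.com/index/openai-o3-mini"}
MODEL_VERSION_MAP = {"o3-mini": "o3-mini-20250131"}
AGENTIC_LOG_MODEL_NAME_MAP = {"o3-mini": "o3-mini-2025-01-31"}

def model_metadata(name: str) -> dict:
    # Hypothetical helper: bundle the display version, source URL, and the
    # model id used in the agentic Inspect logs (falling back to the name).
    return {
        "display_version": MODEL_VERSION_MAP[name],
        "source_url": MODEL_SHA_MAP[name],
        "agentic_log_name": AGENTIC_LOG_MODEL_NAME_MAP.get(name, name),
    }

print(model_metadata("o3-mini"))
```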
src/about.py CHANGED

@@ -113,32 +113,33 @@ These benchmarks go beyond basic reasoning and evaluate more advanced, autonomou
 """
 
 REPRODUCIBILITY_TEXT = """
-##
-
-### 1) Make sure you can load your model and tokenizer using AutoClasses:
-```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
+## 🛠️ Reproducibility
+The [Vector State of Evaluation Leaderboard Repository](https://github.com/VectorInstitute/evaluation) repository contains the evaluation script to reproduce results presented on the leaderboard.
 
-
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
+### Install dependencies
 
-
-
+1. Create a python virtual env. with ```python>=3.10``` and activate it
+```bash
+python -m venv env
+source env/bin/activate
+```
 
-
-
+2. Install ```inspect_ai```, ```inspect_evals``` and other dependencies based on ```requirements.txt```
+```bash
+python -m pip install -r requirements.txt
+```
 
-
-
+3. Install any packages required for models you'd like to evaluate and use as grader models
+```bash
+python -m pip install <model_package>
+```
+Note: ```openai``` package is already included in ```requirements.txt```
 
-
-
-
-
+### Run Inspect evaluation
+1. Update the ```src/evals_cfg/run_cfg.yaml``` file to select the evals (base/agentic) and include all models to be evaluated
+2. Now run evaluation as follows:
+```bash
+python src/run_evals.py
+```
 """
 
src/display/utils.py CHANGED

@@ -27,6 +27,7 @@ class ColumnContent:
 auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["agent", ColumnContent, ColumnContent("Agent", "markdown", True, never_hidden=True)])
 # Scores
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "markdown", True)])
src/leaderboard/read_evals.py CHANGED

@@ -118,6 +118,8 @@ class EvalResult:
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
             AutoEvalColumn.model.name: make_clickable_model(self.model_version, self.revision),
+            # As of now all models use the basic inspect agent
+            AutoEvalColumn.agent.name: "[Basic Agent](https://inspect.ai-safety-institute.org.uk/agents.html#sec-basic-agent)"
         }
 
         for task in Tasks:
src/populate.py CHANGED

@@ -46,7 +46,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     df = df[cols].round(decimals=2)
 
     # subset for model and benchmark cols
-    df = df[[AutoEvalColumn.model.name] + benchmark_cols]
+    df = df[[AutoEvalColumn.model.name, AutoEvalColumn.agent.name] + benchmark_cols]
 
     # drop rows for which all benchmark cols are empty
     df = df.dropna(subset=benchmark_cols, axis=0, how="all")
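A toy illustration of what this one-line change does inside get_leaderboard_df: the Agent column is now kept next to Model before rows with no benchmark scores are dropped. The model names and scores below are made up for the example.

```python
import pandas as pd

benchmark_cols = ["MMLU", "GSM8K"]
df = pd.DataFrame({
    "Model": ["model-a", "model-b"],        # made-up rows
    "Agent": ["Basic Agent", "Basic Agent"],
    "MMLU": [86.9, None],                   # made-up scores
    "GSM8K": [95.8, None],
    "extra_col": ["x", "y"],
})

# Keep Model + Agent + benchmark columns, then drop rows with no scores at all.
df = df[["Model", "Agent"] + benchmark_cols]
df = df.dropna(subset=benchmark_cols, axis=0, how="all")
print(df)
```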