xeon27
committed on
Commit
·
5438c77
1
Parent(s):
e1d7bbb
Add relevant model links
Browse files- refactor_eval_results.py +38 -15
refactor_eval_results.py
CHANGED
@@ -16,12 +16,30 @@ METRIC_NAME = {
|
|
16 |
"math": "accuracy",
|
17 |
"mmlu": "accuracy",
|
18 |
"mmlu_pro": "accuracy",
|
|
|
|
|
19 |
|
20 |
# agentic
|
21 |
"gaia": "mean",
|
22 |
"gdm_intercode_ctf": "accuracy",
|
23 |
}
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
def combine_eval_results(results_path: str, model_name: str) -> dict:
|
27 |
results = dict(
|
@@ -29,7 +47,7 @@ def combine_eval_results(results_path: str, model_name: str) -> dict:
|
|
29 |
"config": {
|
30 |
"model_name": model_name,
|
31 |
# dummy keys
|
32 |
-
"model_sha": model_name,
|
33 |
"model_dtype": "torch.float16",
|
34 |
},
|
35 |
"results": {},
|
@@ -38,21 +56,26 @@ def combine_eval_results(results_path: str, model_name: str) -> dict:
|
|
38 |
for file in os.listdir(os.path.join(results_path, model_name)):
|
39 |
if file.endswith(".json"):
|
40 |
with open(os.path.join(results_path, model_name, file), "r") as f:
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
|
|
|
|
53 |
}
|
54 |
-
|
55 |
-
|
|
|
|
|
|
|
56 |
return results
|
57 |
|
58 |
|
|
|
16 |
"math": "accuracy",
|
17 |
"mmlu": "accuracy",
|
18 |
"mmlu_pro": "accuracy",
|
19 |
+
"mmmu_multiple_choice": "accuracy",
|
20 |
+
"mmmu_open": "accuracy",
|
21 |
|
22 |
# agentic
|
23 |
"gaia": "mean",
|
24 |
"gdm_intercode_ctf": "accuracy",
|
25 |
}
|
26 |
|
27 |
+
# Links to the model card / product page for every model on the leaderboard.
# Used as the "model_sha" field of each result payload.
MODEL_SHA_MAP = {
    # open source models
    "c4ai-command-r-plus": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",  # TODO: verify for the 08-2024 version
    "Meta-Llama-3.1-70B-Instruct": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
    "Mistral-Large-Instruct-2407": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
    "Qwen2.5-72B-Instruct": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",

    # closed source models
    "claude-3-5-sonnet-20241022": "https://www.anthropic.com/claude/sonnet",
    "gemini-1.5-flash": "https://deepmind.google/technologies/gemini/flash",  # TODO: points to 2.0, can't find page for 1.5
    "gemini-1.5-pro": "https://deepmind.google/technologies/gemini/pro",
    "gpt-4o": "https://openai.com/index/hello-gpt-4o",
    "gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
    "o1": "https://openai.com/o1",
}
|
42 |
+
|
43 |
|
44 |
def combine_eval_results(results_path: str, model_name: str) -> dict:
    """Combine per-task eval result files for one model into a single payload.

    Reads every ``*.json`` file under ``<results_path>/<model_name>`` and
    merges each task's headline metric (chosen via ``METRIC_NAME``) into a
    leaderboard-style dict.

    Args:
        results_path: Directory containing one sub-directory per model.
        model_name: Name of the model's sub-directory; must also be a key of
            ``MODEL_SHA_MAP``.

    Returns:
        A dict with a ``"config"`` section (model metadata) and a
        ``"results"`` section mapping ``task_name -> {metric_name: value}``.
    """
    results = {
        "config": {
            "model_name": model_name,
            # dummy keys
            "model_sha": MODEL_SHA_MAP[model_name],
            "model_dtype": "torch.float16",
        },
        "results": {},
    }
    for file in os.listdir(os.path.join(results_path, model_name)):
        if file.endswith(".json"):
            with open(os.path.join(results_path, model_name, file), "r") as f:
                try:
                    result = json.load(f)
                    # Task names are logged as "<namespace>/<task>"; keep the last part.
                    task_name = result["eval"]["task"].split("/")[-1]
                    if task_name == "math":
                        # "math" logs several scorers; keep the expression-equivalence one.
                        metrics = [elm for elm in result["results"]["scores"] if elm["name"] == "expression_equivalance"][0]["metrics"]  # TODO: change scorer if required
                    else:
                        metrics = result["results"]["scores"][0]["metrics"]
                    # Look the metric up once instead of once per field.
                    metric = metrics[METRIC_NAME[task_name]]
                    results["results"].update(
                        {
                            task_name: {
                                metric["name"]: metric["value"]
                            }
                        }
                    )
                except KeyError as e:
                    # Best-effort: report the malformed file and keep processing the rest.
                    print(f"KeyError: {e}")
                    print(model_name)
                    print(file)
    return results
|
80 |
|
81 |
|