eval-leaderboard

Running

App Files Files Community

xeon27 committed on Jan 20

Commit

8b91831

1 Parent(s): 40ac9c7

Add script for refactoring results from log files

Browse files

Files changed (1) hide show

refactor_eval_results.py +99 -0

refactor_eval_results.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import json
+import os
+# Maps a task name (the last "/"-separated component of a log's
+# result["eval"]["task"]) to the key that combine_eval_results() uses to pick
+# that task's entry out of a scorer's "metrics" dict.
+METRIC_NAME = {
+    # base
+    "arc_easy": "accuracy",
+    "arc_challenge": "accuracy",
+    "gpqa_diamond": "accuracy",
+    "drop": "mean",
+    "winogrande": "accuracy",
+    "gsm8k": "accuracy",
+    "hellaswag": "accuracy",
+    "humaneval": "mean",
+    "ifeval": "final_acc",
+    "math": "accuracy",
+    "mmlu": "accuracy",
+    "mmlu_pro": "accuracy",
+    # agentic
+    "gaia": "mean",
+    "gdm_intercode_ctf": "accuracy",
+}
+def combine_eval_results(results_path: str, model_name: str) -> dict:
+    """Merge the per-task JSON logs of one model into a single results dict.
+
+    Every ``*.json`` file directly inside ``<results_path>/<model_name>`` is
+    parsed. The task name is the last ``/``-separated component of
+    ``result["eval"]["task"]``, and the metric to keep is selected through
+    ``METRIC_NAME``.
+
+    Args:
+        results_path: Directory that holds one sub-directory per model.
+        model_name: Name of the model sub-directory to read.
+
+    Returns:
+        ``{"config": {...}, "results": {task_name: {metric_name: value}}}``.
+        Files are processed in sorted name order, so when several logs
+        report the same task the last one in that order wins.
+
+    Raises:
+        KeyError: If a log's task has no entry in ``METRIC_NAME`` or the log
+            lacks one of the keys read below.
+        IndexError: If a log has no scores, or a ``math`` log has no
+            ``expression_equivalance`` scorer.
+    """
+    model_dir = os.path.join(results_path, model_name)
+    results = {
+        "config": {
+            "model_name": model_name,
+            # dummy keys
+            "model_sha": model_name,
+            "model_dtype": "torch.float16",
+        },
+        "results": {},
+    }
+    # os.listdir() returns entries in arbitrary order; sorting makes the
+    # outcome reproducible when two logs cover the same task.
+    for file_name in sorted(os.listdir(model_dir)):
+        if not file_name.endswith(".json"):
+            continue
+        # Decode as UTF-8 explicitly instead of relying on the platform
+        # locale, and keep the file open only for the read itself.
+        with open(os.path.join(model_dir, file_name), "r", encoding="utf-8") as f:
+            result = json.load(f)
+        task_name = result["eval"]["task"].split("/")[-1]
+        scores = result["results"]["scores"]
+        if task_name == "math":
+            # TODO: change scorer if required
+            # NOTE(review): the scorer name is kept byte-for-byte; presumably
+            # it matches the spelling used in the logs -- confirm.
+            matching = [s for s in scores if s["name"] == "expression_equivalance"]
+            if not matching:
+                raise IndexError(
+                    f"no 'expression_equivalance' scorer found in {file_name}"
+                )
+            metrics = matching[0]["metrics"]
+        else:
+            metrics = scores[0]["metrics"]
+        metric = metrics[METRIC_NAME[task_name]]
+        results["results"][task_name] = {metric["name"]: metric["value"]}
+    return results
+def main():
+    """Convert raw benchmarking logs into per-model results and request files.
+
+    For every model directory found in ``./base_benchmarking_logs`` the base
+    results are combined, merged with the results from
+    ``./agentic_benchmarking_logs/<model_name>`` when that directory exists,
+    and written to ``<HF_HOME>/eval-results/<model_name>.json``. A dummy
+    request file is written to ``<HF_HOME>/eval-queue/<model_name>.json``.
+    ``HF_HOME`` defaults to the current directory when unset.
+
+    Only the base log directory is iterated, so a model that has agentic
+    logs but no base logs produces no output.
+    """
+    cache_path = os.getenv("HF_HOME", ".")
+    eval_results_path = os.path.join(cache_path, "eval-results")
+    eval_requests_path = os.path.join(cache_path, "eval-queue")
+    base_bm_input_path = "./base_benchmarking_logs"
+    agentic_bm_input_path = "./agentic_benchmarking_logs"
+    os.makedirs(eval_results_path, exist_ok=True)
+    os.makedirs(eval_requests_path, exist_ok=True)
+    for model_name in os.listdir(base_bm_input_path):
+        # Skip stray non-directory entries: without this guard ``results``
+        # would be unbound on the first iteration, or the previous model's
+        # results would be written out under this entry's name.
+        if not os.path.isdir(os.path.join(base_bm_input_path, model_name)):
+            continue
+        results = combine_eval_results(base_bm_input_path, model_name)
+        if os.path.isdir(os.path.join(agentic_bm_input_path, model_name)):
+            agentic_bm_results = combine_eval_results(agentic_bm_input_path, model_name)
+            results["results"].update(agentic_bm_results["results"])
+        results_file = os.path.join(eval_results_path, f"{model_name}.json")
+        with open(results_file, "w", encoding="utf-8") as f:
+            json.dump(results, f, indent=4)
+        # Create dummy requests file
+        requests = {
+            "model": model_name,
+            "base_model": "",
+            "revision": "main",
+            "private": False,
+            "precision": "float16",
+            "weight_type": "Original",
+            "status": "FINISHED",
+            "submitted_time": "",
+            "model_type": "pretrained",
+            "likes": 0,
+            "params": 0,
+            "license": "custom",
+        }
+        requests_file = os.path.join(eval_requests_path, f"{model_name}.json")
+        with open(requests_file, "w", encoding="utf-8") as f:
+            json.dump(requests, f, indent=4)
+if __name__ == "__main__":
+    main()