xeon27 committed
Commit 8b91831 · 1 Parent(s): 40ac9c7

Add script for refactoring results from log files

Files changed (1)
  1. refactor_eval_results.py +99 -0
refactor_eval_results.py ADDED
@@ -0,0 +1,99 @@
import json
import os


# Metric to report for each task (key into the scorer's "metrics" dict in the log)
METRIC_NAME = {
    # base
    "arc_easy": "accuracy",
    "arc_challenge": "accuracy",
    "gpqa_diamond": "accuracy",
    "drop": "mean",
    "winogrande": "accuracy",
    "gsm8k": "accuracy",
    "hellaswag": "accuracy",
    "humaneval": "mean",
    "ifeval": "final_acc",
    "math": "accuracy",
    "mmlu": "accuracy",
    "mmlu_pro": "accuracy",

    # agentic
    "gaia": "mean",
    "gdm_intercode_ctf": "accuracy",
}


def combine_eval_results(results_path: str, model_name: str) -> dict:
    """Merge every per-task JSON log under results_path/model_name into one results dict."""
    results = {
        "config": {
            "model_name": model_name,
            # dummy keys
            "model_sha": model_name,
            "model_dtype": "torch.float16",
        },
        "results": {},
    }
    for file in os.listdir(os.path.join(results_path, model_name)):
        if file.endswith(".json"):
            with open(os.path.join(results_path, model_name, file), "r") as f:
                result = json.load(f)
            # Task name is the last segment of the task path recorded in the log
            task_name = result["eval"]["task"].split("/")[-1]
            if task_name == "math":
                # math logs carry several scorers; keep the expression-equivalence one
                metrics = [elm for elm in result["results"]["scores"] if elm["name"] == "expression_equivalance"][0]["metrics"]  # TODO: change scorer if required
            else:
                metrics = result["results"]["scores"][0]["metrics"]
            metric_name = metrics[METRIC_NAME[task_name]]["name"]
            metric_value = metrics[METRIC_NAME[task_name]]["value"]
            results["results"].update({task_name: {metric_name: metric_value}})
    return results


def main():

    CACHE_PATH = os.getenv("HF_HOME", ".")
    EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
    EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")

    base_bm_input_path = "./base_benchmarking_logs"
    agentic_bm_input_path = "./agentic_benchmarking_logs"
    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
    os.makedirs(EVAL_REQUESTS_PATH, exist_ok=True)

    for model_name in os.listdir(base_bm_input_path):
        if os.path.isdir(os.path.join(base_bm_input_path, model_name)):
            # Combine base-benchmark logs, then merge in agentic results if present
            results = combine_eval_results(base_bm_input_path, model_name)
            if os.path.isdir(os.path.join(agentic_bm_input_path, model_name)):
                agentic_bm_results = combine_eval_results(agentic_bm_input_path, model_name)
                results["results"].update(agentic_bm_results["results"])
            with open(os.path.join(EVAL_RESULTS_PATH, f"{model_name}.json"), "w") as f:
                json.dump(results, f, indent=4)

            # Create dummy requests file
            requests = {
                "model": model_name,
                "base_model": "",
                "revision": "main",
                "private": False,
                "precision": "float16",
                "weight_type": "Original",
                "status": "FINISHED",
                "submitted_time": "",
                "model_type": "pretrained",
                "likes": 0,
                "params": 0,
                "license": "custom",
            }
            with open(os.path.join(EVAL_REQUESTS_PATH, f"{model_name}.json"), "w") as f:
                json.dump(requests, f, indent=4)


if __name__ == "__main__":
    main()
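
For reference, a minimal sketch of the per-model log layout combine_eval_results expects: one JSON log per task under <input_path>/<model_name>/. The directory name and the field paths (eval.task, results.scores[].metrics) are taken from the code above; the model name "example-model", the scorer name "choice", and the score value are made up for illustration only.

import json
import os

from refactor_eval_results import combine_eval_results  # assumes this script is importable from the working directory

# Hypothetical model directory with a single arc_easy log (illustrative values)
log_dir = "./base_benchmarking_logs/example-model"
os.makedirs(log_dir, exist_ok=True)

fake_log = {
    "eval": {"task": "inspect_evals/arc_easy"},  # only the last path segment is used as the task name
    "results": {
        "scores": [
            {
                "name": "choice",  # scorer name; only inspected for the "math" task
                "metrics": {"accuracy": {"name": "accuracy", "value": 0.75}},  # key chosen via METRIC_NAME
            }
        ]
    },
}
with open(os.path.join(log_dir, "arc_easy.json"), "w") as f:
    json.dump(fake_log, f)

print(combine_eval_results("./base_benchmarking_logs", "example-model"))
# -> {'config': {...}, 'results': {'arc_easy': {'accuracy': 0.75}}}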