xeon27
committed on
Commit
·
5438c77
1
Parent(s):
e1d7bbb
Add relevant model links
Browse files- refactor_eval_results.py +38 -15
refactor_eval_results.py
CHANGED
@@ -16,12 +16,30 @@ METRIC_NAME = {
|
|
16 |
"math": "accuracy",
|
17 |
"mmlu": "accuracy",
|
18 |
"mmlu_pro": "accuracy",
|
|
|
|
|
19 |
|
20 |
# agentic
|
21 |
"gaia": "mean",
|
22 |
"gdm_intercode_ctf": "accuracy",
|
23 |
}
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
def combine_eval_results(results_path: str, model_name: str) -> dict:
|
27 |
results = dict(
|
@@ -29,7 +47,7 @@ def combine_eval_results(results_path: str, model_name: str) -> dict:
|
|
29 |
"config": {
|
30 |
"model_name": model_name,
|
31 |
# dummy keys
|
32 |
-
"model_sha": model_name,
|
33 |
"model_dtype": "torch.float16",
|
34 |
},
|
35 |
"results": {},
|
@@ -38,21 +56,26 @@ def combine_eval_results(results_path: str, model_name: str) -> dict:
|
|
38 |
for file in os.listdir(os.path.join(results_path, model_name)):
|
39 |
if file.endswith(".json"):
|
40 |
with open(os.path.join(results_path, model_name, file), "r") as f:
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
|
|
|
|
53 |
}
|
54 |
-
|
55 |
-
|
|
|
|
|
|
|
56 |
return results
|
57 |
|
58 |
|
|
|
16 |
"math": "accuracy",
|
17 |
"mmlu": "accuracy",
|
18 |
"mmlu_pro": "accuracy",
|
19 |
+
"mmmu_multiple_choice": "accuracy",
|
20 |
+
"mmmu_open": "accuracy",
|
21 |
|
22 |
# agentic
|
23 |
"gaia": "mean",
|
24 |
"gdm_intercode_ctf": "accuracy",
|
25 |
}
|
26 |
|
27 |
+
# Links to the model card / product page for every model on the leaderboard.
# Used as the "model_sha" field of each result payload.
MODEL_SHA_MAP = {
    # open source models
    "c4ai-command-r-plus": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",  # TODO: verify for the 08-2024 version
    "Meta-Llama-3.1-70B-Instruct": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
    "Mistral-Large-Instruct-2407": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
    "Qwen2.5-72B-Instruct": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",

    # closed source models
    "claude-3-5-sonnet-20241022": "https://www.anthropic.com/claude/sonnet",
    "gemini-1.5-flash": "https://deepmind.google/technologies/gemini/flash",  # TODO: points to 2.0, can't find page for 1.5
    "gemini-1.5-pro": "https://deepmind.google/technologies/gemini/pro",
    "gpt-4o": "https://openai.com/index/hello-gpt-4o",
    "gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
    "o1": "https://openai.com/o1",
}
|
42 |
+
|
43 |
|
44 |
def combine_eval_results(results_path: str, model_name: str) -> dict:
    """Combine per-task eval result files for one model into a single payload.

    Reads every ``*.json`` file under ``<results_path>/<model_name>`` and
    merges each task's headline metric (chosen via ``METRIC_NAME``) into a
    leaderboard-style dict.

    Args:
        results_path: Directory containing one sub-directory per model.
        model_name: Name of the model's sub-directory; must also be a key of
            ``MODEL_SHA_MAP``.

    Returns:
        A dict with a ``"config"`` section (model metadata) and a
        ``"results"`` section mapping ``task_name -> {metric_name: value}``.
    """
    results = {
        "config": {
            "model_name": model_name,
            # dummy keys
            "model_sha": MODEL_SHA_MAP[model_name],
            "model_dtype": "torch.float16",
        },
        "results": {},
    }
    for file in os.listdir(os.path.join(results_path, model_name)):
        if file.endswith(".json"):
            with open(os.path.join(results_path, model_name, file), "r") as f:
                try:
                    result = json.load(f)
                    # Task names are logged as "<namespace>/<task>"; keep the last part.
                    task_name = result["eval"]["task"].split("/")[-1]
                    if task_name == "math":
                        # "math" logs several scorers; keep the expression-equivalence one.
                        metrics = [elm for elm in result["results"]["scores"] if elm["name"] == "expression_equivalance"][0]["metrics"]  # TODO: change scorer if required
                    else:
                        metrics = result["results"]["scores"][0]["metrics"]
                    # Look the metric up once instead of once per field.
                    metric = metrics[METRIC_NAME[task_name]]
                    results["results"].update(
                        {
                            task_name: {
                                metric["name"]: metric["value"]
                            }
                        }
                    )
                except KeyError as e:
                    # Best-effort: report the malformed file and keep processing the rest.
                    print(f"KeyError: {e}")
                    print(model_name)
                    print(file)
    return results
|
80 |
|
81 |
|