xeon27 committed
Commit 5438c77 · 1 Parent(s): e1d7bbb

Add relevant model links

Files changed (1):
  1. refactor_eval_results.py +38 -15
refactor_eval_results.py CHANGED
@@ -16,12 +16,30 @@ METRIC_NAME = {
     "math": "accuracy",
     "mmlu": "accuracy",
     "mmlu_pro": "accuracy",
+    "mmmu_multiple_choice": "accuracy",
+    "mmmu_open": "accuracy",
 
     # agentic
     "gaia": "mean",
     "gdm_intercode_ctf": "accuracy",
 }
 
+MODEL_SHA_MAP = {
+    # open source models
+    "c4ai-command-r-plus": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",  # TODO: verify for the 08-2024 version
+    "Meta-Llama-3.1-70B-Instruct": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
+    "Mistral-Large-Instruct-2407": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
+    "Qwen2.5-72B-Instruct": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
+
+    # closed source models
+    "claude-3-5-sonnet-20241022": "https://www.anthropic.com/claude/sonnet",
+    "gemini-1.5-flash": "https://deepmind.google/technologies/gemini/flash",  # TODO: points to 2.0, can't find page for 1.5
+    "gemini-1.5-pro": "https://deepmind.google/technologies/gemini/pro",
+    "gpt-4o": "https://openai.com/index/hello-gpt-4o",
+    "gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
+    "o1": "https://openai.com/o1",
+}
+
 
 def combine_eval_results(results_path: str, model_name: str) -> dict:
     results = dict(
@@ -29,7 +47,7 @@ def combine_eval_results(results_path: str, model_name: str) -> dict:
             "config": {
                 "model_name": model_name,
                 # dummy keys
-                "model_sha": model_name,
+                "model_sha": MODEL_SHA_MAP[model_name],
                 "model_dtype": "torch.float16",
             },
             "results": {},
@@ -38,21 +56,26 @@ def combine_eval_results(results_path: str, model_name: str) -> dict:
     for file in os.listdir(os.path.join(results_path, model_name)):
         if file.endswith(".json"):
             with open(os.path.join(results_path, model_name, file), "r") as f:
-                result = json.load(f)
-                task_name = result["eval"]["task"].split("/")[-1]
-                if task_name == "math":
-                    metrics = [elm for elm in result["results"]["scores"] if elm["name"] == "expression_equivalance"][0]["metrics"]  # TODO: change scorer if required
-                else:
-                    metrics = result["results"]["scores"][0]["metrics"]
-                metric_name = metrics[METRIC_NAME[task_name]]["name"]
-                metric_value = metrics[METRIC_NAME[task_name]]["value"]
-                results["results"].update(
-                    {
-                        task_name: {
-                            metric_name: metric_value
+                try:
+                    result = json.load(f)
+                    task_name = result["eval"]["task"].split("/")[-1]
+                    if task_name == "math":
+                        metrics = [elm for elm in result["results"]["scores"] if elm["name"] == "expression_equivalance"][0]["metrics"]  # TODO: change scorer if required
+                    else:
+                        metrics = result["results"]["scores"][0]["metrics"]
+                    metric_name = metrics[METRIC_NAME[task_name]]["name"]
+                    metric_value = metrics[METRIC_NAME[task_name]]["value"]
+                    results["results"].update(
+                        {
+                            task_name: {
+                                metric_name: metric_value
+                            }
+                        }
                         }
-                    }
-                )
+                    )
+                except KeyError as e:
+                    print(f"KeyError: {e}")
+                    print(model_name)
+                    print(file)
     return results
 
 
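A quick way to sanity-check the refactor is to run the helper against a mocked eval log. The sketch below is illustrative only: the results/ directory layout, the sample log contents, and the scorer name "choice" are assumptions, not part of this commit; only combine_eval_results, MODEL_SHA_MAP, and the fields the parser reads come from the diff above.

# Hypothetical smoke test for the refactored helper; filenames and the
# sample log below are illustrative assumptions, not part of this commit.
import json
import os

from refactor_eval_results import MODEL_SHA_MAP, combine_eval_results

results_path = "results"  # assumed layout: results/<model_name>/<task>.json
model_name = "gpt-4o"     # any key of MODEL_SHA_MAP

# Minimal fake Inspect-style eval log covering only the fields the parser reads.
sample_log = {
    "eval": {"task": "inspect_evals/mmlu"},
    "results": {
        "scores": [
            {
                "name": "choice",  # assumed scorer name; only scores[0] is read for non-math tasks
                "metrics": {"accuracy": {"name": "accuracy", "value": 0.87}},
            }
        ]
    },
}

os.makedirs(os.path.join(results_path, model_name), exist_ok=True)
with open(os.path.join(results_path, model_name, "mmlu.json"), "w") as f:
    json.dump(sample_log, f)

combined = combine_eval_results(results_path, model_name)
print(combined["config"]["model_sha"])  # https://openai.com/index/hello-gpt-4o
print(combined["results"])              # {'mmlu': {'accuracy': 0.87}}

Note that the new try/except only guards per-file parsing: a model_name missing from MODEL_SHA_MAP still raises KeyError when the config dict is built, since that lookup happens before the loop.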