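"""Combine per-model eval logs into leaderboard results files and dummy request files."""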
import json
import os
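
# Primary metric to read from each benchmark's eval log.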
METRIC_NAME = {
# single-turn
"arc_easy": "accuracy",
"arc_challenge": "accuracy",
"gpqa_diamond": "accuracy",
"drop": "mean",
"winogrande": "accuracy",
"gsm8k": "accuracy",
"hellaswag": "accuracy",
"humaneval": "mean",
"ifeval": "final_acc",
"math": "accuracy",
"mmlu": "accuracy",
"mmlu_pro": "accuracy",
"mmmu_multiple_choice": "accuracy",
"mmmu_open": "accuracy",
# agentic
"gaia": "accuracy",
"gdm_intercode_ctf": "accuracy",
"gdm_in_house_ctf": "accuracy",
"agentharm": "avg_score",
"agentharm_benign": "avg_score",
"swe_bench": "mean",
}
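
# Reference page for each model; stands in for a real model SHA in the results config.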
MODEL_SHA_MAP = {
# open source models
"c4ai-command-r-plus": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
"Meta-Llama-3.1-70B-Instruct": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
"Mistral-Large-Instruct-2407": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
"Qwen2.5-72B-Instruct": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
# closed source models
"claude-3-5-sonnet-20241022": "https://www.anthropic.com/claude/sonnet",
"gemini-1.5-flash": "https://deepmind.google/technologies/gemini/flash", # TODO: points to 2.0, can't find page for 1.5
"gemini-1.5-pro": "https://deepmind.google/technologies/gemini/pro",
"gpt-4o": "https://openai.com/index/hello-gpt-4o",
"gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
"o1": "https://openai.com/o1",
}
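
# Display version string for each model.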
MODEL_VERSION_MAP = {
# open source models
"c4ai-command-r-plus": "c4ai-command-r-plus",
"Meta-Llama-3.1-70B-Instruct": "Llama-3.1-70B-Instruct",
"Mistral-Large-Instruct-2407": "Mistral-Large-Instruct-2407",
"Qwen2.5-72B-Instruct": "Qwen2.5-72B-Instruct",
# closed source models
"claude-3-5-sonnet-20241022": "Claude-3.5-Sonnet-20241022",
"gemini-1.5-flash": "Gemini-1.5-Flash",
"gemini-1.5-pro": "Gemini-1.5-Pro-002",
"gpt-4o": "GPT-4o-20240806",
"gpt-4o-mini": "GPT-4o-mini-20240718",
"o1": "o1-20241217",
}
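
# Model names as they appear in the agentic benchmarking log directories.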
AGENTIC_LOG_MODEL_NAME_MAP = {
"claude-3-5-sonnet-20241022": "claude-3-5-sonnet-20241022",
"gemini-1.5-pro": "gemini-1.5-pro-002",
"gpt-4o": "gpt-4o-2024-08-06",
"o1": "o1-2024-12-17",
}
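
# Agentic benchmark task directories to scan for logs.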
AGENTIC_TASKS = ["gaia", "gdm-intercode-ctf", "gdm-in-house-ctf", "agentharm", "swe-bench"]


def combine_eval_results(results_path: str, model_name: str, eval_type: str) -> dict:
    """Collect per-task metrics from a model's eval logs into a single results dict."""
    results = {
        "config": {
            "model_name": model_name,
            # dummy keys
            "model_sha": MODEL_SHA_MAP[model_name],
            "model_dtype": "torch.float16",
        },
        "results": {},
    }
    if eval_type == "base":
for file in os.listdir(os.path.join(results_path, model_name)):
if file.endswith(".json"):
with open(os.path.join(results_path, model_name, file), "r") as f:
try:
result = json.load(f)
task_name = result["eval"]["task"].split("/")[-1]
                        if task_name == "math":
                            # TODO: change scorer if required
                            metrics = [
                                elm
                                for elm in result["results"]["scores"]
                                if elm["name"] == "expression_equivalance"
                            ][0]["metrics"]
else:
metrics = result["results"]["scores"][0]["metrics"]
metric_name = metrics[METRIC_NAME[task_name]]["name"]
metric_value = metrics[METRIC_NAME[task_name]]["value"]
results["results"].update(
{
task_name: {
metric_name: metric_value
}
}
)
                    except KeyError as e:
                        print(f"KeyError: {e} ({model_name}, {file})")
    elif eval_type == "agentic":
        model_name = AGENTIC_LOG_MODEL_NAME_MAP[model_name]  # change name based on log file structure
results_path = os.path.join(results_path, model_name)
for task in AGENTIC_TASKS:
for file in os.listdir(os.path.join(results_path, task)):
if file.endswith(".json"):
with open(os.path.join(results_path, task, file), "r") as f:
try:
result = json.load(f)
task_name = result["eval"]["task"].split("/")[-1]
metrics = result["results"]["scores"][0]["metrics"]
metric_name = metrics[METRIC_NAME[task_name]]["name"].split("/")[-1]
metric_value = metrics[METRIC_NAME[task_name]]["value"]
results["results"].update(
{
task_name: {
metric_name: metric_value
}
}
)
                        except KeyError as e:
                            print(f"KeyError: {e} ({model_name}, {file})")
return results


def main():
    CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
base_bm_input_path = "./base_benchmarking_logs"
agentic_bm_input_path = "/fs01/projects/aieng/public/inspect_evals/agentic_benchmarking_runs"
os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
os.makedirs(EVAL_REQUESTS_PATH, exist_ok=True)
for model_name in os.listdir(base_bm_input_path):
if os.path.isdir(os.path.join(base_bm_input_path, model_name)):
results = combine_eval_results(base_bm_input_path, model_name, "base")
            # TMP: Add missing benchmarks to the results
            for task_name, metric_name in METRIC_NAME.items():
                if task_name not in results["results"]:
                    results["results"][task_name] = {metric_name: None}
if os.path.isdir(os.path.join(agentic_bm_input_path, AGENTIC_LOG_MODEL_NAME_MAP.get(model_name, "NA"))):
agentic_bm_results = combine_eval_results(agentic_bm_input_path, model_name, "agentic")
results["results"].update(agentic_bm_results["results"])
with open(os.path.join(EVAL_RESULTS_PATH, f"{model_name}.json"), "w") as f:
json.dump(results, f, indent=4)
# Create dummy requests file
requests = {
"model": model_name,
"model_sha": MODEL_SHA_MAP[model_name],
"model_version": MODEL_VERSION_MAP[model_name],
"base_model": "",
"revision": "main",
"private": False,
"precision": "float16",
"weight_type": "Original",
"status": "FINISHED",
"submitted_time": "",
"model_type": "pretrained",
"likes": 0,
"params": 0,
"license": "custom",
}
with open(os.path.join(EVAL_REQUESTS_PATH, f"{model_name}.json"), "w") as f:
json.dump(requests, f, indent=4)


if __name__ == "__main__":
main()