File size: 7,459 Bytes
8b91831
 
 
 
 
eb538cb
8b91831
 
 
 
 
 
 
 
 
 
 
 
5438c77
 
8b91831
 
2718fde
8b91831
2718fde
1289818
 
 
8b91831
 
5438c77
 
954d8ee
5438c77
 
 
 
 
 
 
 
 
 
 
 
 
954d8ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2718fde
 
 
 
 
 
 
1289818
8b91831
2718fde
 
8b91831
 
 
 
 
5438c77
8b91831
 
 
 
 
2718fde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5438c77
2718fde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1289818
2718fde
 
 
 
 
 
 
 
 
 
 
 
 
8b91831
 
 
 
 
 
 
 
 
 
2718fde
8b91831
 
 
 
2718fde
8b91831
2718fde
 
e004342
 
18638a9
2718fde
 
 
8b91831
 
 
 
 
 
 
9c55d6d
954d8ee
8b91831
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import json
import os


METRIC_NAME = {
    # single-turn
    "arc_easy": "accuracy",
    "arc_challenge": "accuracy",
    "gpqa_diamond": "accuracy",
    "drop": "mean",
    "winogrande": "accuracy",
    "gsm8k": "accuracy",
    "hellaswag": "accuracy",
    "humaneval": "mean",
    "ifeval": "final_acc",
    "math": "accuracy",
    "mmlu": "accuracy",
    "mmlu_pro": "accuracy",
    "mmmu_multiple_choice": "accuracy",
    "mmmu_open": "accuracy",

    # agentic
    "gaia": "accuracy",
    "gdm_intercode_ctf": "accuracy",
    "gdm_in_house_ctf": "accuracy",
    "agentharm": "avg_score",
    "agentharm_benign": "avg_score",
    "swe_bench": "mean",
}

MODEL_SHA_MAP = {
    # open source models
    "c4ai-command-r-plus": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
    "Meta-Llama-3.1-70B-Instruct": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
    "Mistral-Large-Instruct-2407": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
    "Qwen2.5-72B-Instruct": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",

    # closed source models
    "claude-3-5-sonnet-20241022": "https://www.anthropic.com/claude/sonnet",
    "gemini-1.5-flash": "https://deepmind.google/technologies/gemini/flash", # TODO: points to 2.0, can't find page for 1.5
    "gemini-1.5-pro": "https://deepmind.google/technologies/gemini/pro",
    "gpt-4o": "https://openai.com/index/hello-gpt-4o",
    "gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
    "o1": "https://openai.com/o1",
}

MODEL_VERSION_MAP = {
    # open source models
    "c4ai-command-r-plus": "c4ai-command-r-plus",
    "Meta-Llama-3.1-70B-Instruct": "Llama-3.1-70B-Instruct",
    "Mistral-Large-Instruct-2407": "Mistral-Large-Instruct-2407",
    "Qwen2.5-72B-Instruct": "Qwen2.5-72B-Instruct",

    # closed source models
    "claude-3-5-sonnet-20241022": "Claude-3.5-Sonnet-20241022",
    "gemini-1.5-flash": "Gemini-1.5-Flash",
    "gemini-1.5-pro": "Gemini-1.5-Pro-002",
    "gpt-4o": "GPT-4o-20240806",
    "gpt-4o-mini": "GPT-4o-mini-20240718",
    "o1": "o1-20241217",
}

AGENTIC_LOG_MODEL_NAME_MAP = {
    "claude-3-5-sonnet-20241022": "claude-3-5-sonnet-20241022",
    "gemini-1.5-pro": "gemini-1.5-pro-002",
    "gpt-4o": "gpt-4o-2024-08-06",
    "o1": "o1-2024-12-17",
}

AGENTIC_TASKS = ["gaia", "gdm-intercode-ctf", "gdm-in-house-ctf", "agentharm", "swe-bench"]


def combine_eval_results(results_path: str, model_name: str, type: str,) -> dict:
    results = dict(
        {
            "config": {
                "model_name": model_name,
                # dummy keys
                "model_sha": MODEL_SHA_MAP[model_name],
                "model_dtype": "torch.float16",
            },
            "results": {},
        }
    )

    if type == "base":
        for file in os.listdir(os.path.join(results_path, model_name)):
            if file.endswith(".json"):
                with open(os.path.join(results_path, model_name, file), "r") as f:
                    try:
                        result = json.load(f)
                        task_name = result["eval"]["task"].split("/")[-1]
                        if task_name == "math":
                            metrics = [elm for elm in result["results"]["scores"] if elm["name"] == "expression_equivalance"][0]["metrics"] # TODO: change scorer if required
                        else:
                            metrics = result["results"]["scores"][0]["metrics"]
                        metric_name = metrics[METRIC_NAME[task_name]]["name"]
                        metric_value = metrics[METRIC_NAME[task_name]]["value"]
                        results["results"].update(
                            {
                                task_name: {
                                    metric_name: metric_value
                                }
                            }
                        )
                    except KeyError as e:
                        print(f"KeyError: {e}")
                        print(model_name)
                        print(file)

    elif type == "agentic":
        model_name = AGENTIC_LOG_MODEL_NAME_MAP[model_name] # change name based on log file structure
        results_path = os.path.join(results_path, model_name)
        for task in AGENTIC_TASKS:
            for file in os.listdir(os.path.join(results_path, task)):
                if file.endswith(".json"):
                    with open(os.path.join(results_path, task, file), "r") as f:
                        try:
                            result = json.load(f)
                            task_name = result["eval"]["task"].split("/")[-1]
                            metrics = result["results"]["scores"][0]["metrics"]
                            metric_name = metrics[METRIC_NAME[task_name]]["name"].split("/")[-1]
                            metric_value = metrics[METRIC_NAME[task_name]]["value"]
                            results["results"].update(
                                {
                                    task_name: {
                                        metric_name: metric_value
                                    }
                                }
                            )
                        except KeyError as e:
                            print(f"KeyError: {e}")
                            print(model_name)
                            print(file)

    return results


def main():

    CACHE_PATH=os.getenv("HF_HOME", ".")
    EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
    EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")

    base_bm_input_path = "./base_benchmarking_logs"
    agentic_bm_input_path = "/fs01/projects/aieng/public/inspect_evals/agentic_benchmarking_runs"
    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
    os.makedirs(EVAL_REQUESTS_PATH, exist_ok=True)

    for model_name in os.listdir(base_bm_input_path):

        if os.path.isdir(os.path.join(base_bm_input_path, model_name)):
            results = combine_eval_results(base_bm_input_path, model_name, "base")
        # TMP: Add missing benchmarks to the results
        for metric in METRIC_NAME.items():
            if metric[0] not in results["results"]:
                results["results"].update({metric[0]: {metric[1]: None}})

        if os.path.isdir(os.path.join(agentic_bm_input_path, AGENTIC_LOG_MODEL_NAME_MAP.get(model_name, "NA"))):
            agentic_bm_results = combine_eval_results(agentic_bm_input_path, model_name, "agentic")
            results["results"].update(agentic_bm_results["results"])
        with open(os.path.join(EVAL_RESULTS_PATH, f"{model_name}.json"), "w") as f:
            json.dump(results, f, indent=4)        

        # Create dummy requests file
        requests = {
            "model": model_name,
            "model_sha": MODEL_SHA_MAP[model_name],
            "model_version": MODEL_VERSION_MAP[model_name],
            "base_model": "",
            "revision": "main",
            "private": False,
            "precision": "float16",
            "weight_type": "Original",
            "status": "FINISHED",
            "submitted_time": "",
            "model_type": "pretrained",
            "likes": 0,
            "params": 0,
            "license": "custom",
        }
        with open(os.path.join(EVAL_REQUESTS_PATH, f"{model_name}.json"), "w") as f:
            json.dump(requests, f, indent=4)


if __name__ == "__main__":
    main()