eval-leaderboard

Running

File size: 7,459 Bytes

import json
import os


METRIC_NAME = {
    # single-turn
    "arc_easy": "accuracy",
    "arc_challenge": "accuracy",
    "gpqa_diamond": "accuracy",
    "drop": "mean",
    "winogrande": "accuracy",
    "gsm8k": "accuracy",
    "hellaswag": "accuracy",
    "humaneval": "mean",
    "ifeval": "final_acc",
    "math": "accuracy",
    "mmlu": "accuracy",
    "mmlu_pro": "accuracy",
    "mmmu_multiple_choice": "accuracy",
    "mmmu_open": "accuracy",

    # agentic
    "gaia": "accuracy",
    "gdm_intercode_ctf": "accuracy",
    "gdm_in_house_ctf": "accuracy",
    "agentharm": "avg_score",
    "agentharm_benign": "avg_score",
    "swe_bench": "mean",
}

MODEL_SHA_MAP = {
    # open source models
    "c4ai-command-r-plus": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
    "Meta-Llama-3.1-70B-Instruct": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
    "Mistral-Large-Instruct-2407": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
    "Qwen2.5-72B-Instruct": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",

    # closed source models
    "claude-3-5-sonnet-20241022": "https://www.anthropic.com/claude/sonnet",
    "gemini-1.5-flash": "https://deepmind.google/technologies/gemini/flash", # TODO: points to 2.0, can't find page for 1.5
    "gemini-1.5-pro": "https://deepmind.google/technologies/gemini/pro",
    "gpt-4o": "https://openai.com/index/hello-gpt-4o",
    "gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
    "o1": "https://openai.com/o1",
}

MODEL_VERSION_MAP = {
    # open source models
    "c4ai-command-r-plus": "c4ai-command-r-plus",
    "Meta-Llama-3.1-70B-Instruct": "Llama-3.1-70B-Instruct",
    "Mistral-Large-Instruct-2407": "Mistral-Large-Instruct-2407",
    "Qwen2.5-72B-Instruct": "Qwen2.5-72B-Instruct",

    # closed source models
    "claude-3-5-sonnet-20241022": "Claude-3.5-Sonnet-20241022",
    "gemini-1.5-flash": "Gemini-1.5-Flash",
    "gemini-1.5-pro": "Gemini-1.5-Pro-002",
    "gpt-4o": "GPT-4o-20240806",
    "gpt-4o-mini": "GPT-4o-mini-20240718",
    "o1": "o1-20241217",
}

AGENTIC_LOG_MODEL_NAME_MAP = {
    "claude-3-5-sonnet-20241022": "claude-3-5-sonnet-20241022",
    "gemini-1.5-pro": "gemini-1.5-pro-002",
    "gpt-4o": "gpt-4o-2024-08-06",
    "o1": "o1-2024-12-17",
}

AGENTIC_TASKS = ["gaia", "gdm-intercode-ctf", "gdm-in-house-ctf", "agentharm", "swe-bench"]


def combine_eval_results(results_path: str, model_name: str, type: str,) -> dict:
    results = dict(
        {
            "config": {
                "model_name": model_name,
                # dummy keys
                "model_sha": MODEL_SHA_MAP[model_name],
                "model_dtype": "torch.float16",
            },
            "results": {},
        }
    )

    if type == "base":
        for file in os.listdir(os.path.join(results_path, model_name)):
            if file.endswith(".json"):
                with open(os.path.join(results_path, model_name, file), "r") as f:
                    try:
                        result = json.load(f)
                        task_name = result["eval"]["task"].split("/")[-1]
                        if task_name == "math":
                            metrics = [elm for elm in result["results"]["scores"] if elm["name"] == "expression_equivalance"][0]["metrics"] # TODO: change scorer if required
                        else:
                            metrics = result["results"]["scores"][0]["metrics"]
                        metric_name = metrics[METRIC_NAME[task_name]]["name"]
                        metric_value = metrics[METRIC_NAME[task_name]]["value"]
                        results["results"].update(
                            {
                                task_name: {
                                    metric_name: metric_value
                                }
                            }
                        )
                    except KeyError as e:
                        print(f"KeyError: {e}")
                        print(model_name)
                        print(file)

    elif type == "agentic":
        model_name = AGENTIC_LOG_MODEL_NAME_MAP[model_name] # change name based on log file structure
        results_path = os.path.join(results_path, model_name)
        for task in AGENTIC_TASKS:
            for file in os.listdir(os.path.join(results_path, task)):
                if file.endswith(".json"):
                    with open(os.path.join(results_path, task, file), "r") as f:
                        try:
                            result = json.load(f)
                            task_name = result["eval"]["task"].split("/")[-1]
                            metrics = result["results"]["scores"][0]["metrics"]
                            metric_name = metrics[METRIC_NAME[task_name]]["name"].split("/")[-1]
                            metric_value = metrics[METRIC_NAME[task_name]]["value"]
                            results["results"].update(
                                {
                                    task_name: {
                                        metric_name: metric_value
                                    }
                                }
                            )
                        except KeyError as e:
                            print(f"KeyError: {e}")
                            print(model_name)
                            print(file)

    return results


def main():

    CACHE_PATH=os.getenv("HF_HOME", ".")
    EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
    EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")

    base_bm_input_path = "./base_benchmarking_logs"
    agentic_bm_input_path = "/fs01/projects/aieng/public/inspect_evals/agentic_benchmarking_runs"
    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
    os.makedirs(EVAL_REQUESTS_PATH, exist_ok=True)

    for model_name in os.listdir(base_bm_input_path):

        if os.path.isdir(os.path.join(base_bm_input_path, model_name)):
            results = combine_eval_results(base_bm_input_path, model_name, "base")
        # TMP: Add missing benchmarks to the results
        for metric in METRIC_NAME.items():
            if metric[0] not in results["results"]:
                results["results"].update({metric[0]: {metric[1]: None}})

        if os.path.isdir(os.path.join(agentic_bm_input_path, AGENTIC_LOG_MODEL_NAME_MAP.get(model_name, "NA"))):
            agentic_bm_results = combine_eval_results(agentic_bm_input_path, model_name, "agentic")
            results["results"].update(agentic_bm_results["results"])
        with open(os.path.join(EVAL_RESULTS_PATH, f"{model_name}.json"), "w") as f:
            json.dump(results, f, indent=4)        

        # Create dummy requests file
        requests = {
            "model": model_name,
            "model_sha": MODEL_SHA_MAP[model_name],
            "model_version": MODEL_VERSION_MAP[model_name],
            "base_model": "",
            "revision": "main",
            "private": False,
            "precision": "float16",
            "weight_type": "Original",
            "status": "FINISHED",
            "submitted_time": "",
            "model_type": "pretrained",
            "likes": 0,
            "params": 0,
            "license": "custom",
        }
        with open(os.path.join(EVAL_REQUESTS_PATH, f"{model_name}.json"), "w") as f:
            json.dump(requests, f, indent=4)


if __name__ == "__main__":
    main()