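# Helpers for the YourBench leaderboard Space: generate a custom lighteval task
# from an evaluation dataset, then run it against every configured model through
# Hugging Face inference providers.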
import os
import subprocess
import asyncio
from pathlib import Path

# INIT_MODELS holds the (model_name, provider) pairs evaluated by the leaderboard
from yourbench_space.leaderboard_space.env import INIT_MODELS


# Detect a Hugging Face Spaces deployment; Spaces mount persistent storage at /data
ON_SPACES = os.environ.get("system") == "spaces"
OUTPUT_DIR = "/data" if ON_SPACES else "."


def create_eval_file(eval_ds_name: str):
    """Generate the custom lighteval task file for the given evaluation dataset."""
    task_name = eval_ds_name.replace("/", "_")
    template_path = Path("/home/user/app/yourbench_space/lighteval_task/yourbench_task.py")
    # check=True surfaces a failed "lighteval tasks create" instead of continuing silently
    subprocess.run(["lighteval", "tasks", "create", str(template_path), task_name, eval_ds_name], check=True)


async def run_process(args: list) -> dict:
    """Run a command asynchronously and return its pid and decoded output."""
    process = await asyncio.create_subprocess_exec(
        *args, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
    )
    # communicate() drains stdout/stderr while waiting, so a chatty child cannot
    # fill the pipe buffers and deadlock; wait_for still enforces the 3-minute cap
    stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=180)
    return {"pid": process.pid, "stdout": stdout.decode(), "stderr": stderr.decode()}


async def run_evaluations(eval_ds_name: str, org: str) -> str:
    """Evaluate every configured model on the custom task; return an overall status."""
    task_name = eval_ds_name.replace("/", "_")
    tasks = []
    for model_name, provider in INIT_MODELS:
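        # Task spec "custom|<task>|0|0" = suite|task|few-shot count|truncate flag;
        # one lighteval run per (model, provider) pair from the leaderboard config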
        args = [
            "lighteval",
            "endpoint",
            "inference-providers",
            f"model={model_name},provider={provider}",
            f"custom|{task_name}|0|0",
            "--custom-tasks",
            f"custom_{task_name}_task.py",
            "--max-samples",
            "30",
            "--output-dir",
            f"{OUTPUT_DIR}",
            "--save-details",
            "--results-org",
            org,
            "--push-to-hub",
        ]
        tasks.append(run_process(args))
    # return_exceptions=True collects failures instead of cancelling the remaining runs
    results = await asyncio.gather(*tasks, return_exceptions=True)
    if all(not isinstance(result, Exception) for result in results):
        return "✅"
    return "At least one model failed"