Spaces:
Runtime error
Runtime error
Commit
·
9b8ac8f
1
Parent(s):
bcda822
add app file
Browse files
- app.py +3 -9
- experiments.json +94 -5
app.py
CHANGED
@@ -5,19 +5,13 @@ import gradio as gr
|
|
5 |
from functools import lru_cache
|
6 |
|
7 |
# Load models and experiments
|
8 |
-
MODELS = [
|
9 |
-
"deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
|
10 |
-
"o3-mini-2025-01-31",
|
11 |
-
"meta-llama/Llama-3.3-70B-Instruct",
|
12 |
-
"moonshotai/Moonlight-16B-A3B-Instruct",
|
13 |
-
"gpt-4o",
|
14 |
-
"claude-3-7-sonnet-20250219",
|
15 |
-
"openai/gpt-4.5-preview-2025-02-27"
|
16 |
-
]
|
17 |
|
18 |
with open("experiments.json") as f:
|
19 |
experiments = json.load(f)
|
20 |
|
|
|
|
|
|
|
21 |
@lru_cache
|
22 |
def load_details_and_results(model, benchmark, experiment_tag):
|
23 |
def worker(example):
|
|
|
5 |
from functools import lru_cache
|
6 |
|
7 |
# Load models and experiments
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
with open("experiments.json") as f:
|
10 |
experiments = json.load(f)
|
11 |
|
12 |
+
MODELS = list(experiments.keys())
|
13 |
+
MODELS = [m for m in MODELS if m != "claude-3-7-sonnet-20250219"]
|
14 |
+
|
15 |
@lru_cache
|
16 |
def load_details_and_results(model, benchmark, experiment_tag):
|
17 |
def worker(example):
|
experiments.json
CHANGED
@@ -62,7 +62,8 @@
|
|
62 |
"extractive_match"
|
63 |
],
|
64 |
"tags": {
|
65 |
-
"
|
|
|
66 |
}
|
67 |
},
|
68 |
"gpqa_diamond": {
|
@@ -71,7 +72,8 @@
|
|
71 |
"extractive_match"
|
72 |
],
|
73 |
"tags": {
|
74 |
-
"
|
|
|
75 |
}
|
76 |
},
|
77 |
"aime_24": {
|
@@ -80,7 +82,8 @@
|
|
80 |
"extractive_match"
|
81 |
],
|
82 |
"tags": {
|
83 |
-
"
|
|
|
84 |
}
|
85 |
},
|
86 |
"aime_25": {
|
@@ -89,7 +92,8 @@
|
|
89 |
"extractive_match"
|
90 |
],
|
91 |
"tags": {
|
92 |
-
"
|
|
|
93 |
}
|
94 |
},
|
95 |
"ifeval": {
|
@@ -98,7 +102,8 @@
|
|
98 |
"prompt_level_strict_acc"
|
99 |
],
|
100 |
"tags": {
|
101 |
-
"
|
|
|
102 |
}
|
103 |
}
|
104 |
}
|
@@ -416,5 +421,89 @@
|
|
416 |
}
|
417 |
}
|
418 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
419 |
}
|
420 |
}
|
|
|
62 |
"extractive_match"
|
63 |
],
|
64 |
"tags": {
|
65 |
+
"default": "2025-02-25T14-35-15.137825",
|
66 |
+
"thinking": "2025-03-05T10-14-44.802711"
|
67 |
}
|
68 |
},
|
69 |
"gpqa_diamond": {
|
|
|
72 |
"extractive_match"
|
73 |
],
|
74 |
"tags": {
|
75 |
+
"default": "2025-02-25T12-43-49.294245",
|
76 |
+
"thinking": "2025-03-05T15-37-37.180318"
|
77 |
}
|
78 |
},
|
79 |
"aime_24": {
|
|
|
82 |
"extractive_match"
|
83 |
],
|
84 |
"tags": {
|
85 |
+
"default": "2025-02-25T12-37-52.771787",
|
86 |
+
"thinking": "2025-03-05T12-39-13.627801"
|
87 |
}
|
88 |
},
|
89 |
"aime_25": {
|
|
|
92 |
"extractive_match"
|
93 |
],
|
94 |
"tags": {
|
95 |
+
"default": "2025-02-25T12-37-52.771787",
|
96 |
+
"thinking": "2025-03-05T12-39-13.627801"
|
97 |
}
|
98 |
},
|
99 |
"ifeval": {
|
|
|
102 |
"prompt_level_strict_acc"
|
103 |
],
|
104 |
"tags": {
|
105 |
+
"default": "2025-02-25T12-24-45.750753",
|
106 |
+
"thinking": "2025-03-05T15-37-37.180318"
|
107 |
}
|
108 |
}
|
109 |
}
|
|
|
421 |
}
|
422 |
}
|
423 |
}
|
424 |
+
},
|
425 |
+
"openai/deepseek-ai/DeepSeek-R1": {
|
426 |
+
"display_name": "DeepSeek R1",
|
427 |
+
"provider": "deepseek",
|
428 |
+
"open": true,
|
429 |
+
"benchmarks": {
|
430 |
+
"math_500": {
|
431 |
+
"subset": "lighteval|math_500|0",
|
432 |
+
"metrics": ["extractive_match"],
|
433 |
+
"tags": {
|
434 |
+
"latest": "2025-03-04T17-06-33.124766"
|
435 |
+
}
|
436 |
+
},
|
437 |
+
"gpqa_diamond": {
|
438 |
+
"subset": "lighteval|gpqa:diamond|0",
|
439 |
+
"metrics": ["extractive_match"],
|
440 |
+
"tags": {
|
441 |
+
"latest": "2025-03-04T17-06-33.124766"
|
442 |
+
}
|
443 |
+
},
|
444 |
+
"aime_24": {
|
445 |
+
"subset": "lighteval|aime24|0",
|
446 |
+
"metrics": ["extractive_match"],
|
447 |
+
"tags": {
|
448 |
+
"latest": "2025-03-04T14-52-35.594174"
|
449 |
+
}
|
450 |
+
},
|
451 |
+
"aime_25": {
|
452 |
+
"subset": "lighteval|aime25|0",
|
453 |
+
"metrics": ["extractive_match"],
|
454 |
+
"tags": {
|
455 |
+
"latest": "2025-03-04T14-25-05.009799"
|
456 |
+
}
|
457 |
+
},
|
458 |
+
"ifeval": {
|
459 |
+
"subset": "extended|ifeval|0",
|
460 |
+
"metrics": ["prompt_level_strict_acc"],
|
461 |
+
"tags": {
|
462 |
+
"latest": "2025-03-04T15-24-42.488745"
|
463 |
+
}
|
464 |
+
}
|
465 |
+
}
|
466 |
+
},
|
467 |
+
"Qwen/QwQ-32B": {
|
468 |
+
"display_name": "QwQ 32B",
|
469 |
+
"provider": "Qwen",
|
470 |
+
"open": true,
|
471 |
+
"benchmarks": {
|
472 |
+
"math_500": {
|
473 |
+
"subset": "lighteval|math_500|0",
|
474 |
+
"metrics": ["extractive_match"],
|
475 |
+
"tags": {
|
476 |
+
"latest": "2025-03-07T11-04-40.089127"
|
477 |
+
}
|
478 |
+
},
|
479 |
+
"gpqa_diamond": {
|
480 |
+
"subset": "lighteval|gpqa:diamond|0",
|
481 |
+
"metrics": ["extractive_match"],
|
482 |
+
"tags": {
|
483 |
+
"latest": "2025-03-07T11-04-40.089127"
|
484 |
+
}
|
485 |
+
},
|
486 |
+
"aime_24": {
|
487 |
+
"subset": "lighteval|aime24|0",
|
488 |
+
"metrics": ["extractive_match"],
|
489 |
+
"tags": {
|
490 |
+
"latest": "2025-03-10T10-36-07.886033"
|
491 |
+
}
|
492 |
+
},
|
493 |
+
"aime_25": {
|
494 |
+
"subset": "lighteval|aime25|0",
|
495 |
+
"metrics": ["extractive_match"],
|
496 |
+
"tags": {
|
497 |
+
"latest": "2025-03-10T10-36-07.886033"
|
498 |
+
}
|
499 |
+
},
|
500 |
+
"ifeval": {
|
501 |
+
"subset": "extended|ifeval|0",
|
502 |
+
"metrics": ["prompt_level_strict_acc"],
|
503 |
+
"tags": {
|
504 |
+
"latest": "2025-03-07T11-04-40.089127"
|
505 |
+
}
|
506 |
+
}
|
507 |
+
}
|
508 |
}
|
509 |
}
|