Commit c5bf87e (parent: 9b8ac8f)

init

Files changed:
- app.py: +26 -8
- experiments.json: +87 -3
app.py CHANGED

@@ -2,7 +2,6 @@ from datasets import load_dataset
 from collections import defaultdict
 import json
 import gradio as gr
-from functools import lru_cache
 
 # Load models and experiments
 
@@ -10,9 +9,8 @@ with open("experiments.json") as f:
     experiments = json.load(f)
 
 MODELS = list(experiments.keys())
-MODELS = [m for m in MODELS if m
+MODELS = [m for m in MODELS if m.startswith("google/gemma-3")]
 
-@lru_cache
 def load_details_and_results(model, benchmark, experiment_tag):
     def worker(example):
         example["predictions"] = example["predictions"]
@@ -20,7 +18,7 @@ def load_details_and_results(model, benchmark, experiment_tag):
         example["metrics"] = example["metrics"]
         return example
 
-    repo = f"
+    repo = f"OpenEvals/details_{model.replace('/', '__')}_private"
     subset = experiments[model]["benchmarks"][benchmark]["subset"].replace("|", "_").replace(":", "_")
     split = experiments[model]["benchmarks"][benchmark]["tags"][experiment_tag].replace("-", "_")
 
@@ -49,8 +47,14 @@ def display_model_comparison(selected_models, benchmark, example_index):
     if not selected_models:
         return "Please select at least one model to compare."
 
+    # Filter out models that don't have the selected benchmark
+    available_models = [model for model in selected_models if benchmark in experiment_details[model]]
+
+    if not available_models:
+        return f"No models have results for benchmark: {benchmark}"
+
     outputs = []
-    for model in selected_models:
+    for model in available_models:  # Changed from selected_models to available_models
         try:
             example = experiment_details[model][benchmark][example_index]
             outputs.append({
@@ -111,7 +115,9 @@ def display_model_comparison(selected_models, benchmark, example_index):
                 html_output += "<div style='margin-bottom: 10px;'>\n"
                 html_output += f"<strong>{role}:</strong>\n"
                 html_output += "<div style='overflow-x: auto;'>\n"
-                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{msg['content']}</code></pre>\n"
+                # Escape HTML in content
+                content = msg['content'].replace('<', '&lt;').replace('>', '&gt;')
+                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{content}</code></pre>\n"
                 html_output += "</div>\n"
                 html_output += "</div>\n"
             else:
@@ -123,8 +129,13 @@ def display_model_comparison(selected_models, benchmark, example_index):
         else:
             html_output += "<div style='overflow-x: auto;'>\n"
             if isinstance(prompt_text, dict) and 'content' in prompt_text:
-                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{prompt_text['content']}</code></pre>\n"
+                # Escape HTML in content
+                content = prompt_text['content'].replace('<', '&lt;').replace('>', '&gt;')
+                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{content}</code></pre>\n"
             else:
+                # Escape HTML if prompt_text is a string
+                if isinstance(prompt_text, str):
+                    prompt_text = prompt_text.replace('<', '&lt;').replace('>', '&gt;')
                 html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{prompt_text}</code></pre>\n"
             html_output += "</div>\n"
 
@@ -140,7 +151,9 @@ def display_model_comparison(selected_models, benchmark, example_index):
             html_output += "</summary>\n"
             html_output += "<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n"
             html_output += "<div style='overflow-x: auto;'>\n"
-            html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'><code>{output['Prediction']}</code></pre>\n"
+            # Escape HTML in prediction
+            prediction = output['Prediction'].replace('<', '&lt;').replace('>', '&gt;')
+            html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'><code>{prediction}</code></pre>\n"
             html_output += "</div>\n"
             html_output += "</div>\n"
             html_output += "</details>\n"
@@ -156,6 +169,11 @@ available_benchmarks = list(set(
     for benchmark in experiment_details[model].keys()
 ))
 
+# Update the Gradio interface to dynamically filter models based on benchmark
+def update_model_choices(benchmark):
+    available_models = [model for model in MODELS if benchmark in experiment_details[model]]
+    return gr.Dropdown(choices=sorted(available_models), value=sorted(available_models))
+
 # Create the Gradio interface
 demo = gr.Interface(
     fn=display_model_comparison,
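Note on the escaping added in this commit: replacing only '<' and '>' leaves '&' unescaped, so literal entity text in a prediction can still render oddly. A minimal stdlib alternative (a sketch, not what this commit uses) would be:

import html

# html.escape also handles '&'; quote=False mirrors the commit's behavior
# of leaving quote characters alone.
content = html.escape(msg['content'], quote=False)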
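The update_model_choices helper added at the end of app.py is not attached to any event in the hunks shown, and gr.Interface has no built-in dropdown-change hook. A sketch of how it could be wired up with gr.Blocks instead (the component names benchmark_dd and models_dd are illustrative assumptions, not from the commit):

with gr.Blocks() as demo:
    benchmark_dd = gr.Dropdown(choices=sorted(available_benchmarks), label="Benchmark")
    models_dd = gr.Dropdown(choices=sorted(MODELS), multiselect=True, label="Models")
    example_idx = gr.Number(value=0, precision=0, label="Example index")
    output_html = gr.HTML()

    # Re-filter the model list whenever the selected benchmark changes.
    benchmark_dd.change(update_model_choices, inputs=benchmark_dd, outputs=models_dd)

    gr.Button("Compare").click(
        display_model_comparison,
        inputs=[models_dd, benchmark_dd, example_idx],
        outputs=output_html,
    )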
experiments.json CHANGED

@@ -3,6 +3,7 @@
     "display_name": "gpt 4o",
     "provider": "openai",
     "open": false,
+    "size": "?B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -55,6 +56,7 @@
     "display_name": "Claude 3.7 Sonnet",
     "provider": "anthropic",
     "open": false,
+    "size": "?B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -112,6 +114,7 @@
     "display_name": "o3-mini",
     "provider": "openai",
     "open": false,
+    "size": "?B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -164,6 +167,7 @@
     "display_name": "Moonlight",
     "provider": "moonshotai",
     "open": true,
+    "size": "16B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -216,6 +220,7 @@
     "display_name": "Llama 3.3 70B",
     "provider": "meta",
     "open": true,
+    "size": "70B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -258,6 +263,7 @@
     "display_name": "DeepSeek Llama 70B",
     "provider": "deepseek",
     "open": true,
+    "size": "70B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -300,6 +306,7 @@
     "display_name": "TinyR1 32B",
     "provider": "qihoo360",
     "open": true,
+    "size": "32B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -342,6 +349,7 @@
     "display_name": "gpt 4.5",
     "provider": "openai",
     "open": false,
+    "size": "?B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -384,6 +392,7 @@
     "display_name": "DeepSeek Qwen 32B",
     "provider": "deepseek",
     "open": true,
+    "size": "32B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -426,6 +435,7 @@
     "display_name": "DeepSeek R1",
     "provider": "deepseek",
     "open": true,
+    "size": "671B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -468,19 +478,20 @@
     "display_name": "QwQ 32B",
     "provider": "Qwen",
     "open": true,
+    "size": "32B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
         "metrics": ["extractive_match"],
         "tags": {
-          "latest": "2025-03-
+          "latest": "2025-03-10T11-47-46.303371"
         }
       },
       "gpqa_diamond": {
         "subset": "lighteval|gpqa:diamond|0",
         "metrics": ["extractive_match"],
         "tags": {
-          "latest": "2025-03-
+          "latest": "2025-03-10T11-47-46.303371"
         }
       },
       "aime_24": {
@@ -501,7 +512,80 @@
         "subset": "extended|ifeval|0",
         "metrics": ["prompt_level_strict_acc"],
         "tags": {
-          "latest": "2025-03-
+          "latest": "2025-03-10T12-21-36.862202"
+        }
+      }
+    }
+  },
+  "google/gemma-3-1b-it": {
+    "display_name": "Gemma 3",
+    "provider": "google",
+    "open": true,
+    "size": "1B",
+    "benchmarks": {
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T14-25-56.178612"
+        }
+      }
+    }
+  },
+  "google/gemma-3-12b-it": {
+    "display_name": "Gemma 3 12B",
+    "provider": "google",
+    "open": true,
+    "size": "12B",
+    "benchmarks": {
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T14-36-23.368081"
+        }
+      }
+    }
+  },
+  "google/gemma-3-27b-it": {
+    "display_name": "Gemma 3 27B",
+    "provider": "google",
+    "open": true,
+    "size": "27B",
+    "benchmarks": {
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T14-41-33.181467"
+        }
+      },
+      "aime_24": {
+        "subset": "lighteval|aime24|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T15-11-34.174477"
+        }
+      },
+      "ifeval": {
+        "subset": "extended|ifeval|0",
+        "metrics": ["prompt_level_strict_acc"],
+        "tags": {
+          "latest": "2025-03-18T15-20-14.979833"
+        }
+      },
+      "gpqa_diamond": {
+        "subset": "lighteval|gpqa:diamond|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T15-20-14.979833"
+        }
+      },
+      "math_500": {
+        "subset": "lighteval|math_500|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T15-20-14.979833"
+        }
+      }
+    }
+  }
 }
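For reference, each entry above feeds load_details_and_results in app.py, which derives the details-dataset coordinates from it. A small sketch using one of the Gemma entries added by this commit (the repo naming is copied from the app.py diff):

import json

with open("experiments.json") as f:
    experiments = json.load(f)

model, benchmark, tag = "google/gemma-3-27b-it", "math_500", "latest"
entry = experiments[model]["benchmarks"][benchmark]

# Same transformations as load_details_and_results in app.py:
repo = f"OpenEvals/details_{model.replace('/', '__')}_private"
subset = entry["subset"].replace("|", "_").replace(":", "_")  # -> lighteval_math_500_0
split = entry["tags"][tag].replace("-", "_")                  # -> 2025_03_18T15_20_14.979833
print(repo, subset, split)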