Linker1907 committed
Commit bcda822 · 1 Parent(s): 114fe52

add app file

Files changed (2):
  1. app.py +197 -0
  2. experiments.json +420 -0
app.py ADDED
@@ -0,0 +1,197 @@
from collections import defaultdict
from functools import lru_cache
import html
import json

import gradio as gr
from datasets import load_dataset

# Models to compare; each needs a matching entry in experiments.json.
MODELS = [
    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
    "o3-mini-2025-01-31",
    "meta-llama/Llama-3.3-70B-Instruct",
    "moonshotai/Moonlight-16B-A3B-Instruct",
    "gpt-4o",
    "claude-3-7-sonnet-20250219",
    "openai/gpt-4.5-preview-2025-02-27",
]

with open("experiments.json") as f:
    experiments = json.load(f)


@lru_cache
def load_details_and_results(model, benchmark, experiment_tag):
    """Load per-example details and aggregate results for one evaluation run."""

    def worker(example):
        # The gold answer is stored as a single-element list; unwrap it.
        example["gold"] = example["gold"][0]
        return example

    repo = f"SaylorTwift/details_{model.replace('/', '__')}_private"
    # Dataset config names use "_" where the subset string has "|" or ":",
    # and split names use "_" where the tag timestamp has "-".
    subset = experiments[model]["benchmarks"][benchmark]["subset"].replace("|", "_").replace(":", "_")
    split = experiments[model]["benchmarks"][benchmark]["tags"][experiment_tag].replace("-", "_")

    details = load_dataset(repo, subset, split=split)
    results = load_dataset(repo, "results", split=split)

    # Aggregate results are stored as a stringified Python dict; eval is
    # tolerable here only because the details repos are trusted.
    results = eval(results[0]["results"])

    columns_to_keep = ["full_prompt", "gold", "metrics", "predictions"]
    details = details.select_columns(columns_to_keep)
    details = details.map(worker)

    return details, results


# Preload every (model, benchmark) pair, keyed by subset string
# (e.g. "lighteval|math_500|0").
experiment_details = defaultdict(dict)
for model in MODELS:
    for benchmark, benchmark_details in experiments[model]["benchmarks"].items():
        subset = benchmark_details["subset"]
        for experiment_tag in benchmark_details["tags"]:
            details, _ = load_details_and_results(model, benchmark, experiment_tag)
            experiment_details[model][subset] = details


def display_model_comparison(selected_models, benchmark, example_index):
    if not selected_models:
        return "Please select at least one model to compare."

    # gr.Number delivers a float; dataset indexing needs an int.
    example_index = int(example_index)

    outputs = []
    for model in selected_models:
        try:
            example = experiment_details[model][benchmark][example_index]
            outputs.append({
                "Model": model.split("/")[-1],
                "Prediction": example["predictions"][0] if example["predictions"] else "",
                "Prompt": example["full_prompt"],
                "Metrics": example["metrics"],
                "Gold": example["gold"],
            })
        except (KeyError, IndexError):
            continue

    if not outputs:
        return "No results found for the selected combination."

    # Build the HTML report: ground truth first, then one card per model.
    html_output = "<div style='max-width: 800px; margin: 0 auto;'>\n\n"

    html_output += "<div style='background: #e6f3e6; padding: 20px; border-radius: 10px; margin-bottom: 20px;'>\n"
    html_output += "<h3 style='margin-top: 0;'>Ground Truth</h3>\n"
    html_output += "<div style='overflow-x: auto; max-width: 100%;'>\n"
    html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'><code>{html.escape(str(outputs[0]['Gold']))}</code></pre>\n"
    html_output += "</div>\n"
    html_output += "</div>\n"

    for output in outputs:
        html_output += "<div style='background: #f5f5f5; padding: 20px; margin-bottom: 20px; border-radius: 10px;'>\n"
        html_output += f"<h2 style='margin-top: 0;'>{output['Model']}</h2>\n"

        # Metrics as a small table inside an open <details> block.
        html_output += "<details open style='margin-bottom: 15px;'>\n"
        html_output += "<summary><h3 style='display: inline; margin: 0;'>Metrics</h3></summary>\n"
        metrics = output["Metrics"]
        if isinstance(metrics, str):
            # Metrics may also arrive stringified; same trust caveat as above.
            metrics = eval(metrics)
        html_output += "<div style='overflow-x: auto;'>\n"
        html_output += "<table style='width: 100%; margin: 10px 0; border-collapse: collapse;'>\n"
        for key, value in metrics.items():
            if isinstance(value, float):
                value = f"{value:.3f}"
            html_output += f"<tr><td style='padding: 5px; border-bottom: 1px solid #ddd;'><strong>{key}</strong></td><td style='padding: 5px; border-bottom: 1px solid #ddd;'>{value}</td></tr>\n"
        html_output += "</table>\n"
        html_output += "</div>\n"
        html_output += "</details>\n\n"

        # Prompt, collapsed by default; chat prompts arrive as lists of
        # role/content messages, plain prompts as a single string or dict.
        html_output += "<details style='margin-bottom: 15px;'>\n"
        html_output += "<summary><h3 style='display: inline; margin: 0;'>Prompt</h3></summary>\n"
        html_output += "<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n"

        prompt_text = output["Prompt"]
        if isinstance(prompt_text, list):
            for msg in prompt_text:
                if isinstance(msg, dict) and "content" in msg:
                    role = msg.get("role", "message").title()
                    html_output += "<div style='margin-bottom: 10px;'>\n"
                    html_output += f"<strong>{role}:</strong>\n"
                    html_output += "<div style='overflow-x: auto;'>\n"
                    html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{html.escape(str(msg['content']))}</code></pre>\n"
                    html_output += "</div>\n"
                    html_output += "</div>\n"
                else:
                    html_output += "<div style='margin-bottom: 10px;'>\n"
                    html_output += "<div style='overflow-x: auto;'>\n"
                    html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{html.escape(json.dumps(msg, indent=2))}</code></pre>\n"
                    html_output += "</div>\n"
                    html_output += "</div>\n"
        else:
            html_output += "<div style='overflow-x: auto;'>\n"
            if isinstance(prompt_text, dict) and "content" in prompt_text:
                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{html.escape(str(prompt_text['content']))}</code></pre>\n"
            else:
                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{html.escape(str(prompt_text))}</code></pre>\n"
            html_output += "</div>\n"

        html_output += "</div>\n"
        html_output += "</details>\n\n"

        # Prediction, open by default, with a muted word count in the summary.
        html_output += "<details open style='margin-bottom: 15px;'>\n"
        html_output += "<summary><h3 style='display: inline; margin: 0;'>Prediction</h3>"
        word_count = len(output["Prediction"].split())
        html_output += f"<span style='color: #666; font-size: 0.8em; margin-left: 10px;'>({word_count} words)</span>"
        html_output += "</summary>\n"
        html_output += "<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n"
        html_output += "<div style='overflow-x: auto;'>\n"
        html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'><code>{html.escape(output['Prediction'])}</code></pre>\n"
        html_output += "</div>\n"
        html_output += "</div>\n"
        html_output += "</details>\n"
        html_output += "</div>\n\n"

    html_output += "</div>"
    return html_output


# Benchmark choices are the subset strings collected during preloading.
available_benchmarks = list(set(
    benchmark
    for model in MODELS
    for benchmark in experiment_details[model].keys()
))

demo = gr.Interface(
    fn=display_model_comparison,
    inputs=[
        gr.Dropdown(
            choices=sorted(MODELS),
            label="Models",
            multiselect=True,
            value=MODELS,
            info="Select models to compare"
        ),
        gr.Dropdown(
            choices=sorted(available_benchmarks),
            label="Benchmark",
            value=sorted(available_benchmarks)[0] if available_benchmarks else None,
            info="Choose the evaluation benchmark"
        ),
        gr.Number(
            label="Example Index",
            value=0,
            step=1,
            info="Navigate through different examples"
        )
    ],
    outputs=gr.HTML(),
    title="Model Generation Comparison",
    description="Compare model outputs across different benchmarks and prompts",
    theme=gr.themes.Soft(),
    css="button { margin: 0 10px; padding: 5px 15px; }"
)

if __name__ == "__main__":
    demo.launch()
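
For a quick smoke test outside the Gradio UI, the loader above can also be exercised directly. A minimal sketch, assuming the SaylorTwift/details_*_private repositories are accessible with your Hugging Face token (note that importing app runs the preload loop, so every configured dataset is fetched first):

from app import load_details_and_results

# Arguments are the model name, benchmark key, and tag as they
# appear in experiments.json.
details, results = load_details_and_results("gpt-4o", "math_500", "latest")

print(results)                       # aggregate metrics dict for the run
print(details[0]["gold"])            # gold answer of the first example
print(details[0]["predictions"][0])  # the model's prediction for that example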
experiments.json ADDED
@@ -0,0 +1,420 @@
{
  "gpt-4o": {
    "display_name": "gpt 4o",
    "provider": "openai",
    "open": false,
    "benchmarks": {
      "math_500": {
        "subset": "lighteval|math_500|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-26T10-14-16.106571" }
      },
      "gpqa_diamond": {
        "subset": "lighteval|gpqa:diamond|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-26T10-14-16.106571" }
      },
      "aime_24": {
        "subset": "lighteval|aime24|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-26T10-14-16.106571" }
      },
      "aime_25": {
        "subset": "lighteval|aime25|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-26T10-14-16.106571" }
      },
      "ifeval": {
        "subset": "extended|ifeval|0",
        "metrics": ["prompt_level_strict_acc"],
        "tags": { "latest": "2025-02-26T10-14-16.106571" }
      }
    }
  },
  "claude-3-7-sonnet-20250219": {
    "display_name": "Claude 3.7 Sonnet",
    "provider": "anthropic",
    "open": false,
    "benchmarks": {
      "math_500": {
        "subset": "lighteval|math_500|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-25T14-35-15.137825" }
      },
      "gpqa_diamond": {
        "subset": "lighteval|gpqa:diamond|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-25T12-43-49.294245" }
      },
      "aime_24": {
        "subset": "lighteval|aime24|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-25T12-37-52.771787" }
      },
      "aime_25": {
        "subset": "lighteval|aime25|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-25T12-37-52.771787" }
      },
      "ifeval": {
        "subset": "extended|ifeval|0",
        "metrics": ["prompt_level_strict_acc"],
        "tags": { "latest": "2025-02-25T12-24-45.750753" }
      }
    }
  },
  "o3-mini-2025-01-31": {
    "display_name": "o3-mini",
    "provider": "openai",
    "open": false,
    "benchmarks": {
      "math_500": {
        "subset": "lighteval|math_500|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-26T11-37-01.193437" }
      },
      "gpqa_diamond": {
        "subset": "lighteval|gpqa:diamond|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-26T11-37-01.193437" }
      },
      "aime_24": {
        "subset": "lighteval|aime24|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-26T11-37-01.193437" }
      },
      "aime_25": {
        "subset": "lighteval|aime25|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-26T11-37-01.193437" }
      },
      "ifeval": {
        "subset": "extended|ifeval|0",
        "metrics": ["prompt_level_strict_acc"],
        "tags": { "latest": "2025-02-26T11-37-01.193437" }
      }
    }
  },
  "moonshotai/Moonlight-16B-A3B-Instruct": {
    "display_name": "Moonlight",
    "provider": "moonshotai",
    "open": true,
    "benchmarks": {
      "math_500": {
        "subset": "lighteval|math_500|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025_02_26T13_32_06.104265" }
      },
      "gpqa_diamond": {
        "subset": "lighteval|gpqa:diamond|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025_02_26T13_32_06.104265" }
      },
      "aime_24": {
        "subset": "lighteval|aime24|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025_02_26T13_32_06.104265" }
      },
      "aime_25": {
        "subset": "lighteval|aime25|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025_02_26T13_32_06.104265" }
      },
      "ifeval": {
        "subset": "extended|ifeval|0",
        "metrics": ["prompt_level_strict_acc"],
        "tags": { "latest": "2025_02_26T13_32_06.104265" }
      }
    }
  },
  "meta-llama/Llama-3.3-70B-Instruct": {
    "display_name": "Llama 3.3 70B",
    "provider": "meta",
    "open": true,
    "benchmarks": {
      "math_500": {
        "subset": "lighteval|math_500|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-26T17-13-13.448521" }
      },
      "gpqa_diamond": {
        "subset": "lighteval|gpqa:diamond|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-26T17-13-13.448521" }
      },
      "aime_24": {
        "subset": "lighteval|aime24|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-26T17-13-13.448521" }
      },
      "aime_25": {
        "subset": "lighteval|aime25|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-26T17-13-13.448521" }
      },
      "ifeval": {
        "subset": "extended|ifeval|0",
        "metrics": ["prompt_level_strict_acc"],
        "tags": { "latest": "2025-02-26T17-13-13.448521" }
      }
    }
  },
  "deepseek-ai/DeepSeek-R1-Distill-Llama-70B": {
    "display_name": "DeepSeek Llama 70B",
    "provider": "deepseek",
    "open": true,
    "benchmarks": {
      "math_500": {
        "subset": "lighteval|math_500|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-27T11-09-04.037858" }
      },
      "gpqa_diamond": {
        "subset": "lighteval|gpqa:diamond|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-27T11-09-04.037858" }
      },
      "aime_24": {
        "subset": "lighteval|aime24|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-27T11-09-04.037858" }
      },
      "aime_25": {
        "subset": "lighteval|aime25|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-27T11-09-04.037858" }
      },
      "ifeval": {
        "subset": "extended|ifeval|0",
        "metrics": ["prompt_level_strict_acc"],
        "tags": { "latest": "2025-02-27T14-02-02.414381" }
      }
    }
  },
  "qihoo360/TinyR1-32B-Preview": {
    "display_name": "TinyR1 32B",
    "provider": "qihoo360",
    "open": true,
    "benchmarks": {
      "math_500": {
        "subset": "lighteval|math_500|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-27T13-32-41.564652" }
      },
      "gpqa_diamond": {
        "subset": "lighteval|gpqa:diamond|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-27T13-32-41.564652" }
      },
      "aime_24": {
        "subset": "lighteval|aime24|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-27T13-32-41.564652" }
      },
      "aime_25": {
        "subset": "lighteval|aime25|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-02-27T13-32-41.564652" }
      },
      "ifeval": {
        "subset": "extended|ifeval|0",
        "metrics": ["prompt_level_strict_acc"],
        "tags": { "latest": "2025-02-27T13-32-41.564652" }
      }
    }
  },
  "openai/gpt-4.5-preview-2025-02-27": {
    "display_name": "gpt 4.5",
    "provider": "openai",
    "open": false,
    "benchmarks": {
      "math_500": {
        "subset": "lighteval|math_500|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-03-03T11-17-20.767980" }
      },
      "gpqa_diamond": {
        "subset": "lighteval|gpqa:diamond|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-03-03T11-35-34.241611" }
      },
      "aime_24": {
        "subset": "lighteval|aime24|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-03-03T11-15-32.836958" }
      },
      "aime_25": {
        "subset": "lighteval|aime25|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-03-03T11-15-32.836958" }
      },
      "ifeval": {
        "subset": "extended|ifeval|0",
        "metrics": ["prompt_level_strict_acc"],
        "tags": { "latest": "2025-03-03T11-17-20.767980" }
      }
    }
  },
  "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B": {
    "display_name": "DeepSeek Qwen 32B",
    "provider": "deepseek",
    "open": true,
    "benchmarks": {
      "math_500": {
        "subset": "lighteval|math_500|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-03-03T14-51-09.849491" }
      },
      "gpqa_diamond": {
        "subset": "lighteval|gpqa:diamond|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-03-03T14-51-09.849491" }
      },
      "aime_24": {
        "subset": "lighteval|aime24|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-03-03T14-51-09.849491" }
      },
      "aime_25": {
        "subset": "lighteval|aime25|0",
        "metrics": ["extractive_match"],
        "tags": { "latest": "2025-03-03T14-51-09.849491" }
      },
      "ifeval": {
        "subset": "extended|ifeval|0",
        "metrics": ["prompt_level_strict_acc"],
        "tags": { "latest": "2025-03-03T15-06-10.838105" }
      }
    }
  }
}
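
Note how app.py turns these entries into Hugging Face dataset coordinates: the "subset" string becomes a dataset config name with "|" and ":" replaced by "_", and the tag timestamp becomes a split name with "-" replaced by "_" (which is why the Moonlight tags already use underscores). Also note that qihoo360/TinyR1-32B-Preview and deepseek-ai/DeepSeek-R1-Distill-Qwen-32B appear here but not in MODELS, so the app skips them. A small sketch of the mapping, using values from the file above:

# Mirrors the string transformations in load_details_and_results.
entry = {
    "subset": "lighteval|gpqa:diamond|0",
    "tags": {"latest": "2025-02-26T10-14-16.106571"},
}

config_name = entry["subset"].replace("|", "_").replace(":", "_")
split_name = entry["tags"]["latest"].replace("-", "_")

print(config_name)  # lighteval_gpqa_diamond_0
print(split_name)   # 2025_02_26T10_14_16.106571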