Linker1907 committed on
Commit c5bf87e · 1 Parent(s): 9b8ac8f
Files changed (2)
  1. app.py +26 -8
  2. experiments.json +87 -3
app.py CHANGED
@@ -2,7 +2,6 @@ from datasets import load_dataset
 from collections import defaultdict
 import json
 import gradio as gr
-from functools import lru_cache
 
 # Load models and experiments
 
@@ -10,9 +9,8 @@ with open("experiments.json") as f:
     experiments = json.load(f)
 
 MODELS = list(experiments.keys())
-MODELS = [m for m in MODELS if m != "claude-3-7-sonnet-20250219"]
+MODELS = [m for m in MODELS if m.startswith("google/gemma-3")]
 
-@lru_cache
 def load_details_and_results(model, benchmark, experiment_tag):
     def worker(example):
         example["predictions"] = example["predictions"]
@@ -20,7 +18,7 @@ def load_details_and_results(model, benchmark, experiment_tag):
         example["metrics"] = example["metrics"]
         return example
 
-    repo = f"SaylorTwift/details_{model.replace('/', '__')}_private"
+    repo = f"OpenEvals/details_{model.replace('/', '__')}_private"
    subset = experiments[model]["benchmarks"][benchmark]["subset"].replace("|", "_").replace(":", "_")
    split = experiments[model]["benchmarks"][benchmark]["tags"][experiment_tag].replace("-", "_")
 
@@ -49,8 +47,14 @@ def display_model_comparison(selected_models, benchmark, example_index):
     if not selected_models:
         return "Please select at least one model to compare."
 
+    # Filter out models that don't have the selected benchmark
+    available_models = [model for model in selected_models if benchmark in experiment_details[model]]
+
+    if not available_models:
+        return f"No models have results for benchmark: {benchmark}"
+
     outputs = []
-    for model in selected_models:
+    for model in available_models:  # Changed from selected_models to available_models
         try:
             example = experiment_details[model][benchmark][example_index]
             outputs.append({
@@ -111,7 +115,9 @@ def display_model_comparison(selected_models, benchmark, example_index):
                 html_output += "<div style='margin-bottom: 10px;'>\n"
                 html_output += f"<strong>{role}:</strong>\n"
                 html_output += "<div style='overflow-x: auto;'>\n"
-                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{msg['content']}</code></pre>\n"
+                # Escape HTML in content
+                content = msg['content'].replace('<', '&lt;').replace('>', '&gt;')
+                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{content}</code></pre>\n"
                 html_output += "</div>\n"
                 html_output += "</div>\n"
             else:
@@ -123,8 +129,13 @@ def display_model_comparison(selected_models, benchmark, example_index):
         else:
             html_output += "<div style='overflow-x: auto;'>\n"
             if isinstance(prompt_text, dict) and 'content' in prompt_text:
-                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{prompt_text['content']}</code></pre>\n"
+                # Escape HTML in content
+                content = prompt_text['content'].replace('<', '&lt;').replace('>', '&gt;')
+                html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{content}</code></pre>\n"
             else:
+                # Escape HTML if prompt_text is a string
+                if isinstance(prompt_text, str):
+                    prompt_text = prompt_text.replace('<', '&lt;').replace('>', '&gt;')
                 html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 5px 0;'><code>{prompt_text}</code></pre>\n"
             html_output += "</div>\n"
 
@@ -140,7 +151,9 @@ def display_model_comparison(selected_models, benchmark, example_index):
             html_output += "</summary>\n"
             html_output += "<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n"
             html_output += "<div style='overflow-x: auto;'>\n"
-            html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'><code>{output['Prediction']}</code></pre>\n"
+            # Escape HTML in prediction
+            prediction = output['Prediction'].replace('<', '&lt;').replace('>', '&gt;')
+            html_output += f"<pre style='white-space: pre-wrap; word-wrap: break-word; margin: 0;'><code>{prediction}</code></pre>\n"
             html_output += "</div>\n"
             html_output += "</div>\n"
             html_output += "</details>\n"
@@ -156,6 +169,11 @@ available_benchmarks = list(set(
     for benchmark in experiment_details[model].keys()
 ))
 
+# Update the Gradio interface to dynamically filter models based on benchmark
+def update_model_choices(benchmark):
+    available_models = [model for model in MODELS if benchmark in experiment_details[model]]
+    return gr.Dropdown(choices=sorted(available_models), value=sorted(available_models))
+
 # Create the Gradio interface
 demo = gr.Interface(
     fn=display_model_comparison,
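
Note: the manual `.replace('<', '&lt;').replace('>', '&gt;')` calls added in this commit do not escape `&`, so text that already contains entities (e.g. `&lt;`) would render incorrectly. A minimal sketch of the same escaping done with the standard library's `html.escape`; the helper name `render_code_block` is illustrative, not part of the commit:

```python
import html

def render_code_block(text) -> str:
    # html.escape covers &, < and > (and quotes when quote=True),
    # unlike the two manual replace() calls in the diff above.
    escaped = html.escape(str(text), quote=False)
    return (
        "<pre style='white-space: pre-wrap; word-wrap: break-word; "
        f"margin: 5px 0;'><code>{escaped}</code></pre>\n"
    )

# Usage inside display_model_comparison:
#     html_output += render_code_block(msg["content"])
```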
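
The new `update_model_choices` helper is defined but `gr.Interface` exposes no change-event hook to call it from. A sketch of how it could be wired up, assuming a `gr.Blocks` layout instead and reusing the names defined in app.py (`available_benchmarks`, `MODELS`, `update_model_choices`); the component names are hypothetical:

```python
import gradio as gr

with gr.Blocks() as demo:
    benchmark_dd = gr.Dropdown(choices=sorted(available_benchmarks), label="Benchmark")
    models_dd = gr.Dropdown(choices=sorted(MODELS), multiselect=True, label="Models")
    # Re-populate the model dropdown whenever the selected benchmark changes.
    benchmark_dd.change(update_model_choices, inputs=benchmark_dd, outputs=models_dd)
```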
experiments.json CHANGED
@@ -3,6 +3,7 @@
     "display_name": "gpt 4o",
     "provider": "openai",
     "open": false,
+    "size": "?B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -55,6 +56,7 @@
     "display_name": "Claude 3.7 Sonnet",
     "provider": "anthropic",
     "open": false,
+    "size": "?B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -112,6 +114,7 @@
     "display_name": "o3-mini",
     "provider": "openai",
     "open": false,
+    "size": "?B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -164,6 +167,7 @@
     "display_name": "Moonlight",
     "provider": "moonshotai",
     "open": true,
+    "size": "16B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -216,6 +220,7 @@
     "display_name": "Llama 3.3 70B",
     "provider": "meta",
     "open": true,
+    "size": "70B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -258,6 +263,7 @@
     "display_name": "DeepSeek Llama 70B",
     "provider": "deepseek",
     "open": true,
+    "size": "70B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -300,6 +306,7 @@
     "display_name": "TinyR1 32B",
     "provider": "qihoo360",
     "open": true,
+    "size": "32B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -342,6 +349,7 @@
     "display_name": "gpt 4.5",
     "provider": "openai",
     "open": false,
+    "size": "?B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -384,6 +392,7 @@
     "display_name": "DeepSeek Qwen 32B",
     "provider": "deepseek",
     "open": true,
+    "size": "32B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -426,6 +435,7 @@
     "display_name": "DeepSeek R1",
     "provider": "deepseek",
     "open": true,
+    "size": "671B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
@@ -468,19 +478,20 @@
     "display_name": "QwQ 32B",
     "provider": "Qwen",
     "open": true,
+    "size": "32B",
     "benchmarks": {
       "math_500": {
         "subset": "lighteval|math_500|0",
         "metrics": ["extractive_match"],
         "tags": {
-          "latest": "2025-03-07T11-04-40.089127"
+          "latest": "2025-03-10T11-47-46.303371"
         }
       },
       "gpqa_diamond": {
         "subset": "lighteval|gpqa:diamond|0",
         "metrics": ["extractive_match"],
         "tags": {
-          "latest": "2025-03-07T11-04-40.089127"
+          "latest": "2025-03-10T11-47-46.303371"
         }
       },
       "aime_24": {
@@ -501,7 +512,80 @@
         "subset": "extended|ifeval|0",
         "metrics": ["prompt_level_strict_acc"],
         "tags": {
-          "latest": "2025-03-07T11-04-40.089127"
+          "latest": "2025-03-10T12-21-36.862202"
+        }
+      }
+    }
+  },
+  "google/gemma-3-1b-it": {
+    "display_name": "Gemma 3",
+    "provider": "google",
+    "open": true,
+    "size": "1B",
+    "benchmarks": {
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T14-25-56.178612"
+        }
+      }
+    }
+  },
+  "google/gemma-3-12b-it": {
+    "display_name": "Gemma 3 12B",
+    "provider": "google",
+    "open": true,
+    "size": "12B",
+    "benchmarks": {
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T14-36-23.368081"
+        }
+      }
+    }
+  },
+  "google/gemma-3-27b-it": {
+    "display_name": "Gemma 3 27B",
+    "provider": "google",
+    "open": true,
+    "size": "27B",
+    "benchmarks": {
+      "aime_25": {
+        "subset": "lighteval|aime25|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T14-41-33.181467"
+        }
+      },
+      "aime_24": {
+        "subset": "lighteval|aime24|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T15-11-34.174477"
+        }
+      },
+      "ifeval": {
+        "subset": "extended|ifeval|0",
+        "metrics": ["prompt_level_strict_acc"],
+        "tags": {
+          "latest": "2025-03-18T15-20-14.979833"
+        }
+      },
+      "gpqa_diamond": {
+        "subset": "lighteval|gpqa:diamond|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T15-20-14.979833"
+        }
+      },
+      "math_500": {
+        "subset": "lighteval|math_500|0",
+        "metrics": ["extractive_match"],
+        "tags": {
+          "latest": "2025-03-18T15-20-14.979833"
+        }
+      }
+    }
+  }
         }
       }
     }
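
For reference, a minimal sketch (not part of the commit) showing how the updated experiments.json entries, including the new "size" field, line up with the gemma-3 filter added in app.py:

```python
import json

with open("experiments.json") as f:
    experiments = json.load(f)

# Same filter the commit adds in app.py.
models = [m for m in experiments if m.startswith("google/gemma-3")]

for model in models:
    entry = experiments[model]
    # "size" is the field this commit adds; "?B" marks unknown sizes.
    print(model, entry["size"], sorted(entry["benchmarks"]))
```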