Linker1907 committed on
Commit
9b8ac8f
·
1 Parent(s): bcda822

add app file

Browse files
Files changed (2) hide show
  1. app.py +3 -9
  2. experiments.json +94 -5
app.py CHANGED
@@ -5,19 +5,13 @@ import gradio as gr
5
  from functools import lru_cache
6
 
7
  # Load models and experiments
8
- MODELS = [
9
- "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
10
- "o3-mini-2025-01-31",
11
- "meta-llama/Llama-3.3-70B-Instruct",
12
- "moonshotai/Moonlight-16B-A3B-Instruct",
13
- "gpt-4o",
14
- "claude-3-7-sonnet-20250219",
15
- "openai/gpt-4.5-preview-2025-02-27"
16
- ]
17
 
18
  with open("experiments.json") as f:
19
  experiments = json.load(f)
20
 
 
 
 
21
  @lru_cache
22
  def load_details_and_results(model, benchmark, experiment_tag):
23
  def worker(example):
 
5
  from functools import lru_cache
6
 
7
  # Load models and experiments
 
 
 
 
 
 
 
 
 
8
 
9
  with open("experiments.json") as f:
10
  experiments = json.load(f)
11
 
12
+ MODELS = list(experiments.keys())
13
+ MODELS = [m for m in MODELS if m != "claude-3-7-sonnet-20250219"]
14
+
15
  @lru_cache
16
  def load_details_and_results(model, benchmark, experiment_tag):
17
  def worker(example):
experiments.json CHANGED
@@ -62,7 +62,8 @@
62
  "extractive_match"
63
  ],
64
  "tags": {
65
- "latest": "2025-02-25T14-35-15.137825"
 
66
  }
67
  },
68
  "gpqa_diamond": {
@@ -71,7 +72,8 @@
71
  "extractive_match"
72
  ],
73
  "tags": {
74
- "latest": "2025-02-25T12-43-49.294245"
 
75
  }
76
  },
77
  "aime_24": {
@@ -80,7 +82,8 @@
80
  "extractive_match"
81
  ],
82
  "tags": {
83
- "latest": "2025-02-25T12-37-52.771787"
 
84
  }
85
  },
86
  "aime_25": {
@@ -89,7 +92,8 @@
89
  "extractive_match"
90
  ],
91
  "tags": {
92
- "latest": "2025-02-25T12-37-52.771787"
 
93
  }
94
  },
95
  "ifeval": {
@@ -98,7 +102,8 @@
98
  "prompt_level_strict_acc"
99
  ],
100
  "tags": {
101
- "latest": "2025-02-25T12-24-45.750753"
 
102
  }
103
  }
104
  }
@@ -416,5 +421,89 @@
416
  }
417
  }
418
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
419
  }
420
  }
 
62
  "extractive_match"
63
  ],
64
  "tags": {
65
+ "default": "2025-02-25T14-35-15.137825",
66
+ "thinking": "2025-03-05T10-14-44.802711"
67
  }
68
  },
69
  "gpqa_diamond": {
 
72
  "extractive_match"
73
  ],
74
  "tags": {
75
+ "default": "2025-02-25T12-43-49.294245",
76
+ "thinking": "2025-03-05T15-37-37.180318"
77
  }
78
  },
79
  "aime_24": {
 
82
  "extractive_match"
83
  ],
84
  "tags": {
85
+ "default": "2025-02-25T12-37-52.771787",
86
+ "thinking": "2025-03-05T12-39-13.627801"
87
  }
88
  },
89
  "aime_25": {
 
92
  "extractive_match"
93
  ],
94
  "tags": {
95
+ "default": "2025-02-25T12-37-52.771787",
96
+ "thinking": "2025-03-05T12-39-13.627801"
97
  }
98
  },
99
  "ifeval": {
 
102
  "prompt_level_strict_acc"
103
  ],
104
  "tags": {
105
+ "default": "2025-02-25T12-24-45.750753",
106
+ "thinking": "2025-03-05T15-37-37.180318"
107
  }
108
  }
109
  }
 
421
  }
422
  }
423
  }
424
+ },
425
+ "openai/deepseek-ai/DeepSeek-R1": {
426
+ "display_name": "DeepSeek R1",
427
+ "provider": "deepseek",
428
+ "open": true,
429
+ "benchmarks": {
430
+ "math_500": {
431
+ "subset": "lighteval|math_500|0",
432
+ "metrics": ["extractive_match"],
433
+ "tags": {
434
+ "latest": "2025-03-04T17-06-33.124766"
435
+ }
436
+ },
437
+ "gpqa_diamond": {
438
+ "subset": "lighteval|gpqa:diamond|0",
439
+ "metrics": ["extractive_match"],
440
+ "tags": {
441
+ "latest": "2025-03-04T17-06-33.124766"
442
+ }
443
+ },
444
+ "aime_24": {
445
+ "subset": "lighteval|aime24|0",
446
+ "metrics": ["extractive_match"],
447
+ "tags": {
448
+ "latest": "2025-03-04T14-52-35.594174"
449
+ }
450
+ },
451
+ "aime_25": {
452
+ "subset": "lighteval|aime25|0",
453
+ "metrics": ["extractive_match"],
454
+ "tags": {
455
+ "latest": "2025-03-04T14-25-05.009799"
456
+ }
457
+ },
458
+ "ifeval": {
459
+ "subset": "extended|ifeval|0",
460
+ "metrics": ["prompt_level_strict_acc"],
461
+ "tags": {
462
+ "latest": "2025-03-04T15-24-42.488745"
463
+ }
464
+ }
465
+ }
466
+ },
467
+ "Qwen/QwQ-32B": {
468
+ "display_name": "QwQ 32B",
469
+ "provider": "Qwen",
470
+ "open": true,
471
+ "benchmarks": {
472
+ "math_500": {
473
+ "subset": "lighteval|math_500|0",
474
+ "metrics": ["extractive_match"],
475
+ "tags": {
476
+ "latest": "2025-03-07T11-04-40.089127"
477
+ }
478
+ },
479
+ "gpqa_diamond": {
480
+ "subset": "lighteval|gpqa:diamond|0",
481
+ "metrics": ["extractive_match"],
482
+ "tags": {
483
+ "latest": "2025-03-07T11-04-40.089127"
484
+ }
485
+ },
486
+ "aime_24": {
487
+ "subset": "lighteval|aime24|0",
488
+ "metrics": ["extractive_match"],
489
+ "tags": {
490
+ "latest": "2025-03-10T10-36-07.886033"
491
+ }
492
+ },
493
+ "aime_25": {
494
+ "subset": "lighteval|aime25|0",
495
+ "metrics": ["extractive_match"],
496
+ "tags": {
497
+ "latest": "2025-03-10T10-36-07.886033"
498
+ }
499
+ },
500
+ "ifeval": {
501
+ "subset": "extended|ifeval|0",
502
+ "metrics": ["prompt_level_strict_acc"],
503
+ "tags": {
504
+ "latest": "2025-03-07T11-04-40.089127"
505
+ }
506
+ }
507
+ }
508
  }
509
  }