Spaces:
Running
Running
Update ZeroEval-main/result_dirs/zebra-grid.summary.json
Browse files
ZeroEval-main/result_dirs/zebra-grid.summary.json
CHANGED
@@ -1,4 +1,21 @@
|
|
1 |
[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
{
|
3 |
"Model": "o3-mini-2025-01-31-high",
|
4 |
"Mode": "greedy",
|
@@ -50,6 +67,23 @@
|
|
50 |
"N_Mode": "single",
|
51 |
"N_Size": 1
|
52 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
{
|
54 |
"Model": "deepseek-R1",
|
55 |
"Mode": "greedy",
|
|
|
1 |
[
|
2 |
+
{
|
3 |
+
"Model": "grok-3-mini-fast-beta-high",
|
4 |
+
"Mode": "greedy",
|
5 |
+
"Puzzle Acc": "92.60",
|
6 |
+
"Cell Acc": "94.63",
|
7 |
+
"No answer": "1.00",
|
8 |
+
"Easy Puzzle Acc": "98.93",
|
9 |
+
"Hard Puzzle Acc": "90.14",
|
10 |
+
"Small Puzzle Acc": "98.75",
|
11 |
+
"Medium Puzzle Acc": "96.43",
|
12 |
+
"Large Puzzle Acc": "93.50",
|
13 |
+
"XL Puzzle Acc": "76.50",
|
14 |
+
"Total Puzzles": 1000,
|
15 |
+
"Reason Lens": "782.25",
|
16 |
+
"N_Mode": "single",
|
17 |
+
"N_Size": 1
|
18 |
+
},
|
19 |
{
|
20 |
"Model": "o3-mini-2025-01-31-high",
|
21 |
"Mode": "greedy",
|
|
|
67 |
"N_Mode": "single",
|
68 |
"N_Size": 1
|
69 |
},
|
70 |
+
{
|
71 |
+
"Model": "grok-3-mini-fast-beta-low",
|
72 |
+
"Mode": "greedy",
|
73 |
+
"Puzzle Acc": "80.70",
|
74 |
+
"Cell Acc": "84.22",
|
75 |
+
"No answer": "0.00",
|
76 |
+
"Easy Puzzle Acc": "98.57",
|
77 |
+
"Hard Puzzle Acc": "73.75",
|
78 |
+
"Small Puzzle Acc": "98.75",
|
79 |
+
"Medium Puzzle Acc": "96.43",
|
80 |
+
"Large Puzzle Acc": "77.00",
|
81 |
+
"XL Puzzle Acc": "33.50",
|
82 |
+
"Total Puzzles": 1000,
|
83 |
+
"Reason Lens": "874.09",
|
84 |
+
"N_Mode": "single",
|
85 |
+
"N_Size": 1
|
86 |
+
},
|
87 |
{
|
88 |
"Model": "deepseek-R1",
|
89 |
"Mode": "greedy",
|