Spaces:
Running
Running
Commit
·
cf2253a
1
Parent(s):
72d60b0
Updating metrics
Browse files
src/data/metrics/absolute_improvement_to_baseline.json
CHANGED
@@ -1,56 +1,74 @@
|
|
1 |
{
|
2 |
"perception_temporal_action_loc": {
|
3 |
-
"MLAB (claude-3-5-sonnet-v2)": 2.
|
4 |
-
"Top Human in Competition": 284.
|
5 |
-
"MLAB (gemini-exp-1206)": -1.
|
6 |
-
"MLAB (o3-mini)": 0.
|
7 |
-
"MLAB (gpt-4o)": 0.
|
8 |
-
"MLAB (llama3-1-405b-instruct)": 1.
|
9 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": 0
|
10 |
},
|
11 |
"llm-merging": {
|
12 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": -0.
|
13 |
-
"Top Human in Competition": 68.
|
14 |
-
"MLAB (claude-3-5-sonnet-v2)": 3.
|
15 |
-
"MLAB (gemini-exp-1206)": 3.
|
16 |
-
"MLAB (o3-mini)": -0.
|
17 |
-
"MLAB (gpt-4o)": 1.
|
18 |
-
"MLAB (llama3-1-405b-instruct)": -0.
|
19 |
},
|
20 |
"meta-learning": {
|
21 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": 5.
|
22 |
-
"Top Human in Competition": 304.
|
23 |
-
"MLAB (claude-3-5-sonnet-v2)": 5.
|
24 |
-
"MLAB (gemini-exp-1206)": 5.
|
25 |
-
"MLAB (o3-mini)": -14.
|
26 |
-
"MLAB (gpt-4o)": 5.
|
27 |
-
"MLAB (llama3-1-405b-instruct)": 5.
|
28 |
},
|
29 |
"product-recommendation": {
|
30 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)":
|
31 |
-
"Top Human in Competition": 412.
|
32 |
-
"MLAB (claude-3-5-sonnet-v2)": 12.
|
33 |
-
"MLAB (gemini-exp-1206)": 0.
|
34 |
-
"MLAB (o3-mini)": 0.
|
35 |
-
"MLAB (gpt-4o)": 2.
|
36 |
-
"MLAB (llama3-1-405b-instruct)": -
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
},
|
38 |
"machine_unlearning": {
|
39 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)":
|
40 |
-
"Top Human in Competition": 61.
|
41 |
-
"MLAB (claude-3-5-sonnet-v2)": -58.
|
42 |
-
"MLAB (gemini-exp-1206)": 3.
|
43 |
-
"MLAB (o3-mini)": 2.
|
44 |
-
"MLAB (gpt-4o)": -11.
|
45 |
-
"MLAB (llama3-1-405b-instruct)": 3.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
},
|
47 |
"backdoor-trigger-recovery": {
|
48 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)":
|
49 |
-
"Top Human in Competition": 621.
|
50 |
-
"MLAB (claude-3-5-sonnet-v2)": 247.
|
51 |
-
"MLAB (gemini-exp-1206)": 80.
|
52 |
-
"MLAB (o3-mini)": 38.
|
53 |
-
"MLAB (gpt-4o)": 64.
|
54 |
-
"MLAB (llama3-1-405b-instruct)": 71.
|
55 |
}
|
56 |
}
|
|
|
1 |
{
|
2 |
"perception_temporal_action_loc": {
|
3 |
+
"MLAB (claude-3-5-sonnet-v2)": 2.2,
|
4 |
+
"Top Human in Competition": 284.6,
|
5 |
+
"MLAB (gemini-exp-1206)": -1.3,
|
6 |
+
"MLAB (o3-mini)": 0.9,
|
7 |
+
"MLAB (gpt-4o)": 0.9,
|
8 |
+
"MLAB (llama3-1-405b-instruct)": 1.5,
|
9 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 1.0
|
10 |
},
|
11 |
"llm-merging": {
|
12 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": -0.7,
|
13 |
+
"Top Human in Competition": 68.2,
|
14 |
+
"MLAB (claude-3-5-sonnet-v2)": 3.4,
|
15 |
+
"MLAB (gemini-exp-1206)": 3.4,
|
16 |
+
"MLAB (o3-mini)": -0.7,
|
17 |
+
"MLAB (gpt-4o)": 1.4,
|
18 |
+
"MLAB (llama3-1-405b-instruct)": -0.7
|
19 |
},
|
20 |
"meta-learning": {
|
21 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 5.4,
|
22 |
+
"Top Human in Competition": 304.5,
|
23 |
+
"MLAB (claude-3-5-sonnet-v2)": 5.4,
|
24 |
+
"MLAB (gemini-exp-1206)": 5.4,
|
25 |
+
"MLAB (o3-mini)": -14.9,
|
26 |
+
"MLAB (gpt-4o)": 5.4,
|
27 |
+
"MLAB (llama3-1-405b-instruct)": 5.4
|
28 |
},
|
29 |
"product-recommendation": {
|
30 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 2.3,
|
31 |
+
"Top Human in Competition": 412.6,
|
32 |
+
"MLAB (claude-3-5-sonnet-v2)": 12.3,
|
33 |
+
"MLAB (gemini-exp-1206)": 0.6,
|
34 |
+
"MLAB (o3-mini)": 0.6,
|
35 |
+
"MLAB (gpt-4o)": 2.6,
|
36 |
+
"MLAB (llama3-1-405b-instruct)": -0.0
|
37 |
+
},
|
38 |
+
"weather_forcast": {
|
39 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 83.6,
|
40 |
+
"Top Human in Competition": 399.4,
|
41 |
+
"MLAB (claude-3-5-sonnet-v2)": 31.0,
|
42 |
+
"MLAB (gemini-exp-1206)": 91.4,
|
43 |
+
"MLAB (o3-mini)": 53.3,
|
44 |
+
"MLAB (gpt-4o)": 100.8,
|
45 |
+
"MLAB (llama3-1-405b-instruct)": 66.7
|
46 |
},
|
47 |
"machine_unlearning": {
|
48 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 8.8,
|
49 |
+
"Top Human in Competition": 61.9,
|
50 |
+
"MLAB (claude-3-5-sonnet-v2)": -58.6,
|
51 |
+
"MLAB (gemini-exp-1206)": 3.5,
|
52 |
+
"MLAB (o3-mini)": 2.2,
|
53 |
+
"MLAB (gpt-4o)": -11.1,
|
54 |
+
"MLAB (llama3-1-405b-instruct)": 3.8
|
55 |
+
},
|
56 |
+
"erasing_invisible_watermarks": {
|
57 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": -0.0,
|
58 |
+
"Top Human in Competition": -95.6,
|
59 |
+
"MLAB (claude-3-5-sonnet-v2)": -0.0,
|
60 |
+
"MLAB (gemini-exp-1206)": -0.0,
|
61 |
+
"MLAB (o3-mini)": -0.0,
|
62 |
+
"MLAB (gpt-4o)": 0.5,
|
63 |
+
"MLAB (llama3-1-405b-instruct)": -0.0
|
64 |
},
|
65 |
"backdoor-trigger-recovery": {
|
66 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 85.0,
|
67 |
+
"Top Human in Competition": 621.3,
|
68 |
+
"MLAB (claude-3-5-sonnet-v2)": 247.9,
|
69 |
+
"MLAB (gemini-exp-1206)": 80.4,
|
70 |
+
"MLAB (o3-mini)": 38.8,
|
71 |
+
"MLAB (gpt-4o)": 64.5,
|
72 |
+
"MLAB (llama3-1-405b-instruct)": 71.7
|
73 |
}
|
74 |
}
|
src/data/metrics/relative_improvement_to_human.json
CHANGED
@@ -1,56 +1,74 @@
|
|
1 |
{
|
2 |
"perception_temporal_action_loc": {
|
3 |
-
"MLAB (claude-3-5-sonnet-v2)": 0.
|
4 |
"Top Human in Competition": 100.0,
|
5 |
-
"MLAB (gemini-exp-1206)": -0.
|
6 |
-
"MLAB (o3-mini)": 0.
|
7 |
-
"MLAB (gpt-4o)": 0.
|
8 |
-
"MLAB (llama3-1-405b-instruct)": 0.
|
9 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": 0.
|
10 |
},
|
11 |
"llm-merging": {
|
12 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": -0
|
13 |
"Top Human in Competition": 100.0,
|
14 |
-
"MLAB (claude-3-5-sonnet-v2)":
|
15 |
-
"MLAB (gemini-exp-1206)":
|
16 |
-
"MLAB (o3-mini)": -0
|
17 |
-
"MLAB (gpt-4o)":
|
18 |
-
"MLAB (llama3-1-405b-instruct)": -0
|
19 |
},
|
20 |
"meta-learning": {
|
21 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": 1.
|
22 |
"Top Human in Competition": 100.0,
|
23 |
-
"MLAB (claude-3-5-sonnet-v2)": 1.
|
24 |
-
"MLAB (gemini-exp-1206)": 1.
|
25 |
-
"MLAB (o3-mini)": -4.
|
26 |
-
"MLAB (gpt-4o)": 1.
|
27 |
-
"MLAB (llama3-1-405b-instruct)": 1.
|
28 |
},
|
29 |
"product-recommendation": {
|
30 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": 0.
|
31 |
"Top Human in Competition": 100.0,
|
32 |
-
"MLAB (claude-3-5-sonnet-v2)":
|
33 |
-
"MLAB (gemini-exp-1206)": 0.
|
34 |
-
"MLAB (o3-mini)": 0.
|
35 |
-
"MLAB (gpt-4o)": 0.
|
36 |
-
"MLAB (llama3-1-405b-instruct)": -
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
},
|
38 |
"machine_unlearning": {
|
39 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
"Top Human in Competition": 100.0,
|
41 |
-
"MLAB (claude-3-5-sonnet-v2)":
|
42 |
-
"MLAB (gemini-exp-1206)": 5
|
43 |
-
"MLAB (o3-mini)":
|
44 |
-
"MLAB (gpt-4o)":
|
45 |
-
"MLAB (llama3-1-405b-instruct)":
|
46 |
},
|
47 |
"backdoor-trigger-recovery": {
|
48 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)":
|
49 |
"Top Human in Competition": 100.0,
|
50 |
-
"MLAB (claude-3-5-sonnet-v2)": 39.
|
51 |
-
"MLAB (gemini-exp-1206)": 12.
|
52 |
-
"MLAB (o3-mini)": 6.
|
53 |
-
"MLAB (gpt-4o)": 10.
|
54 |
-
"MLAB (llama3-1-405b-instruct)": 11.
|
55 |
}
|
56 |
}
|
|
|
1 |
{
|
2 |
"perception_temporal_action_loc": {
|
3 |
+
"MLAB (claude-3-5-sonnet-v2)": 0.8,
|
4 |
"Top Human in Competition": 100.0,
|
5 |
+
"MLAB (gemini-exp-1206)": -0.5,
|
6 |
+
"MLAB (o3-mini)": 0.3,
|
7 |
+
"MLAB (gpt-4o)": 0.3,
|
8 |
+
"MLAB (llama3-1-405b-instruct)": 0.5,
|
9 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 0.4
|
10 |
},
|
11 |
"llm-merging": {
|
12 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": -1.0,
|
13 |
"Top Human in Competition": 100.0,
|
14 |
+
"MLAB (claude-3-5-sonnet-v2)": 5.0,
|
15 |
+
"MLAB (gemini-exp-1206)": 5.0,
|
16 |
+
"MLAB (o3-mini)": -1.0,
|
17 |
+
"MLAB (gpt-4o)": 2.0,
|
18 |
+
"MLAB (llama3-1-405b-instruct)": -1.0
|
19 |
},
|
20 |
"meta-learning": {
|
21 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 1.8,
|
22 |
"Top Human in Competition": 100.0,
|
23 |
+
"MLAB (claude-3-5-sonnet-v2)": 1.8,
|
24 |
+
"MLAB (gemini-exp-1206)": 1.8,
|
25 |
+
"MLAB (o3-mini)": -4.9,
|
26 |
+
"MLAB (gpt-4o)": 1.8,
|
27 |
+
"MLAB (llama3-1-405b-instruct)": 1.8
|
28 |
},
|
29 |
"product-recommendation": {
|
30 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 0.6,
|
31 |
"Top Human in Competition": 100.0,
|
32 |
+
"MLAB (claude-3-5-sonnet-v2)": 3.0,
|
33 |
+
"MLAB (gemini-exp-1206)": 0.1,
|
34 |
+
"MLAB (o3-mini)": 0.1,
|
35 |
+
"MLAB (gpt-4o)": 0.6,
|
36 |
+
"MLAB (llama3-1-405b-instruct)": -0.0
|
37 |
+
},
|
38 |
+
"weather_forcast": {
|
39 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 20.9,
|
40 |
+
"Top Human in Competition": 100.0,
|
41 |
+
"MLAB (claude-3-5-sonnet-v2)": 7.8,
|
42 |
+
"MLAB (gemini-exp-1206)": 22.9,
|
43 |
+
"MLAB (o3-mini)": 13.3,
|
44 |
+
"MLAB (gpt-4o)": 25.2,
|
45 |
+
"MLAB (llama3-1-405b-instruct)": 16.7
|
46 |
},
|
47 |
"machine_unlearning": {
|
48 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 14.2,
|
49 |
+
"Top Human in Competition": 100.0,
|
50 |
+
"MLAB (claude-3-5-sonnet-v2)": -94.7,
|
51 |
+
"MLAB (gemini-exp-1206)": 5.6,
|
52 |
+
"MLAB (o3-mini)": 3.6,
|
53 |
+
"MLAB (gpt-4o)": -18.0,
|
54 |
+
"MLAB (llama3-1-405b-instruct)": 6.2
|
55 |
+
},
|
56 |
+
"erasing_invisible_watermarks": {
|
57 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 84.0,
|
58 |
"Top Human in Competition": 100.0,
|
59 |
+
"MLAB (claude-3-5-sonnet-v2)": 87.6,
|
60 |
+
"MLAB (gemini-exp-1206)": 97.5,
|
61 |
+
"MLAB (o3-mini)": 83.4,
|
62 |
+
"MLAB (gpt-4o)": 83.4,
|
63 |
+
"MLAB (llama3-1-405b-instruct)": 83.4
|
64 |
},
|
65 |
"backdoor-trigger-recovery": {
|
66 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 13.7,
|
67 |
"Top Human in Competition": 100.0,
|
68 |
+
"MLAB (claude-3-5-sonnet-v2)": 39.9,
|
69 |
+
"MLAB (gemini-exp-1206)": 12.9,
|
70 |
+
"MLAB (o3-mini)": 6.2,
|
71 |
+
"MLAB (gpt-4o)": 10.4,
|
72 |
+
"MLAB (llama3-1-405b-instruct)": 11.5
|
73 |
}
|
74 |
}
|