Armeddinosaur commited on
Commit
cf2253a
·
1 Parent(s): 72d60b0

Updating metrics

Browse files
src/data/metrics/absolute_improvement_to_baseline.json CHANGED
@@ -1,56 +1,74 @@
1
  {
2
  "perception_temporal_action_loc": {
3
- "MLAB (claude-3-5-sonnet-v2)": 2.222443094482299,
4
- "Top Human in Competition": 284.55703321316366,
5
- "MLAB (gemini-exp-1206)": -1.34633272895098,
6
- "MLAB (o3-mini)": 0.8724822663469414,
7
- "MLAB (gpt-4o)": 0.9384906166574135,
8
- "MLAB (llama3-1-405b-instruct)": 1.474927454740455,
9
- "CoI-Agent (o1) + MLAB (gpt-4o)": 0.9888962417416385
10
  },
11
  "llm-merging": {
12
- "CoI-Agent (o1) + MLAB (gpt-4o)": -0.6756756689645764,
13
- "Top Human in Competition": 68.24324325461103,
14
- "MLAB (claude-3-5-sonnet-v2)": 3.3783783853634035,
15
- "MLAB (gemini-exp-1206)": 3.3783783853634035,
16
- "MLAB (o3-mini)": -0.6756756689645764,
17
- "MLAB (gpt-4o)": 1.3513513581994137,
18
- "MLAB (llama3-1-405b-instruct)": -0.6756756689645764
19
  },
20
  "meta-learning": {
21
- "CoI-Agent (o1) + MLAB (gpt-4o)": 5.424978139166417,
22
- "Top Human in Competition": 304.53435579895256,
23
- "MLAB (claude-3-5-sonnet-v2)": 5.424978139166417,
24
- "MLAB (gemini-exp-1206)": 5.424978139166417,
25
- "MLAB (o3-mini)": -14.923192223926499,
26
- "MLAB (gpt-4o)": 5.424978139166417,
27
- "MLAB (llama3-1-405b-instruct)": 5.424978139166417
28
  },
29
  "product-recommendation": {
30
- "CoI-Agent (o1) + MLAB (gpt-4o)": 0.6021227441680528,
31
- "Top Human in Competition": 412.59793394031675,
32
- "MLAB (claude-3-5-sonnet-v2)": 12.283606772997718,
33
- "MLAB (gemini-exp-1206)": 0.6021227441680528,
34
- "MLAB (o3-mini)": 0.6035316323448103,
35
- "MLAB (gpt-4o)": 2.6400767209619422,
36
- "MLAB (llama3-1-405b-instruct)": -2.9066701147102995e-09
 
 
 
 
 
 
 
 
 
37
  },
38
  "machine_unlearning": {
39
- "CoI-Agent (o1) + MLAB (gpt-4o)": 7.318484292638537,
40
- "Top Human in Competition": 61.85258904854873,
41
- "MLAB (claude-3-5-sonnet-v2)": -58.58540153334969,
42
- "MLAB (gemini-exp-1206)": 3.4837676447981045,
43
- "MLAB (o3-mini)": 2.2414490971518704,
44
- "MLAB (gpt-4o)": -11.131587250139926,
45
- "MLAB (llama3-1-405b-instruct)": 3.8409541040677597
 
 
 
 
 
 
 
 
 
46
  },
47
  "backdoor-trigger-recovery": {
48
- "CoI-Agent (o1) + MLAB (gpt-4o)": 38.252918051116,
49
- "Top Human in Competition": 621.2635313337943,
50
- "MLAB (claude-3-5-sonnet-v2)": 247.90785034564928,
51
- "MLAB (gemini-exp-1206)": 80.40937239150493,
52
- "MLAB (o3-mini)": 38.75953643366491,
53
- "MLAB (gpt-4o)": 64.52832837042699,
54
- "MLAB (llama3-1-405b-instruct)": 71.70765816958271
55
  }
56
  }
 
1
  {
2
  "perception_temporal_action_loc": {
3
+ "MLAB (claude-3-5-sonnet-v2)": 2.2,
4
+ "Top Human in Competition": 284.6,
5
+ "MLAB (gemini-exp-1206)": -1.3,
6
+ "MLAB (o3-mini)": 0.9,
7
+ "MLAB (gpt-4o)": 0.9,
8
+ "MLAB (llama3-1-405b-instruct)": 1.5,
9
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 1.0
10
  },
11
  "llm-merging": {
12
+ "CoI-Agent (o1) + MLAB (gpt-4o)": -0.7,
13
+ "Top Human in Competition": 68.2,
14
+ "MLAB (claude-3-5-sonnet-v2)": 3.4,
15
+ "MLAB (gemini-exp-1206)": 3.4,
16
+ "MLAB (o3-mini)": -0.7,
17
+ "MLAB (gpt-4o)": 1.4,
18
+ "MLAB (llama3-1-405b-instruct)": -0.7
19
  },
20
  "meta-learning": {
21
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 5.4,
22
+ "Top Human in Competition": 304.5,
23
+ "MLAB (claude-3-5-sonnet-v2)": 5.4,
24
+ "MLAB (gemini-exp-1206)": 5.4,
25
+ "MLAB (o3-mini)": -14.9,
26
+ "MLAB (gpt-4o)": 5.4,
27
+ "MLAB (llama3-1-405b-instruct)": 5.4
28
  },
29
  "product-recommendation": {
30
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 2.3,
31
+ "Top Human in Competition": 412.6,
32
+ "MLAB (claude-3-5-sonnet-v2)": 12.3,
33
+ "MLAB (gemini-exp-1206)": 0.6,
34
+ "MLAB (o3-mini)": 0.6,
35
+ "MLAB (gpt-4o)": 2.6,
36
+ "MLAB (llama3-1-405b-instruct)": -0.0
37
+ },
38
+ "weather_forcast": {
39
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 83.6,
40
+ "Top Human in Competition": 399.4,
41
+ "MLAB (claude-3-5-sonnet-v2)": 31.0,
42
+ "MLAB (gemini-exp-1206)": 91.4,
43
+ "MLAB (o3-mini)": 53.3,
44
+ "MLAB (gpt-4o)": 100.8,
45
+ "MLAB (llama3-1-405b-instruct)": 66.7
46
  },
47
  "machine_unlearning": {
48
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 8.8,
49
+ "Top Human in Competition": 61.9,
50
+ "MLAB (claude-3-5-sonnet-v2)": -58.6,
51
+ "MLAB (gemini-exp-1206)": 3.5,
52
+ "MLAB (o3-mini)": 2.2,
53
+ "MLAB (gpt-4o)": -11.1,
54
+ "MLAB (llama3-1-405b-instruct)": 3.8
55
+ },
56
+ "erasing_invisible_watermarks": {
57
+ "CoI-Agent (o1) + MLAB (gpt-4o)": -0.0,
58
+ "Top Human in Competition": -95.6,
59
+ "MLAB (claude-3-5-sonnet-v2)": -0.0,
60
+ "MLAB (gemini-exp-1206)": -0.0,
61
+ "MLAB (o3-mini)": -0.0,
62
+ "MLAB (gpt-4o)": 0.5,
63
+ "MLAB (llama3-1-405b-instruct)": -0.0
64
  },
65
  "backdoor-trigger-recovery": {
66
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 85.0,
67
+ "Top Human in Competition": 621.3,
68
+ "MLAB (claude-3-5-sonnet-v2)": 247.9,
69
+ "MLAB (gemini-exp-1206)": 80.4,
70
+ "MLAB (o3-mini)": 38.8,
71
+ "MLAB (gpt-4o)": 64.5,
72
+ "MLAB (llama3-1-405b-instruct)": 71.7
73
  }
74
  }
src/data/metrics/relative_improvement_to_human.json CHANGED
@@ -1,56 +1,74 @@
1
  {
2
  "perception_temporal_action_loc": {
3
- "MLAB (claude-3-5-sonnet-v2)": 0.7810185077440877,
4
  "Top Human in Competition": 100.0,
5
- "MLAB (gemini-exp-1206)": -0.4731328246392113,
6
- "MLAB (o3-mini)": 0.3066106841553126,
7
- "MLAB (gpt-4o)": 0.3298075630252947,
8
- "MLAB (llama3-1-405b-instruct)": 0.5183240203504569,
9
- "CoI-Agent (o1) + MLAB (gpt-4o)": 0.3475212791527979
10
  },
11
  "llm-merging": {
12
- "CoI-Agent (o1) + MLAB (gpt-4o)": -0.9900989999019761,
13
  "Top Human in Competition": 100.0,
14
- "MLAB (claude-3-5-sonnet-v2)": 4.950495058915793,
15
- "MLAB (gemini-exp-1206)": 4.950495058915793,
16
- "MLAB (o3-mini)": -0.9900989999019761,
17
- "MLAB (gpt-4o)": 1.9801980295069084,
18
- "MLAB (llama3-1-405b-instruct)": -0.9900989999019761
19
  },
20
  "meta-learning": {
21
- "CoI-Agent (o1) + MLAB (gpt-4o)": 1.781401026144938,
22
  "Top Human in Competition": 100.0,
23
- "MLAB (claude-3-5-sonnet-v2)": 1.781401026144938,
24
- "MLAB (gemini-exp-1206)": 1.781401026144938,
25
- "MLAB (o3-mini)": -4.900331256476853,
26
- "MLAB (gpt-4o)": 1.781401026144938,
27
- "MLAB (llama3-1-405b-instruct)": 1.781401026144938
28
  },
29
  "product-recommendation": {
30
- "CoI-Agent (o1) + MLAB (gpt-4o)": 0.1459345029718814,
31
  "Top Human in Competition": 100.0,
32
- "MLAB (claude-3-5-sonnet-v2)": 2.9771372473170388,
33
- "MLAB (gemini-exp-1206)": 0.1459345029718814,
34
- "MLAB (o3-mini)": 0.1462759705510577,
35
- "MLAB (gpt-4o)": 0.6398666846799662,
36
- "MLAB (llama3-1-405b-instruct)": -7.044800459739471e-10
 
 
 
 
 
 
 
 
 
37
  },
38
  "machine_unlearning": {
39
- "CoI-Agent (o1) + MLAB (gpt-4o)": 11.832138969791846,
 
 
 
 
 
 
 
 
 
40
  "Top Human in Competition": 100.0,
41
- "MLAB (claude-3-5-sonnet-v2)": -94.71778374121965,
42
- "MLAB (gemini-exp-1206)": 5.632371576335568,
43
- "MLAB (o3-mini)": 3.623856546073656,
44
- "MLAB (gpt-4o)": -17.996962489965668,
45
- "MLAB (llama3-1-405b-instruct)": 6.2098517833311
46
  },
47
  "backdoor-trigger-recovery": {
48
- "CoI-Agent (o1) + MLAB (gpt-4o)": 6.1572772457753295,
49
  "Top Human in Competition": 100.0,
50
- "MLAB (claude-3-5-sonnet-v2)": 39.903815022493674,
51
- "MLAB (gemini-exp-1206)": 12.94287662739089,
52
- "MLAB (o3-mini)": 6.238823700218141,
53
- "MLAB (gpt-4o)": 10.386627431983776,
54
- "MLAB (llama3-1-405b-instruct)": 11.542228789066877
55
  }
56
  }
 
1
  {
2
  "perception_temporal_action_loc": {
3
+ "MLAB (claude-3-5-sonnet-v2)": 0.8,
4
  "Top Human in Competition": 100.0,
5
+ "MLAB (gemini-exp-1206)": -0.5,
6
+ "MLAB (o3-mini)": 0.3,
7
+ "MLAB (gpt-4o)": 0.3,
8
+ "MLAB (llama3-1-405b-instruct)": 0.5,
9
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 0.4
10
  },
11
  "llm-merging": {
12
+ "CoI-Agent (o1) + MLAB (gpt-4o)": -1.0,
13
  "Top Human in Competition": 100.0,
14
+ "MLAB (claude-3-5-sonnet-v2)": 5.0,
15
+ "MLAB (gemini-exp-1206)": 5.0,
16
+ "MLAB (o3-mini)": -1.0,
17
+ "MLAB (gpt-4o)": 2.0,
18
+ "MLAB (llama3-1-405b-instruct)": -1.0
19
  },
20
  "meta-learning": {
21
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 1.8,
22
  "Top Human in Competition": 100.0,
23
+ "MLAB (claude-3-5-sonnet-v2)": 1.8,
24
+ "MLAB (gemini-exp-1206)": 1.8,
25
+ "MLAB (o3-mini)": -4.9,
26
+ "MLAB (gpt-4o)": 1.8,
27
+ "MLAB (llama3-1-405b-instruct)": 1.8
28
  },
29
  "product-recommendation": {
30
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 0.6,
31
  "Top Human in Competition": 100.0,
32
+ "MLAB (claude-3-5-sonnet-v2)": 3.0,
33
+ "MLAB (gemini-exp-1206)": 0.1,
34
+ "MLAB (o3-mini)": 0.1,
35
+ "MLAB (gpt-4o)": 0.6,
36
+ "MLAB (llama3-1-405b-instruct)": -0.0
37
+ },
38
+ "weather_forcast": {
39
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 20.9,
40
+ "Top Human in Competition": 100.0,
41
+ "MLAB (claude-3-5-sonnet-v2)": 7.8,
42
+ "MLAB (gemini-exp-1206)": 22.9,
43
+ "MLAB (o3-mini)": 13.3,
44
+ "MLAB (gpt-4o)": 25.2,
45
+ "MLAB (llama3-1-405b-instruct)": 16.7
46
  },
47
  "machine_unlearning": {
48
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 14.2,
49
+ "Top Human in Competition": 100.0,
50
+ "MLAB (claude-3-5-sonnet-v2)": -94.7,
51
+ "MLAB (gemini-exp-1206)": 5.6,
52
+ "MLAB (o3-mini)": 3.6,
53
+ "MLAB (gpt-4o)": -18.0,
54
+ "MLAB (llama3-1-405b-instruct)": 6.2
55
+ },
56
+ "erasing_invisible_watermarks": {
57
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 84.0,
58
  "Top Human in Competition": 100.0,
59
+ "MLAB (claude-3-5-sonnet-v2)": 87.6,
60
+ "MLAB (gemini-exp-1206)": 97.5,
61
+ "MLAB (o3-mini)": 83.4,
62
+ "MLAB (gpt-4o)": 83.4,
63
+ "MLAB (llama3-1-405b-instruct)": 83.4
64
  },
65
  "backdoor-trigger-recovery": {
66
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 13.7,
67
  "Top Human in Competition": 100.0,
68
+ "MLAB (claude-3-5-sonnet-v2)": 39.9,
69
+ "MLAB (gemini-exp-1206)": 12.9,
70
+ "MLAB (o3-mini)": 6.2,
71
+ "MLAB (gpt-4o)": 10.4,
72
+ "MLAB (llama3-1-405b-instruct)": 11.5
73
  }
74
  }