Armeddinosaur committed on
Commit
678bdbb
·
1 Parent(s): eea50e2

Updating metrics

Browse files
src/components/filters.py CHANGED
@@ -20,6 +20,10 @@ def initialize_session_state(df):
20
  if 'selected_tasks' not in st.session_state:
21
  # Select all tasks by default, excluding Model Type
22
  st.session_state.selected_tasks = [col for col in df.columns if col not in ['Model Type']]
 
 
 
 
23
 
24
  if 'selected_model_types' not in st.session_state:
25
  # Ensure all model types are selected by default
 
20
  if 'selected_tasks' not in st.session_state:
21
  # Select all tasks by default, excluding Model Type
22
  st.session_state.selected_tasks = [col for col in df.columns if col not in ['Model Type']]
23
+ else:
24
+ # Make sure selected_tasks only includes actual tasks from the dataframe
25
+ valid_tasks = [col for col in df.columns if col not in ['Model Type']]
26
+ st.session_state.selected_tasks = [task for task in st.session_state.selected_tasks if task in valid_tasks]
27
 
28
  if 'selected_model_types' not in st.session_state:
29
  # Ensure all model types are selected by default
src/data/metrics/absolute_improvement_to_baseline.json CHANGED
@@ -6,7 +6,8 @@
6
  "MLAB (o3-mini)": 0.9,
7
  "MLAB (gpt-4o)": 0.9,
8
  "MLAB (llama3-1-405b-instruct)": 1.5,
9
- "CoI-Agent (o1) + MLAB (gpt-4o)": 1.0
 
10
  },
11
  "llm-merging": {
12
  "CoI-Agent (o1) + MLAB (gpt-4o)": -0.7,
@@ -15,60 +16,57 @@
15
  "MLAB (gemini-exp-1206)": 3.4,
16
  "MLAB (o3-mini)": -0.7,
17
  "MLAB (gpt-4o)": 1.4,
18
- "MLAB (llama3-1-405b-instruct)": -0.7
19
- },
20
- "meta-learning": {
21
- "CoI-Agent (o1) + MLAB (gpt-4o)": 5.4,
22
- "Top Human in Competition": 304.5,
23
- "MLAB (claude-3-5-sonnet-v2)": 5.4,
24
- "MLAB (gemini-exp-1206)": 5.4,
25
- "MLAB (o3-mini)": -14.9,
26
- "MLAB (gpt-4o)": 5.4,
27
- "MLAB (llama3-1-405b-instruct)": 5.4
28
  },
29
  "product-recommendation": {
30
- "CoI-Agent (o1) + MLAB (gpt-4o)": 2.3,
31
- "Top Human in Competition": 412.6,
32
  "MLAB (claude-3-5-sonnet-v2)": 12.3,
 
33
  "MLAB (gemini-exp-1206)": 0.6,
34
  "MLAB (o3-mini)": 0.6,
35
  "MLAB (gpt-4o)": 2.6,
36
- "MLAB (llama3-1-405b-instruct)": -0.0
 
 
37
  },
38
  "weather_forcast": {
39
  "CoI-Agent (o1) + MLAB (gpt-4o)": 83.6,
40
- "Top Human in Competition": 399.4,
 
41
  "MLAB (claude-3-5-sonnet-v2)": 31.0,
42
  "MLAB (gemini-exp-1206)": 91.4,
43
  "MLAB (o3-mini)": 53.3,
44
  "MLAB (gpt-4o)": 100.8,
45
  "MLAB (llama3-1-405b-instruct)": 66.7
46
  },
 
 
 
 
 
 
 
 
 
 
47
  "machine_unlearning": {
48
- "CoI-Agent (o1) + MLAB (gpt-4o)": 8.8,
49
  "Top Human in Competition": 61.9,
 
50
  "MLAB (claude-3-5-sonnet-v2)": -58.6,
51
  "MLAB (gemini-exp-1206)": 3.5,
52
  "MLAB (o3-mini)": 2.2,
53
  "MLAB (gpt-4o)": -11.1,
54
  "MLAB (llama3-1-405b-instruct)": 3.8
55
  },
56
- "erasing_invisible_watermarks": {
57
- "CoI-Agent (o1) + MLAB (gpt-4o)": 80.3,
58
- "Top Human in Competition": 95.6,
59
- "MLAB (claude-3-5-sonnet-v2)": 83.7,
60
- "MLAB (gemini-exp-1206)": 93.3,
61
- "MLAB (o3-mini)": 79.8,
62
- "MLAB (gpt-4o)": 79.8,
63
- "MLAB (llama3-1-405b-instruct)": 79.8
64
- },
65
  "backdoor-trigger-recovery": {
66
- "CoI-Agent (o1) + MLAB (gpt-4o)": 85.0,
67
  "Top Human in Competition": 621.3,
 
68
  "MLAB (claude-3-5-sonnet-v2)": 247.9,
69
  "MLAB (gemini-exp-1206)": 80.4,
70
  "MLAB (o3-mini)": 38.8,
71
- "MLAB (gpt-4o)": 64.5,
72
- "MLAB (llama3-1-405b-instruct)": 71.7
73
  }
74
  }
 
6
  "MLAB (o3-mini)": 0.9,
7
  "MLAB (gpt-4o)": 0.9,
8
  "MLAB (llama3-1-405b-instruct)": 1.5,
9
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 1.0,
10
+ "Human Idea + MLAB (gpt-4o)": 1.5
11
  },
12
  "llm-merging": {
13
  "CoI-Agent (o1) + MLAB (gpt-4o)": -0.7,
 
16
  "MLAB (gemini-exp-1206)": 3.4,
17
  "MLAB (o3-mini)": -0.7,
18
  "MLAB (gpt-4o)": 1.4,
19
+ "MLAB (llama3-1-405b-instruct)": -0.7,
20
+ "Human Idea + MLAB (gpt-4o)": -0.7
 
 
 
 
 
 
 
 
21
  },
22
  "product-recommendation": {
 
 
23
  "MLAB (claude-3-5-sonnet-v2)": 12.3,
24
+ "Top Human in Competition": 412.6,
25
  "MLAB (gemini-exp-1206)": 0.6,
26
  "MLAB (o3-mini)": 0.6,
27
  "MLAB (gpt-4o)": 2.6,
28
+ "MLAB (llama3-1-405b-instruct)": -0.0,
29
+ "Human Idea + MLAB (gpt-4o)": 8.9,
30
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 0.6
31
  },
32
  "weather_forcast": {
33
  "CoI-Agent (o1) + MLAB (gpt-4o)": 83.6,
34
+ "Top Human in Competition": 212.0,
35
+ "Human Idea + MLAB (gpt-4o)": 26.1,
36
  "MLAB (claude-3-5-sonnet-v2)": 31.0,
37
  "MLAB (gemini-exp-1206)": 91.4,
38
  "MLAB (o3-mini)": 53.3,
39
  "MLAB (gpt-4o)": 100.8,
40
  "MLAB (llama3-1-405b-instruct)": 66.7
41
  },
42
+ "meta-learning": {
43
+ "MLAB (claude-3-5-sonnet-v2)": -14.9,
44
+ "Top Human in Competition": 304.5,
45
+ "MLAB (gemini-exp-1206)": -3.2,
46
+ "MLAB (o3-mini)": -14.9,
47
+ "MLAB (gpt-4o)": -14.9,
48
+ "MLAB (llama3-1-405b-instruct)": -14.9,
49
+ "Human Idea + MLAB (gpt-4o)": -14.9,
50
+ "CoI-Agent (o1) + MLAB (gpt-4o)": -14.9
51
+ },
52
  "machine_unlearning": {
53
+ "Human Idea + MLAB (gpt-4o)": 4.2,
54
  "Top Human in Competition": 61.9,
55
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 7.3,
56
  "MLAB (claude-3-5-sonnet-v2)": -58.6,
57
  "MLAB (gemini-exp-1206)": 3.5,
58
  "MLAB (o3-mini)": 2.2,
59
  "MLAB (gpt-4o)": -11.1,
60
  "MLAB (llama3-1-405b-instruct)": 3.8
61
  },
 
 
 
 
 
 
 
 
 
62
  "backdoor-trigger-recovery": {
63
+ "MLAB (gpt-4o)": 74.0,
64
  "Top Human in Competition": 621.3,
65
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 24.9,
66
  "MLAB (claude-3-5-sonnet-v2)": 247.9,
67
  "MLAB (gemini-exp-1206)": 80.4,
68
  "MLAB (o3-mini)": 38.8,
69
+ "MLAB (llama3-1-405b-instruct)": 71.7,
70
+ "Human Idea + MLAB (gpt-4o)": 54.5
71
  }
72
  }
src/data/metrics/relative_improvement_to_human.json CHANGED
@@ -6,7 +6,8 @@
6
  "MLAB (o3-mini)": 0.3,
7
  "MLAB (gpt-4o)": 0.3,
8
  "MLAB (llama3-1-405b-instruct)": 0.5,
9
- "CoI-Agent (o1) + MLAB (gpt-4o)": 0.4
 
10
  },
11
  "llm-merging": {
12
  "CoI-Agent (o1) + MLAB (gpt-4o)": -1.0,
@@ -15,60 +16,57 @@
15
  "MLAB (gemini-exp-1206)": 5.0,
16
  "MLAB (o3-mini)": -1.0,
17
  "MLAB (gpt-4o)": 2.0,
18
- "MLAB (llama3-1-405b-instruct)": -1.0
19
- },
20
- "meta-learning": {
21
- "CoI-Agent (o1) + MLAB (gpt-4o)": 1.8,
22
- "Top Human in Competition": 100.0,
23
- "MLAB (claude-3-5-sonnet-v2)": 1.8,
24
- "MLAB (gemini-exp-1206)": 1.8,
25
- "MLAB (o3-mini)": -4.9,
26
- "MLAB (gpt-4o)": 1.8,
27
- "MLAB (llama3-1-405b-instruct)": 1.8
28
  },
29
  "product-recommendation": {
30
- "CoI-Agent (o1) + MLAB (gpt-4o)": 0.6,
31
- "Top Human in Competition": 100.0,
32
  "MLAB (claude-3-5-sonnet-v2)": 3.0,
 
33
  "MLAB (gemini-exp-1206)": 0.1,
34
  "MLAB (o3-mini)": 0.1,
35
  "MLAB (gpt-4o)": 0.6,
36
- "MLAB (llama3-1-405b-instruct)": -0.0
 
 
37
  },
38
  "weather_forcast": {
39
- "CoI-Agent (o1) + MLAB (gpt-4o)": 20.9,
40
  "Top Human in Competition": 100.0,
41
- "MLAB (claude-3-5-sonnet-v2)": 7.8,
42
- "MLAB (gemini-exp-1206)": 22.9,
43
- "MLAB (o3-mini)": 13.3,
44
- "MLAB (gpt-4o)": 25.2,
45
- "MLAB (llama3-1-405b-instruct)": 16.7
 
 
 
 
 
 
 
 
 
 
 
46
  },
47
  "machine_unlearning": {
48
- "CoI-Agent (o1) + MLAB (gpt-4o)": 14.2,
49
  "Top Human in Competition": 100.0,
 
50
  "MLAB (claude-3-5-sonnet-v2)": -94.7,
51
  "MLAB (gemini-exp-1206)": 5.6,
52
  "MLAB (o3-mini)": 3.6,
53
  "MLAB (gpt-4o)": -18.0,
54
  "MLAB (llama3-1-405b-instruct)": 6.2
55
  },
56
- "erasing_invisible_watermarks": {
57
- "CoI-Agent (o1) + MLAB (gpt-4o)": 84.0,
58
- "Top Human in Competition": 100.0,
59
- "MLAB (claude-3-5-sonnet-v2)": 87.6,
60
- "MLAB (gemini-exp-1206)": 97.5,
61
- "MLAB (o3-mini)": 83.4,
62
- "MLAB (gpt-4o)": 83.4,
63
- "MLAB (llama3-1-405b-instruct)": 83.4
64
- },
65
  "backdoor-trigger-recovery": {
66
- "CoI-Agent (o1) + MLAB (gpt-4o)": 13.7,
67
  "Top Human in Competition": 100.0,
 
68
  "MLAB (claude-3-5-sonnet-v2)": 39.9,
69
  "MLAB (gemini-exp-1206)": 12.9,
70
  "MLAB (o3-mini)": 6.2,
71
- "MLAB (gpt-4o)": 10.4,
72
- "MLAB (llama3-1-405b-instruct)": 11.5
73
  }
74
  }
 
6
  "MLAB (o3-mini)": 0.3,
7
  "MLAB (gpt-4o)": 0.3,
8
  "MLAB (llama3-1-405b-instruct)": 0.5,
9
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 0.4,
10
+ "Human Idea + MLAB (gpt-4o)": 0.5
11
  },
12
  "llm-merging": {
13
  "CoI-Agent (o1) + MLAB (gpt-4o)": -1.0,
 
16
  "MLAB (gemini-exp-1206)": 5.0,
17
  "MLAB (o3-mini)": -1.0,
18
  "MLAB (gpt-4o)": 2.0,
19
+ "MLAB (llama3-1-405b-instruct)": -1.0,
20
+ "Human Idea + MLAB (gpt-4o)": -1.0
 
 
 
 
 
 
 
 
21
  },
22
  "product-recommendation": {
 
 
23
  "MLAB (claude-3-5-sonnet-v2)": 3.0,
24
+ "Top Human in Competition": 100.0,
25
  "MLAB (gemini-exp-1206)": 0.1,
26
  "MLAB (o3-mini)": 0.1,
27
  "MLAB (gpt-4o)": 0.6,
28
+ "MLAB (llama3-1-405b-instruct)": -0.0,
29
+ "Human Idea + MLAB (gpt-4o)": 2.2,
30
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 0.1
31
  },
32
  "weather_forcast": {
33
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 39.4,
34
  "Top Human in Competition": 100.0,
35
+ "Human Idea + MLAB (gpt-4o)": 12.3,
36
+ "MLAB (claude-3-5-sonnet-v2)": 14.6,
37
+ "MLAB (gemini-exp-1206)": 43.1,
38
+ "MLAB (o3-mini)": 25.1,
39
+ "MLAB (gpt-4o)": 47.5,
40
+ "MLAB (llama3-1-405b-instruct)": 31.5
41
+ },
42
+ "meta-learning": {
43
+ "MLAB (claude-3-5-sonnet-v2)": -4.9,
44
+ "Top Human in Competition": 100.0,
45
+ "MLAB (gemini-exp-1206)": -1.1,
46
+ "MLAB (o3-mini)": -4.9,
47
+ "MLAB (gpt-4o)": -4.9,
48
+ "MLAB (llama3-1-405b-instruct)": -4.9,
49
+ "Human Idea + MLAB (gpt-4o)": -4.9,
50
+ "CoI-Agent (o1) + MLAB (gpt-4o)": -4.9
51
  },
52
  "machine_unlearning": {
53
+ "Human Idea + MLAB (gpt-4o)": 6.8,
54
  "Top Human in Competition": 100.0,
55
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 11.8,
56
  "MLAB (claude-3-5-sonnet-v2)": -94.7,
57
  "MLAB (gemini-exp-1206)": 5.6,
58
  "MLAB (o3-mini)": 3.6,
59
  "MLAB (gpt-4o)": -18.0,
60
  "MLAB (llama3-1-405b-instruct)": 6.2
61
  },
 
 
 
 
 
 
 
 
 
62
  "backdoor-trigger-recovery": {
63
+ "MLAB (gpt-4o)": 11.9,
64
  "Top Human in Competition": 100.0,
65
+ "CoI-Agent (o1) + MLAB (gpt-4o)": 4.0,
66
  "MLAB (claude-3-5-sonnet-v2)": 39.9,
67
  "MLAB (gemini-exp-1206)": 12.9,
68
  "MLAB (o3-mini)": 6.2,
69
+ "MLAB (llama3-1-405b-instruct)": 11.5,
70
+ "Human Idea + MLAB (gpt-4o)": 8.8
71
  }
72
  }
src/utils/config.py CHANGED
@@ -82,7 +82,8 @@ model_categories = {
82
  "MLAB (llama3-1-405b-instruct)": "Open Weights",
83
  "CoI-Agent (o1) + MLAB (gpt-4o)": "Closed Source",
84
  "Human": "Human",
85
- "Top Human in Competition": "Human"
 
86
  # More models would be added here as needed
87
  }
88
 
 
82
  "MLAB (llama3-1-405b-instruct)": "Open Weights",
83
  "CoI-Agent (o1) + MLAB (gpt-4o)": "Closed Source",
84
  "Human": "Human",
85
+ "Top Human in Competition": "Human",
86
+ "Human Idea + MLAB (gpt-4o)": "Closed Source"
87
  # More models would be added here as needed
88
  }
89
 
src/utils/task_mapping.py CHANGED
@@ -8,6 +8,7 @@ task_display_names = {
8
  "Llm Merging": "LLM Merging",
9
  "Meta Learning": "Meta Learning",
10
  "Product Recommendation": "Next Product Recommendation",
 
11
  "Machine Unlearning": "Machine Unlearning",
12
  "Backdoor Trigger Recovery": "Backdoor Trigger Recovery"
13
  }
 
8
  "Llm Merging": "LLM Merging",
9
  "Meta Learning": "Meta Learning",
10
  "Product Recommendation": "Next Product Recommendation",
11
+ "Weather Forcast": "Rainfall Prediction",
12
  "Machine Unlearning": "Machine Unlearning",
13
  "Backdoor Trigger Recovery": "Backdoor Trigger Recovery"
14
  }