Commit 678bdbb
Parent(s): eea50e2

Updating metrics
src/components/filters.py
CHANGED
@@ -20,6 +20,10 @@ def initialize_session_state(df):
     if 'selected_tasks' not in st.session_state:
         # Select all tasks by default, excluding Model Type
         st.session_state.selected_tasks = [col for col in df.columns if col not in ['Model Type']]
+    else:
+        # Make sure selected_tasks only includes actual tasks from the dataframe
+        valid_tasks = [col for col in df.columns if col not in ['Model Type']]
+        st.session_state.selected_tasks = [task for task in st.session_state.selected_tasks if task in valid_tasks]
 
     if 'selected_model_types' not in st.session_state:
         # Ensure all model types are selected by default
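The new else branch keeps st.session_state.selected_tasks in sync when the dataframe's columns change between reruns, so stale selections no longer leak into the filters. A minimal sketch of that pruning logic, with a plain dict standing in for st.session_state and illustrative column names that are not part of this repo:

# Sketch of the pruning logic added above; a plain dict stands in for
# st.session_state and a list of column names stands in for df.columns.
def prune_selected_tasks(session_state: dict, columns: list[str]) -> None:
    valid_tasks = [col for col in columns if col not in ['Model Type']]
    if 'selected_tasks' not in session_state:
        # First run: select every task column by default
        session_state['selected_tasks'] = valid_tasks
    else:
        # Later runs: drop selections that no longer exist in the dataframe
        session_state['selected_tasks'] = [
            task for task in session_state['selected_tasks'] if task in valid_tasks
        ]

state = {'selected_tasks': ['LLM Merging', 'Old Task', 'Model Type']}
prune_selected_tasks(state, ['Model Type', 'LLM Merging', 'Rainfall Prediction'])
print(state['selected_tasks'])  # ['LLM Merging']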
src/data/metrics/absolute_improvement_to_baseline.json
CHANGED
@@ -6,7 +6,8 @@
     "MLAB (o3-mini)": 0.9,
     "MLAB (gpt-4o)": 0.9,
     "MLAB (llama3-1-405b-instruct)": 1.5,
-    "CoI-Agent (o1) + MLAB (gpt-4o)": 1.0
+    "CoI-Agent (o1) + MLAB (gpt-4o)": 1.0,
+    "Human Idea + MLAB (gpt-4o)": 1.5
   },
   "llm-merging": {
     "CoI-Agent (o1) + MLAB (gpt-4o)": -0.7,
@@ -15,60 +16,57 @@
     "MLAB (gemini-exp-1206)": 3.4,
     "MLAB (o3-mini)": -0.7,
     "MLAB (gpt-4o)": 1.4,
-    "MLAB (llama3-1-405b-instruct)": -0.7
-  },
-  "meta-learning": {
-    "CoI-Agent (o1) + MLAB (gpt-4o)": 5.4,
-    "Top Human in Competition": 304.5,
-    "MLAB (claude-3-5-sonnet-v2)": 5.4,
-    "MLAB (gemini-exp-1206)": 5.4,
-    "MLAB (o3-mini)": -14.9,
-    "MLAB (gpt-4o)": 5.4,
-    "MLAB (llama3-1-405b-instruct)": 5.4
+    "MLAB (llama3-1-405b-instruct)": -0.7,
+    "Human Idea + MLAB (gpt-4o)": -0.7
   },
   "product-recommendation": {
-    "CoI-Agent (o1) + MLAB (gpt-4o)": 2.3,
-    "Top Human in Competition": 412.6,
     "MLAB (claude-3-5-sonnet-v2)": 12.3,
+    "Top Human in Competition": 412.6,
     "MLAB (gemini-exp-1206)": 0.6,
     "MLAB (o3-mini)": 0.6,
     "MLAB (gpt-4o)": 2.6,
-    "MLAB (llama3-1-405b-instruct)": -0.0
+    "MLAB (llama3-1-405b-instruct)": -0.0,
+    "Human Idea + MLAB (gpt-4o)": 8.9,
+    "CoI-Agent (o1) + MLAB (gpt-4o)": 0.6
   },
   "weather_forcast": {
     "CoI-Agent (o1) + MLAB (gpt-4o)": 83.6,
-    "Top Human in Competition":
+    "Top Human in Competition": 212.0,
+    "Human Idea + MLAB (gpt-4o)": 26.1,
     "MLAB (claude-3-5-sonnet-v2)": 31.0,
     "MLAB (gemini-exp-1206)": 91.4,
     "MLAB (o3-mini)": 53.3,
     "MLAB (gpt-4o)": 100.8,
     "MLAB (llama3-1-405b-instruct)": 66.7
   },
+  "meta-learning": {
+    "MLAB (claude-3-5-sonnet-v2)": -14.9,
+    "Top Human in Competition": 304.5,
+    "MLAB (gemini-exp-1206)": -3.2,
+    "MLAB (o3-mini)": -14.9,
+    "MLAB (gpt-4o)": -14.9,
+    "MLAB (llama3-1-405b-instruct)": -14.9,
+    "Human Idea + MLAB (gpt-4o)": -14.9,
+    "CoI-Agent (o1) + MLAB (gpt-4o)": -14.9
+  },
   "machine_unlearning": {
-    "
+    "Human Idea + MLAB (gpt-4o)": 4.2,
     "Top Human in Competition": 61.9,
+    "CoI-Agent (o1) + MLAB (gpt-4o)": 7.3,
     "MLAB (claude-3-5-sonnet-v2)": -58.6,
     "MLAB (gemini-exp-1206)": 3.5,
     "MLAB (o3-mini)": 2.2,
     "MLAB (gpt-4o)": -11.1,
     "MLAB (llama3-1-405b-instruct)": 3.8
   },
-  "erasing_invisible_watermarks": {
-    "CoI-Agent (o1) + MLAB (gpt-4o)": 80.3,
-    "Top Human in Competition": 95.6,
-    "MLAB (claude-3-5-sonnet-v2)": 83.7,
-    "MLAB (gemini-exp-1206)": 93.3,
-    "MLAB (o3-mini)": 79.8,
-    "MLAB (gpt-4o)": 79.8,
-    "MLAB (llama3-1-405b-instruct)": 79.8
-  },
   "backdoor-trigger-recovery": {
-    "
+    "MLAB (gpt-4o)": 74.0,
     "Top Human in Competition": 621.3,
+    "CoI-Agent (o1) + MLAB (gpt-4o)": 24.9,
     "MLAB (claude-3-5-sonnet-v2)": 247.9,
     "MLAB (gemini-exp-1206)": 80.4,
     "MLAB (o3-mini)": 38.8,
-    "MLAB (
-    "MLAB (
+    "MLAB (llama3-1-405b-instruct)": 71.7,
+    "Human Idea + MLAB (gpt-4o)": 54.5
   }
 }
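This commit adds "Human Idea + MLAB (gpt-4o)" entries to the metrics files and restores several task blocks. A hedged sketch of how a consumer might load and pivot the updated JSON follows; the path constant and the metrics_by_model helper are illustrative assumptions, not code from this Space:

import json
from pathlib import Path

# Hypothetical consumer of the updated metrics file; the path and the
# pivoting helper below are illustrative, not code from this repo.
METRICS_PATH = Path("src/data/metrics/absolute_improvement_to_baseline.json")

def metrics_by_model(path: Path) -> dict[str, dict[str, float]]:
    """Pivot {task: {model: score}} into {model: {task: score}}."""
    with path.open() as f:
        task_scores = json.load(f)
    pivoted: dict[str, dict[str, float]] = {}
    for task, scores in task_scores.items():
        for model, value in scores.items():
            pivoted.setdefault(model, {})[task] = value
    return pivoted

if __name__ == "__main__":
    by_model = metrics_by_model(METRICS_PATH)
    # Tasks the new agent has scores for after this commit
    print(sorted(by_model.get("Human Idea + MLAB (gpt-4o)", {})))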
src/data/metrics/relative_improvement_to_human.json
CHANGED
@@ -6,7 +6,8 @@
     "MLAB (o3-mini)": 0.3,
     "MLAB (gpt-4o)": 0.3,
     "MLAB (llama3-1-405b-instruct)": 0.5,
-    "CoI-Agent (o1) + MLAB (gpt-4o)": 0.4
+    "CoI-Agent (o1) + MLAB (gpt-4o)": 0.4,
+    "Human Idea + MLAB (gpt-4o)": 0.5
   },
   "llm-merging": {
     "CoI-Agent (o1) + MLAB (gpt-4o)": -1.0,
@@ -15,60 +16,57 @@
     "MLAB (gemini-exp-1206)": 5.0,
     "MLAB (o3-mini)": -1.0,
     "MLAB (gpt-4o)": 2.0,
-    "MLAB (llama3-1-405b-instruct)": -1.0
-  },
-  "meta-learning": {
-    "CoI-Agent (o1) + MLAB (gpt-4o)": 1.8,
-    "Top Human in Competition": 100.0,
-    "MLAB (claude-3-5-sonnet-v2)": 1.8,
-    "MLAB (gemini-exp-1206)": 1.8,
-    "MLAB (o3-mini)": -4.9,
-    "MLAB (gpt-4o)": 1.8,
-    "MLAB (llama3-1-405b-instruct)": 1.8
+    "MLAB (llama3-1-405b-instruct)": -1.0,
+    "Human Idea + MLAB (gpt-4o)": -1.0
   },
   "product-recommendation": {
-    "CoI-Agent (o1) + MLAB (gpt-4o)": 0.6,
-    "Top Human in Competition": 100.0,
     "MLAB (claude-3-5-sonnet-v2)": 3.0,
+    "Top Human in Competition": 100.0,
     "MLAB (gemini-exp-1206)": 0.1,
     "MLAB (o3-mini)": 0.1,
     "MLAB (gpt-4o)": 0.6,
-    "MLAB (llama3-1-405b-instruct)": -0.0
+    "MLAB (llama3-1-405b-instruct)": -0.0,
+    "Human Idea + MLAB (gpt-4o)": 2.2,
+    "CoI-Agent (o1) + MLAB (gpt-4o)": 0.1
   },
   "weather_forcast": {
-    "CoI-Agent (o1) + MLAB (gpt-4o)":
+    "CoI-Agent (o1) + MLAB (gpt-4o)": 39.4,
     "Top Human in Competition": 100.0,
-    "MLAB (
-    "MLAB (
-    "MLAB (
-    "MLAB (
-    "MLAB (
+    "Human Idea + MLAB (gpt-4o)": 12.3,
+    "MLAB (claude-3-5-sonnet-v2)": 14.6,
+    "MLAB (gemini-exp-1206)": 43.1,
+    "MLAB (o3-mini)": 25.1,
+    "MLAB (gpt-4o)": 47.5,
+    "MLAB (llama3-1-405b-instruct)": 31.5
+  },
+  "meta-learning": {
+    "MLAB (claude-3-5-sonnet-v2)": -4.9,
+    "Top Human in Competition": 100.0,
+    "MLAB (gemini-exp-1206)": -1.1,
+    "MLAB (o3-mini)": -4.9,
+    "MLAB (gpt-4o)": -4.9,
+    "MLAB (llama3-1-405b-instruct)": -4.9,
+    "Human Idea + MLAB (gpt-4o)": -4.9,
+    "CoI-Agent (o1) + MLAB (gpt-4o)": -4.9
   },
   "machine_unlearning": {
-    "
+    "Human Idea + MLAB (gpt-4o)": 6.8,
     "Top Human in Competition": 100.0,
+    "CoI-Agent (o1) + MLAB (gpt-4o)": 11.8,
     "MLAB (claude-3-5-sonnet-v2)": -94.7,
     "MLAB (gemini-exp-1206)": 5.6,
     "MLAB (o3-mini)": 3.6,
     "MLAB (gpt-4o)": -18.0,
     "MLAB (llama3-1-405b-instruct)": 6.2
   },
-  "erasing_invisible_watermarks": {
-    "CoI-Agent (o1) + MLAB (gpt-4o)": 84.0,
-    "Top Human in Competition": 100.0,
-    "MLAB (claude-3-5-sonnet-v2)": 87.6,
-    "MLAB (gemini-exp-1206)": 97.5,
-    "MLAB (o3-mini)": 83.4,
-    "MLAB (gpt-4o)": 83.4,
-    "MLAB (llama3-1-405b-instruct)": 83.4
-  },
   "backdoor-trigger-recovery": {
-    "
+    "MLAB (gpt-4o)": 11.9,
     "Top Human in Competition": 100.0,
+    "CoI-Agent (o1) + MLAB (gpt-4o)": 4.0,
     "MLAB (claude-3-5-sonnet-v2)": 39.9,
     "MLAB (gemini-exp-1206)": 12.9,
     "MLAB (o3-mini)": 6.2,
-    "MLAB (
-    "MLAB (
+    "MLAB (llama3-1-405b-instruct)": 11.5,
+    "Human Idea + MLAB (gpt-4o)": 8.8
   }
 }
src/utils/config.py
CHANGED
@@ -82,7 +82,8 @@ model_categories = {
     "MLAB (llama3-1-405b-instruct)": "Open Weights",
     "CoI-Agent (o1) + MLAB (gpt-4o)": "Closed Source",
     "Human": "Human",
-    "Top Human in Competition": "Human"
+    "Top Human in Competition": "Human",
+    "Human Idea + MLAB (gpt-4o)": "Closed Source"
     # More models would be added here as needed
 }
 
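model_categories presumably feeds the Model Type grouping in the UI, so the new agent needs a category entry here as well. A small sketch of that kind of grouping; the sample dict mirrors the entries shown above, while group_by_category is an illustrative helper, not code from the repo:

# Sketch of how a mapping like model_categories could drive grouping in the
# leaderboard; group_by_category is an assumption for illustration only.
model_categories = {
    "MLAB (llama3-1-405b-instruct)": "Open Weights",
    "CoI-Agent (o1) + MLAB (gpt-4o)": "Closed Source",
    "Human": "Human",
    "Top Human in Competition": "Human",
    "Human Idea + MLAB (gpt-4o)": "Closed Source",
}

def group_by_category(categories: dict[str, str]) -> dict[str, list[str]]:
    groups: dict[str, list[str]] = {}
    for model, category in categories.items():
        groups.setdefault(category, []).append(model)
    return groups

print(group_by_category(model_categories))
# e.g. {'Open Weights': [...], 'Closed Source': [..., 'Human Idea + MLAB (gpt-4o)'], 'Human': [...]}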
src/utils/task_mapping.py
CHANGED
@@ -8,6 +8,7 @@ task_display_names = {
     "Llm Merging": "LLM Merging",
     "Meta Learning": "Meta Learning",
     "Product Recommendation": "Next Product Recommendation",
+    "Weather Forcast": "Rainfall Prediction",
     "Machine Unlearning": "Machine Unlearning",
     "Backdoor Trigger Recovery": "Backdoor Trigger Recovery"
 }
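The new entry maps the raw weather_forcast task to the "Rainfall Prediction" display label. A sketch of how such a lookup might work end to end; the title-casing convention and the display_name helper are assumptions for illustration, not code from this Space:

# Sketch of a display-name lookup built on task_display_names; the dict copies
# the mapping after this commit, the helper is hypothetical.
task_display_names = {
    "Llm Merging": "LLM Merging",
    "Meta Learning": "Meta Learning",
    "Product Recommendation": "Next Product Recommendation",
    "Weather Forcast": "Rainfall Prediction",
    "Machine Unlearning": "Machine Unlearning",
    "Backdoor Trigger Recovery": "Backdoor Trigger Recovery",
}

def display_name(raw_task: str) -> str:
    """Map a raw key such as 'weather_forcast' to its display label."""
    title_cased = raw_task.replace("_", " ").replace("-", " ").title()
    return task_display_names.get(title_cased, title_cased)

print(display_name("weather_forcast"))            # Rainfall Prediction
print(display_name("backdoor-trigger-recovery"))  # Backdoor Trigger Recovery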