Spaces:
Running
Running
update
Browse files
- constants.py +3 -0
- static/eval_results/Default/self_reported.json +23 -6
- utils.py +13 -4
constants.py
CHANGED
@@ -261,6 +261,9 @@ MODEL_URLS = {
|
|
261 |
"InternVL3_14B": "https://huggingface.co/OpenGVLab/InternVL3-14B",
|
262 |
"InternVL3_38B": "https://huggingface.co/OpenGVLab/InternVL3-38B",
|
263 |
"InternVL3_78B": "https://huggingface.co/OpenGVLab/InternVL3-78B",
|
|
|
|
|
|
|
264 |
}
|
265 |
|
266 |
# Define the base MODEL_GROUPS structure
|
|
|
261 |
"InternVL3_14B": "https://huggingface.co/OpenGVLab/InternVL3-14B",
|
262 |
"InternVL3_38B": "https://huggingface.co/OpenGVLab/InternVL3-38B",
|
263 |
"InternVL3_78B": "https://huggingface.co/OpenGVLab/InternVL3-78B",
|
264 |
+
"GPT-o1": "https://openai.com/o1/",
|
265 |
+
"GPT-o1-mini": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/",
|
266 |
+
"Seed1.5-VL": "https://github.com/ByteDance-Seed/Seed1.5-VL",
|
267 |
}
|
268 |
|
269 |
# Define the base MODEL_GROUPS structure
|
static/eval_results/Default/self_reported.json
CHANGED
@@ -1,8 +1,25 @@
|
|
1 |
{
|
2 |
-
"MiniMax-VL-01":
|
3 |
-
|
4 |
-
|
5 |
-
"Qwen2.5-VL-
|
6 |
-
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
}
|
|
|
1 |
{
|
2 |
+
"MiniMax-VL-01": {
|
3 |
+
"overall": 47.4
|
4 |
+
},
|
5 |
+
"Qwen2.5-VL-72B": {
|
6 |
+
"overall": 51.3
|
7 |
+
},
|
8 |
+
"Qwen2.5-VL-7B": {
|
9 |
+
"overall": 36.8
|
10 |
+
},
|
11 |
+
"Qwen2.5-VL-3B": {
|
12 |
+
"overall": 28.9
|
13 |
+
},
|
14 |
+
"GPT-o1": {
|
15 |
+
"overall": 58.0
|
16 |
+
},
|
17 |
+
"GPT-o1-mini": {
|
18 |
+
"overall": 54.2
|
19 |
+
},
|
20 |
+
"Seed1.5-VL": {
|
21 |
+
"overall": 59.85,
|
22 |
+
"core": 58.58,
|
23 |
+
"open": 68.46
|
24 |
+
}
|
25 |
}
|
utils.py
CHANGED
@@ -106,11 +106,20 @@ class MEGABenchEvalDataLoader:
|
|
106 |
|
107 |
# Add asterisk for self-reported results
|
108 |
if model in self.SELF_REPORTED:
|
|
|
109 |
# Store numeric value for sorting but display with asterisk
|
110 |
-
row["Overall"] =
|
111 |
-
row["Overall_display"] = f"{
|
112 |
-
|
113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
for display_name in self.SUPER_GROUPS[selected_super_group]:
|
115 |
row[display_name] = None
|
116 |
else:
|
|
|
106 |
|
107 |
# Add asterisk for self-reported results
|
108 |
if model in self.SELF_REPORTED:
|
109 |
+
model_scores = self.SELF_REPORTED[model]
|
110 |
# Store numeric value for sorting but display with asterisk
|
111 |
+
row["Overall"] = model_scores["overall"]
|
112 |
+
row["Overall_display"] = f"{model_scores['overall']:.2f}*"
|
113 |
+
# Handle optional core and open scores
|
114 |
+
core_score = model_scores.get("core")
|
115 |
+
open_score = model_scores.get("open")
|
116 |
+
row["Core"] = core_score
|
117 |
+
row["Open-ended"] = open_score
|
118 |
+
# Add asterisk to core and open scores if they exist
|
119 |
+
if core_score is not None:
|
120 |
+
row["Core"] = f"{core_score:.2f}*"
|
121 |
+
if open_score is not None:
|
122 |
+
row["Open-ended"] = f"{open_score:.2f}*"
|
123 |
for display_name in self.SUPER_GROUPS[selected_super_group]:
|
124 |
row[display_name] = None
|
125 |
else:
|