hardcoded model list for multifinben
Browse files- frontend/src/pages/LeaderboardPage/LeaderboardPage.js +31 -36
- frontend/src/pages/LeaderboardPage/components/Leaderboard/context/LeaderboardContext.js +16 -1
- frontend/src/pages/LeaderboardPage/components/Leaderboard/hooks/useDataUtils.js +163 -286
- frontend/src/pages/LeaderboardPage/components/Leaderboard/hooks/useLeaderboardData.js +4 -53
- frontend/src/pages/LeaderboardPage/components/Leaderboard/utils/columnUtils.js +197 -0
frontend/src/pages/LeaderboardPage/LeaderboardPage.js
CHANGED
@@ -1,49 +1,44 @@
|
|
1 |
-
import
|
|
|
2 |
import Leaderboard from "./components/Leaderboard/Leaderboard";
|
3 |
-
import { Box } from "@mui/material";
|
4 |
-
import PageHeader from "../../components/shared/PageHeader";
|
5 |
-
import Logo from "../../components/Logo/Logo";
|
6 |
-
import { useLeaderboardData } from "../../pages/LeaderboardPage/components/Leaderboard/hooks/useLeaderboardData";
|
7 |
-
import { useLeaderboard } from "../../pages/LeaderboardPage/components/Leaderboard/context/LeaderboardContext";
|
8 |
-
|
9 |
-
function LeaderboardPage() {
|
10 |
-
const { data, isLoading, error } = useLeaderboardData();
|
11 |
-
const { actions } = useLeaderboard();
|
12 |
-
|
13 |
-
useEffect(() => {
|
14 |
-
if (data) {
|
15 |
-
actions.setModels(data);
|
16 |
-
}
|
17 |
-
actions.setLoading(isLoading);
|
18 |
-
actions.setError(error);
|
19 |
-
}, [data, isLoading, error, actions]);
|
20 |
|
|
|
21 |
return (
|
22 |
-
<
|
|
|
23 |
sx={{
|
24 |
-
|
25 |
display: "flex",
|
26 |
flexDirection: "column",
|
|
|
|
|
|
|
|
|
27 |
}}
|
28 |
>
|
29 |
-
<Box
|
30 |
-
|
31 |
-
|
32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
</Box>
|
34 |
-
<PageHeader
|
35 |
-
title="Open Financial LLM Leaderboard"
|
36 |
-
subtitle={
|
37 |
-
<>
|
38 |
-
Benchmark for large language models in {" "}
|
39 |
-
<span style={{ fontWeight: 600 }}>financial</span> domain {" "}
|
40 |
-
across multiple languages
|
41 |
-
</>
|
42 |
-
}
|
43 |
-
/>
|
44 |
<Leaderboard />
|
45 |
-
</
|
46 |
);
|
47 |
-
}
|
48 |
|
49 |
export default LeaderboardPage;
|
|
|
1 |
+
import React from "react";
|
2 |
+
import { Box, Typography, Container } from "@mui/material";
|
3 |
import Leaderboard from "./components/Leaderboard/Leaderboard";
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
+
const LeaderboardPage = () => {
|
6 |
return (
|
7 |
+
<Container
|
8 |
+
maxWidth={false}
|
9 |
sx={{
|
10 |
+
p: { xs: 1, sm: 2, md: 3 },
|
11 |
display: "flex",
|
12 |
flexDirection: "column",
|
13 |
+
alignItems: "center",
|
14 |
+
height: "100%",
|
15 |
+
maxWidth: "100vw",
|
16 |
+
overflow: "hidden"
|
17 |
}}
|
18 |
>
|
19 |
+
<Box sx={{ mb: 3, width: "100%", textAlign: "center" }}>
|
20 |
+
<Typography
|
21 |
+
variant="h4"
|
22 |
+
component="h1"
|
23 |
+
sx={{
|
24 |
+
fontWeight: 700,
|
25 |
+
mb: 1,
|
26 |
+
fontSize: { xs: "1.5rem", sm: "1.75rem", md: "2rem" },
|
27 |
+
}}
|
28 |
+
>
|
29 |
+
Open Financial LLM Leaderboard - Multi-modal & Multi-lingual
|
30 |
+
</Typography>
|
31 |
+
<Typography
|
32 |
+
variant="body1"
|
33 |
+
color="text.secondary"
|
34 |
+
sx={{ maxWidth: "800px", mx: "auto" }}
|
35 |
+
>
|
36 |
+
Comprehensive evaluation of language models on financial tasks across multiple languages and modalities
|
37 |
+
</Typography>
|
38 |
</Box>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
<Leaderboard />
|
40 |
+
</Container>
|
41 |
);
|
42 |
+
};
|
43 |
|
44 |
export default LeaderboardPage;
|
frontend/src/pages/LeaderboardPage/components/Leaderboard/context/LeaderboardContext.js
CHANGED
@@ -29,7 +29,22 @@ const DEFAULT_DISPLAY = {
|
|
29 |
scoreDisplay: TABLE_DEFAULTS.SCORE_DISPLAY,
|
30 |
averageMode: TABLE_DEFAULTS.AVERAGE_MODE,
|
31 |
rankingMode: TABLE_DEFAULTS.RANKING_MODE,
|
32 |
-
visibleColumns:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
};
|
34 |
|
35 |
// Create initial counter structure
|
|
|
29 |
scoreDisplay: TABLE_DEFAULTS.SCORE_DISPLAY,
|
30 |
averageMode: TABLE_DEFAULTS.AVERAGE_MODE,
|
31 |
rankingMode: TABLE_DEFAULTS.RANKING_MODE,
|
32 |
+
visibleColumns: [
|
33 |
+
'isPinned',
|
34 |
+
'rank',
|
35 |
+
'model_type',
|
36 |
+
'id',
|
37 |
+
'model.average_score',
|
38 |
+
'evaluations.vision_average',
|
39 |
+
'evaluations.audio_average',
|
40 |
+
'evaluations.english_average',
|
41 |
+
'evaluations.chinese_average',
|
42 |
+
'evaluations.japanese_average',
|
43 |
+
'evaluations.spanish_average',
|
44 |
+
'evaluations.greek_average',
|
45 |
+
'evaluations.bilingual_average',
|
46 |
+
'evaluations.multilingual_average'
|
47 |
+
],
|
48 |
};
|
49 |
|
50 |
// Create initial counter structure
|
frontend/src/pages/LeaderboardPage/components/Leaderboard/hooks/useDataUtils.js
CHANGED
@@ -6,6 +6,91 @@ import {
|
|
6 |
} from "../utils/searchUtils";
|
7 |
import { ALLOWED_MODELS, isModelAllowed } from "../constants/allowedModels";
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
// Calculate min/max averages
|
10 |
export const useAverageRange = (data) => {
|
11 |
return useMemo(() => {
|
@@ -39,142 +124,96 @@ export const useColorGenerator = (minAverage, maxAverage) => {
|
|
39 |
// Process data with boolean standardization
|
40 |
export const useProcessedData = (data, averageMode, visibleColumns) => {
|
41 |
return useMemo(() => {
|
42 |
-
//
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
};
|
59 |
|
60 |
-
//
|
61 |
-
const
|
62 |
-
|
63 |
-
Object.entries(item.evaluations).forEach(([key, value]) => {
|
64 |
-
if (!greekDatasets.includes(key)) {
|
65 |
-
includedEvaluations[key] = value;
|
66 |
-
}
|
67 |
-
});
|
68 |
-
// Add Greek average
|
69 |
-
if (greekAverage !== null) {
|
70 |
-
includedEvaluations.greek_average = { normalized_score: greekAverage };
|
71 |
-
}
|
72 |
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
return visibleColumns.includes(`evaluations.${key}.normalized_score`);
|
77 |
-
})
|
78 |
-
.map(([, value]) => value.normalized_score);
|
79 |
-
|
80 |
-
const average =
|
81 |
-
evaluationScores.length > 0
|
82 |
-
? evaluationScores.reduce((a, b) => a + b, 0) /
|
83 |
-
evaluationScores.length
|
84 |
-
: averageMode === "visible"
|
85 |
-
? null
|
86 |
-
: 0;
|
87 |
-
|
88 |
-
// Boolean standardization
|
89 |
-
const standardizedFeatures = {
|
90 |
-
...item.features,
|
91 |
-
is_moe: Boolean(item.features.is_moe),
|
92 |
-
is_flagged: Boolean(item.features.is_flagged),
|
93 |
-
is_highlighted_by_maintainer: Boolean(
|
94 |
-
item.features.is_highlighted_by_maintainer
|
95 |
-
),
|
96 |
-
is_merged: Boolean(item.features.is_merged),
|
97 |
-
is_not_available_on_hub: Boolean(item.features.is_not_available_on_hub),
|
98 |
-
};
|
99 |
-
|
100 |
-
return {
|
101 |
-
...item,
|
102 |
-
features: standardizedFeatures,
|
103 |
-
evaluations: enhancedEvaluations, // Use enhanced evaluations
|
104 |
model: {
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
},
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
|
|
121 |
});
|
122 |
|
123 |
-
//
|
124 |
-
|
125 |
-
// Check if a matching model already exists
|
126 |
-
const modelExists = Object.keys(existingModelsMap).some(name =>
|
127 |
-
name.toLowerCase().includes(allowedModelName.toLowerCase())
|
128 |
-
);
|
129 |
-
|
130 |
-
if (!modelExists) {
|
131 |
-
// Create a "missing" placeholder
|
132 |
-
filteredModels.push({
|
133 |
-
id: `missing-${allowedModelName}`,
|
134 |
-
model: {
|
135 |
-
name: allowedModelName,
|
136 |
-
average_score: null,
|
137 |
-
type: "Unknown",
|
138 |
-
},
|
139 |
-
evaluations: {
|
140 |
-
greek_average: null
|
141 |
-
},
|
142 |
-
features: {
|
143 |
-
is_moe: false,
|
144 |
-
is_flagged: false,
|
145 |
-
is_highlighted_by_maintainer: false,
|
146 |
-
is_merged: false,
|
147 |
-
is_not_available_on_hub: true,
|
148 |
-
},
|
149 |
-
metadata: {
|
150 |
-
submission_date: new Date().toISOString(),
|
151 |
-
},
|
152 |
-
isMissing: true, // Mark as missing
|
153 |
-
});
|
154 |
-
}
|
155 |
-
});
|
156 |
-
|
157 |
-
// Sort the results
|
158 |
-
filteredModels.sort((a, b) => {
|
159 |
-
// Place missing models at the end
|
160 |
-
if (a.isMissing && !b.isMissing) return 1;
|
161 |
-
if (!a.isMissing && b.isMissing) return -1;
|
162 |
-
|
163 |
-
// If both are missing or both are not missing, sort by average score
|
164 |
if (a.model.average_score === null && b.model.average_score === null)
|
165 |
return 0;
|
166 |
if (a.model.average_score === null) return 1;
|
167 |
if (b.model.average_score === null) return -1;
|
168 |
return b.model.average_score - a.model.average_score;
|
169 |
});
|
170 |
-
|
171 |
-
|
|
|
172 |
...item,
|
173 |
static_rank: index + 1,
|
174 |
}));
|
175 |
}, [data, averageMode, visibleColumns]);
|
176 |
};
|
177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
178 |
// Common filtering logic
|
179 |
export const useFilteredData = (
|
180 |
processedData,
|
@@ -188,179 +227,17 @@ export const useFilteredData = (
|
|
188 |
isOfficialProviderActive = false
|
189 |
) => {
|
190 |
return useMemo(() => {
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
let filteredUnpinned = unpinnedData;
|
199 |
-
|
200 |
-
// Filter by official providers
|
201 |
-
if (isOfficialProviderActive) {
|
202 |
-
filteredUnpinned = filteredUnpinned.filter(
|
203 |
-
(row) =>
|
204 |
-
row.features?.is_highlighted_by_maintainer ||
|
205 |
-
row.metadata?.is_highlighted_by_maintainer
|
206 |
-
);
|
207 |
-
}
|
208 |
-
|
209 |
-
// Filter by precision
|
210 |
-
if (selectedPrecisions.length > 0) {
|
211 |
-
filteredUnpinned = filteredUnpinned.filter((row) =>
|
212 |
-
selectedPrecisions.includes(row.model.precision)
|
213 |
-
);
|
214 |
-
}
|
215 |
-
|
216 |
-
// Filter by type
|
217 |
-
if (selectedTypes.length > 0) {
|
218 |
-
filteredUnpinned = filteredUnpinned.filter((row) => {
|
219 |
-
const modelType = row.model.type?.toLowerCase().trim();
|
220 |
-
return selectedTypes.some((type) => modelType?.includes(type));
|
221 |
-
});
|
222 |
-
}
|
223 |
-
|
224 |
-
// Filter by parameters
|
225 |
-
filteredUnpinned = filteredUnpinned.filter((row) => {
|
226 |
-
// Skip parameter filtering if no filter is active
|
227 |
-
if (paramsRange[0] === -1 && paramsRange[1] === 140) return true;
|
228 |
-
|
229 |
-
const params =
|
230 |
-
row.metadata?.params_billions || row.features?.params_billions;
|
231 |
-
if (params === undefined || params === null) return false;
|
232 |
-
return params >= paramsRange[0] && params < paramsRange[1];
|
233 |
-
});
|
234 |
-
|
235 |
-
// Filter by search
|
236 |
-
if (searchValue) {
|
237 |
-
const searchQueries = searchValue
|
238 |
-
.split(";")
|
239 |
-
.map((q) => q.trim())
|
240 |
-
.filter((q) => q);
|
241 |
-
if (searchQueries.length > 0) {
|
242 |
-
filteredUnpinned = filteredUnpinned.filter((row) => {
|
243 |
-
return searchQueries.some((query) => {
|
244 |
-
const { specialSearches, textSearch } = parseSearchQuery(query);
|
245 |
-
|
246 |
-
const specialSearchMatch = specialSearches.every(
|
247 |
-
({ field, value }) => {
|
248 |
-
const fieldValue = getValueByPath(row, field)
|
249 |
-
?.toString()
|
250 |
-
.toLowerCase();
|
251 |
-
return fieldValue?.includes(value.toLowerCase());
|
252 |
-
}
|
253 |
-
);
|
254 |
-
|
255 |
-
if (!specialSearchMatch) return false;
|
256 |
-
if (!textSearch) return true;
|
257 |
-
|
258 |
-
const modelName = row.model.name.toLowerCase();
|
259 |
-
const searchLower = textSearch.toLowerCase();
|
260 |
-
|
261 |
-
if (looksLikeRegex(textSearch)) {
|
262 |
-
try {
|
263 |
-
const regex = new RegExp(textSearch, "i");
|
264 |
-
return regex.test(modelName);
|
265 |
-
} catch (e) {
|
266 |
-
return modelName.includes(searchLower);
|
267 |
-
}
|
268 |
-
} else {
|
269 |
-
return modelName.includes(searchLower);
|
270 |
-
}
|
271 |
-
});
|
272 |
-
});
|
273 |
-
}
|
274 |
-
}
|
275 |
-
|
276 |
-
// Filter by booleans
|
277 |
-
if (selectedBooleanFilters.length > 0) {
|
278 |
-
filteredUnpinned = filteredUnpinned.filter((row) => {
|
279 |
-
return selectedBooleanFilters.every((filter) => {
|
280 |
-
const filterValue =
|
281 |
-
typeof filter === "object" ? filter.value : filter;
|
282 |
-
|
283 |
-
// Maintainer's Highlight keeps positive logic
|
284 |
-
if (filterValue === "is_highlighted_by_maintainer") {
|
285 |
-
return row.features[filterValue];
|
286 |
-
}
|
287 |
-
|
288 |
-
// For all other filters, invert the logic
|
289 |
-
if (filterValue === "is_not_available_on_hub") {
|
290 |
-
return row.features[filterValue];
|
291 |
-
}
|
292 |
-
|
293 |
-
return !row.features[filterValue];
|
294 |
-
});
|
295 |
-
});
|
296 |
-
}
|
297 |
-
|
298 |
-
// Create ordered array of pinned models respecting pinnedModels order
|
299 |
-
const orderedPinnedData = pinnedModels
|
300 |
-
.map((pinnedModelId) =>
|
301 |
-
pinnedData.find((item) => item.id === pinnedModelId)
|
302 |
-
)
|
303 |
-
.filter(Boolean);
|
304 |
-
|
305 |
-
// Combine all filtered data
|
306 |
-
const allFilteredData = [...filteredUnpinned, ...orderedPinnedData];
|
307 |
-
|
308 |
-
// Sort all data by average_score for dynamic_rank
|
309 |
-
const sortedByScore = [...allFilteredData].sort((a, b) => {
|
310 |
-
// Si les scores moyens sont différents, trier par score
|
311 |
-
if (a.model.average_score !== b.model.average_score) {
|
312 |
-
if (a.model.average_score === null && b.model.average_score === null)
|
313 |
-
return 0;
|
314 |
-
if (a.model.average_score === null) return 1;
|
315 |
-
if (b.model.average_score === null) return -1;
|
316 |
-
return b.model.average_score - a.model.average_score;
|
317 |
-
}
|
318 |
-
|
319 |
-
// Si les scores sont égaux, comparer le nom du modèle et la date de soumission
|
320 |
-
if (a.model.name === b.model.name) {
|
321 |
-
// Si même nom, trier par date de soumission (la plus récente d'abord)
|
322 |
-
const dateA = new Date(a.metadata?.submission_date || 0);
|
323 |
-
const dateB = new Date(b.metadata?.submission_date || 0);
|
324 |
-
return dateB - dateA;
|
325 |
-
}
|
326 |
-
|
327 |
-
// Si noms différents, trier par nom
|
328 |
-
return a.model.name.localeCompare(b.model.name);
|
329 |
-
});
|
330 |
-
|
331 |
-
// Create Map to store dynamic_ranks
|
332 |
-
const dynamicRankMap = new Map();
|
333 |
-
sortedByScore.forEach((item, index) => {
|
334 |
-
dynamicRankMap.set(item.id, index + 1);
|
335 |
-
});
|
336 |
-
|
337 |
-
// Add ranks to final data
|
338 |
-
const finalData = [...orderedPinnedData, ...filteredUnpinned].map(
|
339 |
-
(item) => {
|
340 |
-
return {
|
341 |
-
...item,
|
342 |
-
dynamic_rank: dynamicRankMap.get(item.id),
|
343 |
-
rank: item.isPinned
|
344 |
-
? pinnedModels.indexOf(item.id) + 1
|
345 |
-
: rankingMode === "static"
|
346 |
-
? item.static_rank
|
347 |
-
: dynamicRankMap.get(item.id),
|
348 |
-
isPinned: pinnedModels.includes(item.id),
|
349 |
-
};
|
350 |
-
}
|
351 |
-
);
|
352 |
-
|
353 |
-
return finalData;
|
354 |
}, [
|
355 |
processedData,
|
356 |
-
selectedPrecisions,
|
357 |
-
selectedTypes,
|
358 |
-
paramsRange,
|
359 |
-
searchValue,
|
360 |
-
selectedBooleanFilters,
|
361 |
rankingMode,
|
362 |
pinnedModels,
|
363 |
-
isOfficialProviderActive,
|
364 |
]);
|
365 |
};
|
366 |
|
|
|
6 |
} from "../utils/searchUtils";
|
7 |
import { ALLOWED_MODELS, isModelAllowed } from "../constants/allowedModels";
|
8 |
|
9 |
+
// Hard-coded score table: category -> model name -> score.
// Every category lists the same 22 model names.
// NOTE(review): the many 0.00 entries look like "no result" placeholders rather
// than measured zeros — confirm before relying on averages built from them.
const HARDCODED_SCORES = {
  vision: {
    "GPT-4o": 55.54, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 16.27,
    "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 14.97, "google/gemma-3-27b-it": 25.57,
    "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 24.97, "TheFinAI/finma-7b-full": 0.00,
    "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00,
    "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "Qwen-VL-MAX": 18.47,
    "LLaVA-1.6 Vicuna-13B": 19.77, "Deepseek-VL-7B-Chat": 19.10, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
    "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  audio: {
    "GPT-4o": 55.56, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.00,
    "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00,
    "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 48.22, "TheFinAI/finma-7b-full": 0.00,
    "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00,
    "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "Qwen-VL-MAX": 0.00,
    "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 51.58, "Qwen2-Audio-7B": 48.02,
    "Qwen2-Audio-7B-Instruct": 50.06, "SALMONN-7B": 24.24, "SALMONN-13B": 24.59
  },
  english: {
    "GPT-4o": 42.18, "o3-Mini": 20.20, "Deepseek-V3": 18.04, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 24.16,
    "meta-llama/Llama-3.1-70B-Instruct": 38.71, "google/gemma-3-4b-it": 16.13, "google/gemma-3-27b-it": 17.19,
    "Qwen/Qwen2.5-32B-Instruct": 32.01, "Qwen/Qwen2.5-Omni-7B": 24.99, "TheFinAI/finma-7b-full": 28.89,
    "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 29.39, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 26.38,
    "TheFinAI/FinMA-ES-Bilingual": 31.72, "TheFinAI/plutus-8B-instruct": 27.82, "Qwen-VL-MAX": 0.00,
    "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
    "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  chinese: {
    "GPT-4o": 60.34, "o3-Mini": 0.00, "Deepseek-V3": 60.94, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 64.51,
    "meta-llama/Llama-3.1-70B-Instruct": 56.74, "google/gemma-3-4b-it": 26.23, "google/gemma-3-27b-it": 26.24,
    "Qwen/Qwen2.5-32B-Instruct": 56.62, "Qwen/Qwen2.5-Omni-7B": 53.09, "TheFinAI/finma-7b-full": 24.42,
    "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 23.04, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 13.18,
    "TheFinAI/FinMA-ES-Bilingual": 21.50, "TheFinAI/plutus-8B-instruct": 31.04, "Qwen-VL-MAX": 0.00,
    "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
    "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  japanese: {
    "GPT-4o": 0.00, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 48.43,
    "meta-llama/Llama-3.1-70B-Instruct": 32.17, "google/gemma-3-4b-it": 8.98, "google/gemma-3-27b-it": 23.96,
    "Qwen/Qwen2.5-32B-Instruct": 4.54, "Qwen/Qwen2.5-Omni-7B": 44.35, "TheFinAI/finma-7b-full": 46.94,
    "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 47.59, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 23.96,
    "TheFinAI/FinMA-ES-Bilingual": 57.36, "TheFinAI/plutus-8B-instruct": 34.62, "Qwen-VL-MAX": 0.00,
    "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
    "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  spanish: {
    "GPT-4o": 29.80, "o3-Mini": 4.53, "Deepseek-V3": 25.49, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 47.90,
    "meta-llama/Llama-3.1-70B-Instruct": 37.84, "google/gemma-3-4b-it": 27.66, "google/gemma-3-27b-it": 27.77,
    "Qwen/Qwen2.5-32B-Instruct": 37.47, "Qwen/Qwen2.5-Omni-7B": 39.16, "TheFinAI/finma-7b-full": 27.04,
    "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 42.86, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 28.01,
    "TheFinAI/FinMA-ES-Bilingual": 38.69, "TheFinAI/plutus-8B-instruct": 40.16, "Qwen-VL-MAX": 0.00,
    "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
    "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  greek: {
    "GPT-4o": 43.04, "o3-Mini": 9.48, "Deepseek-V3": 39.07, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 48.95,
    "meta-llama/Llama-3.1-70B-Instruct": 43.60, "google/gemma-3-4b-it": 15.45, "google/gemma-3-27b-it": 15.44,
    "Qwen/Qwen2.5-32B-Instruct": 44.32, "Qwen/Qwen2.5-Omni-7B": 23.45, "TheFinAI/finma-7b-full": 17.93,
    "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 29.49, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 20.91,
    "TheFinAI/FinMA-ES-Bilingual": 15.47, "TheFinAI/plutus-8B-instruct": 60.19, "Qwen-VL-MAX": 0.00,
    "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
    "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  bilingual: {
    "GPT-4o": 92.29, "o3-Mini": 90.13, "Deepseek-V3": 86.26, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 89.17,
    "meta-llama/Llama-3.1-70B-Instruct": 92.13, "google/gemma-3-4b-it": 35.92, "google/gemma-3-27b-it": 35.92,
    "Qwen/Qwen2.5-32B-Instruct": 92.29, "Qwen/Qwen2.5-Omni-7B": 91.80, "TheFinAI/finma-7b-full": 69.24,
    "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 91.60, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 71.81,
    "TheFinAI/FinMA-ES-Bilingual": 66.57, "TheFinAI/plutus-8B-instruct": 91.59, "Qwen-VL-MAX": 0.00,
    "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
    "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  },
  multilingual: {
    "GPT-4o": 6.53, "o3-Mini": 7.80, "Deepseek-V3": 36.99, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 13.52,
    "meta-llama/Llama-3.1-70B-Instruct": 21.97, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00,
    "Qwen/Qwen2.5-32B-Instruct": 18.48, "Qwen/Qwen2.5-Omni-7B": 16.29, "TheFinAI/finma-7b-full": 3.10,
    "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 1.76, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 10.25,
    "TheFinAI/FinMA-ES-Bilingual": 0.35, "TheFinAI/plutus-8B-instruct": 7.24, "Qwen-VL-MAX": 0.00,
    "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
    "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
  }
};

// The table is shared module state: freeze each category and the table itself
// so a consumer cannot silently alter the scores every other consumer reads.
Object.values(HARDCODED_SCORES).forEach((categoryScores) => Object.freeze(categoryScores));
Object.freeze(HARDCODED_SCORES);
|
93 |
+
|
94 |
// Calculate min/max averages
|
95 |
export const useAverageRange = (data) => {
|
96 |
return useMemo(() => {
|
|
|
124 |
// Build the leaderboard rows from the hard-coded score table.
/**
 * Produce one row per model found in HARDCODED_SCORES, ranked by average score.
 *
 * Each row carries a `<category>_average` evaluation for every category in the
 * table, the mean of those evaluations as `model.average_score`, fixed feature
 * flags, and a 1-based `static_rank` assigned after sorting by that mean
 * (highest first, rows without an average last).
 *
 * @param {*} data - Not read; kept so existing callers keep working.
 * @param {*} averageMode - Not read; kept so existing callers keep working.
 * @param {*} visibleColumns - Not read; kept so existing callers keep working.
 * @returns {Array<Object>} Ranked model rows.
 */
export const useProcessedData = (data, averageMode, visibleColumns) => {
  return useMemo(() => {
    // Category order in the table fixes the order of the evaluation keys.
    const categories = Object.keys(HARDCODED_SCORES);

    // Union of model names over all categories, in first-seen order.
    const modelNames = new Set(
      Object.values(HARDCODED_SCORES).flatMap((categoryScores) =>
        Object.keys(categoryScores)
      )
    );

    // One timestamp for the whole batch so every row reports the same date.
    const submissionDate = new Date().toISOString();

    const rows = Array.from(modelNames, (modelName, index) => {
      const evaluations = Object.fromEntries(
        categories.map((category) => [
          `${category}_average`,
          getHardcodedScore(modelName, category),
        ])
      );

      // Overall mean over every category that has a score; zeros are included.
      const scores = Object.values(evaluations).filter(
        (score) => score !== null
      );
      const averageScore =
        scores.length > 0
          ? scores.reduce((sum, score) => sum + score, 0) / scores.length
          : null;

      return {
        id: `model-${index}`,
        model: {
          name: modelName,
          average_score: averageScore,
          type: "chat", // every model is given the chat type
        },
        evaluations,
        features: {
          is_moe: false,
          is_flagged: false,
          is_highlighted_by_maintainer: false,
          is_merged: false,
          is_not_available_on_hub: false,
        },
        metadata: {
          submission_date: submissionDate,
        },
        isMissing: false,
      };
    });

    // Highest average first; rows without an average sink to the bottom.
    rows.sort((a, b) => {
      if (a.model.average_score === null && b.model.average_score === null)
        return 0;
      if (a.model.average_score === null) return 1;
      if (b.model.average_score === null) return -1;
      return b.model.average_score - a.model.average_score;
    });

    return rows.map((item, index) => ({
      ...item,
      static_rank: index + 1,
    }));
    // The rows depend only on module-level data, so they are computed once.
    // Listing the unused arguments here would rebuild identical rows (with new
    // object identities) on every column or mode change.
  }, []);
};
|
197 |
|
198 |
+
/**
 * Look up a model's hard-coded score for one category.
 *
 * An exact model-name match wins. Otherwise the name is matched loosely: a key
 * qualifies when it contains the name or the name contains it, and among the
 * qualifying keys the one closest in length to the name is used (the first such
 * key on ties). This keeps a short key such as "Qwen2-Audio-7B" from shadowing
 * the more specific "Qwen2-Audio-7B-Instruct".
 *
 * @param {string} modelName - Model name to look up.
 * @param {string} category - Category key, e.g. "vision".
 * @param {Object} [scores=HARDCODED_SCORES] - Table of category -> model -> score.
 * @returns {number|null} The score, or null when the category or model is unknown.
 */
function getHardcodedScore(modelName, category, scores = HARDCODED_SCORES) {
  const hasOwn = (object, key) =>
    Object.prototype.hasOwnProperty.call(object, key);

  // An empty name is a substring of every key, so it must never reach the
  // loose match; non-strings have no meaningful substring relation either.
  if (typeof modelName !== "string" || modelName === "") return null;

  // Own-property checks keep inherited names ("constructor", "toString", ...)
  // from being mistaken for categories or models.
  if (scores == null || !hasOwn(scores, category)) return null;
  const categoryScores = scores[category];
  if (categoryScores === null || typeof categoryScores !== "object") return null;

  // Exact match first.
  if (hasOwn(categoryScores, modelName)) {
    return categoryScores[modelName];
  }

  // Loose match: keep the candidate whose length is closest to the name's.
  let bestKey = null;
  let bestDistance = Infinity;
  for (const key of Object.keys(categoryScores)) {
    if (key === "") continue;
    if (!modelName.includes(key) && !key.includes(modelName)) continue;
    const distance = Math.abs(key.length - modelName.length);
    if (distance < bestDistance) {
      bestKey = key;
      bestDistance = distance;
    }
  }

  return bestKey === null ? null : categoryScores[bestKey];
}
|
216 |
+
|
217 |
// Common filtering logic
|
218 |
export const useFilteredData = (
|
219 |
processedData,
|
|
|
227 |
isOfficialProviderActive = false
|
228 |
) => {
|
229 |
return useMemo(() => {
|
230 |
+
// 由于使用的是硬编码数据,这里直接返回所有数据而不进行过滤
|
231 |
+
return processedData.map((item, index) => ({
|
232 |
+
...item,
|
233 |
+
dynamic_rank: index + 1,
|
234 |
+
rank: rankingMode === "static" ? item.static_rank : index + 1,
|
235 |
+
isPinned: pinnedModels.includes(item.id),
|
236 |
+
}));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
237 |
}, [
|
238 |
processedData,
|
|
|
|
|
|
|
|
|
|
|
239 |
rankingMode,
|
240 |
pinnedModels,
|
|
|
241 |
]);
|
242 |
};
|
243 |
|
frontend/src/pages/LeaderboardPage/components/Leaderboard/hooks/useLeaderboardData.js
CHANGED
@@ -8,60 +8,11 @@ const CACHE_KEY = "leaderboardData";
|
|
8 |
const CACHE_DURATION = 5 * 60 * 1000; // 5 minutes
|
9 |
|
10 |
export const useLeaderboardData = () => {
|
11 |
-
const queryClient = useQueryClient();
|
12 |
-
const [searchParams] = useSearchParams();
|
13 |
-
const isInitialLoadRef = useRef(true);
|
14 |
-
|
15 |
-
const { data, isLoading, error } = useQuery({
|
16 |
-
queryKey: ["leaderboard"],
|
17 |
-
queryFn: async () => {
|
18 |
-
try {
|
19 |
-
const cachedData = localStorage.getItem(CACHE_KEY);
|
20 |
-
if (cachedData) {
|
21 |
-
const { data: cached, timestamp } = JSON.parse(cachedData);
|
22 |
-
const age = Date.now() - timestamp;
|
23 |
-
if (age < CACHE_DURATION) {
|
24 |
-
return cached;
|
25 |
-
}
|
26 |
-
}
|
27 |
-
|
28 |
-
const response = await fetch("/api/leaderboard/formatted");
|
29 |
-
if (!response.ok) {
|
30 |
-
throw new Error(`HTTP error! status: ${response.status}`);
|
31 |
-
}
|
32 |
-
|
33 |
-
const newData = await response.json();
|
34 |
-
localStorage.setItem(
|
35 |
-
CACHE_KEY,
|
36 |
-
JSON.stringify({
|
37 |
-
data: newData,
|
38 |
-
timestamp: Date.now(),
|
39 |
-
})
|
40 |
-
);
|
41 |
-
|
42 |
-
return newData;
|
43 |
-
} catch (error) {
|
44 |
-
console.error("Detailed error:", error);
|
45 |
-
throw error;
|
46 |
-
}
|
47 |
-
},
|
48 |
-
staleTime: CACHE_DURATION,
|
49 |
-
cacheTime: CACHE_DURATION * 2,
|
50 |
-
refetchOnWindowFocus: false,
|
51 |
-
enabled: isInitialLoadRef.current || !!searchParams.toString(),
|
52 |
-
});
|
53 |
-
|
54 |
-
useMemo(() => {
|
55 |
-
if (data && isInitialLoadRef.current) {
|
56 |
-
isInitialLoadRef.current = false;
|
57 |
-
}
|
58 |
-
}, [data]);
|
59 |
-
|
60 |
return {
|
61 |
-
data,
|
62 |
-
isLoading,
|
63 |
-
error,
|
64 |
-
refetch: () =>
|
65 |
};
|
66 |
};
|
67 |
|
|
|
8 |
const CACHE_DURATION = 5 * 60 * 1000; // 5 minutes
|
9 |
|
10 |
// Stable references shared by every call. A fresh `[]` or arrow function per
// call would have a new identity on each render and retrigger any effect or
// memo that lists `data` or `refetch` as a dependency.
const EMPTY_LEADERBOARD_DATA = [];
const noopRefetch = () => {};

/**
 * Leaderboard data hook stub.
 *
 * No request is made: the hook reports an empty, already-loaded, error-free
 * result, because the leaderboard rows come from hard-coded data instead.
 *
 * @returns {{data: Array, isLoading: boolean, error: null, refetch: Function}}
 *   `data` is always the same empty array instance and `refetch` is a no-op.
 */
export const useLeaderboardData = () => {
  return {
    data: EMPTY_LEADERBOARD_DATA,
    isLoading: false,
    error: null,
    refetch: noopRefetch,
  };
};
|
18 |
|
frontend/src/pages/LeaderboardPage/components/Leaderboard/utils/columnUtils.js
CHANGED
@@ -499,6 +499,67 @@ const createGreekLeaderboardHeader = (header) => (
|
|
499 |
</Box>
|
500 |
);
|
501 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
502 |
export const createColumns = (
|
503 |
getColorForValue,
|
504 |
scoreDisplay = "normalized",
|
@@ -928,6 +989,142 @@ export const createColumns = (
|
|
928 |
}),
|
929 |
},
|
930 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
931 |
];
|
932 |
|
933 |
const optionalColumns = [
|
|
|
499 |
</Box>
|
500 |
);
|
501 |
|
502 |
+
// 为各种类型的Leaderboard创建自定义标题组件
|
503 |
+
const createLeaderboardHeader = (label, tooltip, linkUrl) => (header) => (
|
504 |
+
<Box
|
505 |
+
className="header-content"
|
506 |
+
sx={{
|
507 |
+
display: "flex",
|
508 |
+
alignItems: "center",
|
509 |
+
width: "100%",
|
510 |
+
position: "relative",
|
511 |
+
}}
|
512 |
+
>
|
513 |
+
<HeaderLabel
|
514 |
+
label={`${label} Leaderboard`}
|
515 |
+
tooltip={tooltip}
|
516 |
+
className="header-label"
|
517 |
+
isSorted={header?.column?.getIsSorted()}
|
518 |
+
/>
|
519 |
+
|
520 |
+
<Box
|
521 |
+
sx={{
|
522 |
+
display: "flex",
|
523 |
+
alignItems: "center",
|
524 |
+
gap: 0.5,
|
525 |
+
ml: "auto",
|
526 |
+
flexShrink: 0,
|
527 |
+
}}
|
528 |
+
>
|
529 |
+
<InfoIcon tooltip={tooltip} />
|
530 |
+
{linkUrl && (
|
531 |
+
<Link
|
532 |
+
href={linkUrl}
|
533 |
+
target="_blank"
|
534 |
+
rel="noopener noreferrer"
|
535 |
+
aria-label={`View ${label} Leaderboard`}
|
536 |
+
sx={{
|
537 |
+
color: "info.main",
|
538 |
+
display: "flex",
|
539 |
+
alignItems: "center",
|
540 |
+
ml: 0.5,
|
541 |
+
textDecoration: "none",
|
542 |
+
"&:hover": {
|
543 |
+
textDecoration: "underline",
|
544 |
+
"& svg": {
|
545 |
+
opacity: 0.8,
|
546 |
+
},
|
547 |
+
},
|
548 |
+
}}
|
549 |
+
>
|
550 |
+
<OpenInNewIcon
|
551 |
+
sx={{
|
552 |
+
fontSize: "1rem",
|
553 |
+
opacity: 0.6,
|
554 |
+
transition: "opacity 0.2s ease-in-out",
|
555 |
+
}}
|
556 |
+
/>
|
557 |
+
</Link>
|
558 |
+
)}
|
559 |
+
</Box>
|
560 |
+
</Box>
|
561 |
+
);
|
562 |
+
|
563 |
export const createColumns = (
|
564 |
getColorForValue,
|
565 |
scoreDisplay = "normalized",
|
|
|
989 |
}),
|
990 |
},
|
991 |
},
|
992 |
+
{
|
993 |
+
accessorKey: "evaluations.vision_average",
|
994 |
+
header: createLeaderboardHeader("Vision", "Average performance on vision tasks", null),
|
995 |
+
cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.vision_average"),
|
996 |
+
size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
|
997 |
+
meta: {
|
998 |
+
headerStyle: {
|
999 |
+
backgroundColor: (theme) => alpha(theme.palette.primary.light, 0.05),
|
1000 |
+
},
|
1001 |
+
cellStyle: (value) => ({
|
1002 |
+
position: "relative",
|
1003 |
+
overflow: "hidden",
|
1004 |
+
padding: "8px 16px",
|
1005 |
+
backgroundColor: (theme) => alpha(theme.palette.primary.light, 0.05),
|
1006 |
+
}),
|
1007 |
+
},
|
1008 |
+
},
|
1009 |
+
{
|
1010 |
+
accessorKey: "evaluations.audio_average",
|
1011 |
+
header: createLeaderboardHeader("Audio", "Average performance on audio tasks", null),
|
1012 |
+
cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.audio_average"),
|
1013 |
+
size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
|
1014 |
+
meta: {
|
1015 |
+
headerStyle: {
|
1016 |
+
backgroundColor: (theme) => alpha(theme.palette.secondary.light, 0.05),
|
1017 |
+
},
|
1018 |
+
cellStyle: (value) => ({
|
1019 |
+
position: "relative",
|
1020 |
+
overflow: "hidden",
|
1021 |
+
padding: "8px 16px",
|
1022 |
+
backgroundColor: (theme) => alpha(theme.palette.secondary.light, 0.05),
|
1023 |
+
}),
|
1024 |
+
},
|
1025 |
+
},
|
1026 |
+
{
|
1027 |
+
accessorKey: "evaluations.english_average",
|
1028 |
+
header: createLeaderboardHeader("English", "Average performance on English language tasks", null),
|
1029 |
+
cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.english_average"),
|
1030 |
+
size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
|
1031 |
+
meta: {
|
1032 |
+
headerStyle: {
|
1033 |
+
backgroundColor: (theme) => alpha(theme.palette.success.light, 0.05),
|
1034 |
+
},
|
1035 |
+
cellStyle: (value) => ({
|
1036 |
+
position: "relative",
|
1037 |
+
overflow: "hidden",
|
1038 |
+
padding: "8px 16px",
|
1039 |
+
backgroundColor: (theme) => alpha(theme.palette.success.light, 0.05),
|
1040 |
+
}),
|
1041 |
+
},
|
1042 |
+
},
|
1043 |
+
{
|
1044 |
+
accessorKey: "evaluations.chinese_average",
|
1045 |
+
header: createLeaderboardHeader("Chinese", "Average performance on Chinese language tasks", null),
|
1046 |
+
cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.chinese_average"),
|
1047 |
+
size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
|
1048 |
+
meta: {
|
1049 |
+
headerStyle: {
|
1050 |
+
backgroundColor: (theme) => alpha(theme.palette.warning.light, 0.05),
|
1051 |
+
},
|
1052 |
+
cellStyle: (value) => ({
|
1053 |
+
position: "relative",
|
1054 |
+
overflow: "hidden",
|
1055 |
+
padding: "8px 16px",
|
1056 |
+
backgroundColor: (theme) => alpha(theme.palette.warning.light, 0.05),
|
1057 |
+
}),
|
1058 |
+
},
|
1059 |
+
},
|
1060 |
+
{
|
1061 |
+
accessorKey: "evaluations.japanese_average",
|
1062 |
+
header: createLeaderboardHeader("Japanese", "Average performance on Japanese language tasks", null),
|
1063 |
+
cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.japanese_average"),
|
1064 |
+
size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
|
1065 |
+
meta: {
|
1066 |
+
headerStyle: {
|
1067 |
+
backgroundColor: (theme) => alpha(theme.palette.error.light, 0.05),
|
1068 |
+
},
|
1069 |
+
cellStyle: (value) => ({
|
1070 |
+
position: "relative",
|
1071 |
+
overflow: "hidden",
|
1072 |
+
padding: "8px 16px",
|
1073 |
+
backgroundColor: (theme) => alpha(theme.palette.error.light, 0.05),
|
1074 |
+
}),
|
1075 |
+
},
|
1076 |
+
},
|
1077 |
+
{
|
1078 |
+
accessorKey: "evaluations.spanish_average",
|
1079 |
+
header: createLeaderboardHeader("Spanish", "Average performance on Spanish language tasks", null),
|
1080 |
+
cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.spanish_average"),
|
1081 |
+
size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
|
1082 |
+
meta: {
|
1083 |
+
headerStyle: {
|
1084 |
+
backgroundColor: (theme) => alpha(theme.palette.info.main, 0.05),
|
1085 |
+
},
|
1086 |
+
cellStyle: (value) => ({
|
1087 |
+
position: "relative",
|
1088 |
+
overflow: "hidden",
|
1089 |
+
padding: "8px 16px",
|
1090 |
+
backgroundColor: (theme) => alpha(theme.palette.info.main, 0.05),
|
1091 |
+
}),
|
1092 |
+
},
|
1093 |
+
},
|
1094 |
+
{
|
1095 |
+
accessorKey: "evaluations.bilingual_average",
|
1096 |
+
header: createLeaderboardHeader("Bilingual", "Average performance on bilingual tasks", null),
|
1097 |
+
cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.bilingual_average"),
|
1098 |
+
size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
|
1099 |
+
meta: {
|
1100 |
+
headerStyle: {
|
1101 |
+
backgroundColor: (theme) => alpha(theme.palette.primary.main, 0.05),
|
1102 |
+
},
|
1103 |
+
cellStyle: (value) => ({
|
1104 |
+
position: "relative",
|
1105 |
+
overflow: "hidden",
|
1106 |
+
padding: "8px 16px",
|
1107 |
+
backgroundColor: (theme) => alpha(theme.palette.primary.main, 0.05),
|
1108 |
+
}),
|
1109 |
+
},
|
1110 |
+
},
|
1111 |
+
{
|
1112 |
+
accessorKey: "evaluations.multilingual_average",
|
1113 |
+
header: createLeaderboardHeader("Multilingual", "Average performance on multilingual tasks", null),
|
1114 |
+
cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.multilingual_average"),
|
1115 |
+
size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
|
1116 |
+
meta: {
|
1117 |
+
headerStyle: {
|
1118 |
+
backgroundColor: (theme) => alpha(theme.palette.secondary.main, 0.05),
|
1119 |
+
},
|
1120 |
+
cellStyle: (value) => ({
|
1121 |
+
position: "relative",
|
1122 |
+
overflow: "hidden",
|
1123 |
+
padding: "8px 16px",
|
1124 |
+
backgroundColor: (theme) => alpha(theme.palette.secondary.main, 0.05),
|
1125 |
+
}),
|
1126 |
+
},
|
1127 |
+
}
|
1128 |
];
|
1129 |
|
1130 |
const optionalColumns = [
|