mirageco committed on
Commit
b8db958
·
1 Parent(s): 39637bc

hardcoded model list for multifinben

Browse files
frontend/src/pages/LeaderboardPage/LeaderboardPage.js CHANGED
@@ -1,49 +1,44 @@
1
- import { useEffect } from "react";
 
2
  import Leaderboard from "./components/Leaderboard/Leaderboard";
3
- import { Box } from "@mui/material";
4
- import PageHeader from "../../components/shared/PageHeader";
5
- import Logo from "../../components/Logo/Logo";
6
- import { useLeaderboardData } from "../../pages/LeaderboardPage/components/Leaderboard/hooks/useLeaderboardData";
7
- import { useLeaderboard } from "../../pages/LeaderboardPage/components/Leaderboard/context/LeaderboardContext";
8
-
9
- function LeaderboardPage() {
10
- const { data, isLoading, error } = useLeaderboardData();
11
- const { actions } = useLeaderboard();
12
-
13
- useEffect(() => {
14
- if (data) {
15
- actions.setModels(data);
16
- }
17
- actions.setLoading(isLoading);
18
- actions.setError(error);
19
- }, [data, isLoading, error, actions]);
20
 
 
21
  return (
22
- <Box
 
23
  sx={{
24
- ph: 2,
25
  display: "flex",
26
  flexDirection: "column",
 
 
 
 
27
  }}
28
  >
29
- <Box
30
- sx={{ display: "flex", justifyContent: "center", pt: 6, mb: -4, pb: 0 }}
31
- >
32
- <Logo height="80px" />
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  </Box>
34
- <PageHeader
35
- title="Open Financial LLM Leaderboard"
36
- subtitle={
37
- <>
38
- Benchmark for large language models in {" "}
39
- <span style={{ fontWeight: 600 }}>financial</span> domain {" "}
40
- across multiple languages
41
- </>
42
- }
43
- />
44
  <Leaderboard />
45
- </Box>
46
  );
47
- }
48
 
49
  export default LeaderboardPage;
 
1
+ import React from "react";
2
+ import { Box, Typography, Container } from "@mui/material";
3
  import Leaderboard from "./components/Leaderboard/Leaderboard";
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
+ const LeaderboardPage = () => {
6
  return (
7
+ <Container
8
+ maxWidth={false}
9
  sx={{
10
+ p: { xs: 1, sm: 2, md: 3 },
11
  display: "flex",
12
  flexDirection: "column",
13
+ alignItems: "center",
14
+ height: "100%",
15
+ maxWidth: "100vw",
16
+ overflow: "hidden"
17
  }}
18
  >
19
+ <Box sx={{ mb: 3, width: "100%", textAlign: "center" }}>
20
+ <Typography
21
+ variant="h4"
22
+ component="h1"
23
+ sx={{
24
+ fontWeight: 700,
25
+ mb: 1,
26
+ fontSize: { xs: "1.5rem", sm: "1.75rem", md: "2rem" },
27
+ }}
28
+ >
29
+ Open Financial LLM Leaderboard - Multi-modal & Multi-lingual
30
+ </Typography>
31
+ <Typography
32
+ variant="body1"
33
+ color="text.secondary"
34
+ sx={{ maxWidth: "800px", mx: "auto" }}
35
+ >
36
+ Comprehensive evaluation of language models on financial tasks across multiple languages and modalities
37
+ </Typography>
38
  </Box>
 
 
 
 
 
 
 
 
 
 
39
  <Leaderboard />
40
+ </Container>
41
  );
42
+ };
43
 
44
  export default LeaderboardPage;
frontend/src/pages/LeaderboardPage/components/Leaderboard/context/LeaderboardContext.js CHANGED
@@ -29,7 +29,22 @@ const DEFAULT_DISPLAY = {
29
  scoreDisplay: TABLE_DEFAULTS.SCORE_DISPLAY,
30
  averageMode: TABLE_DEFAULTS.AVERAGE_MODE,
31
  rankingMode: TABLE_DEFAULTS.RANKING_MODE,
32
- visibleColumns: TABLE_DEFAULTS.COLUMNS.DEFAULT_VISIBLE,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  };
34
 
35
  // Create initial counter structure
 
29
  scoreDisplay: TABLE_DEFAULTS.SCORE_DISPLAY,
30
  averageMode: TABLE_DEFAULTS.AVERAGE_MODE,
31
  rankingMode: TABLE_DEFAULTS.RANKING_MODE,
32
+ visibleColumns: [
33
+ 'isPinned',
34
+ 'rank',
35
+ 'model_type',
36
+ 'id',
37
+ 'model.average_score',
38
+ 'evaluations.vision_average',
39
+ 'evaluations.audio_average',
40
+ 'evaluations.english_average',
41
+ 'evaluations.chinese_average',
42
+ 'evaluations.japanese_average',
43
+ 'evaluations.spanish_average',
44
+ 'evaluations.greek_average',
45
+ 'evaluations.bilingual_average',
46
+ 'evaluations.multilingual_average'
47
+ ],
48
  };
49
 
50
  // Create initial counter structure
frontend/src/pages/LeaderboardPage/components/Leaderboard/hooks/useDataUtils.js CHANGED
@@ -6,6 +6,91 @@ import {
6
  } from "../utils/searchUtils";
7
  import { ALLOWED_MODELS, isModelAllowed } from "../constants/allowedModels";
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  // Calculate min/max averages
10
  export const useAverageRange = (data) => {
11
  return useMemo(() => {
@@ -39,142 +124,96 @@ export const useColorGenerator = (minAverage, maxAverage) => {
39
  // Process data with boolean standardization
40
  export const useProcessedData = (data, averageMode, visibleColumns) => {
41
  return useMemo(() => {
42
- // First filter and process existing models
43
- let processed = data.map((item) => {
44
- // Calculate average score for Greek datasets
45
- const greekDatasets = ['multifin', 'qa', 'fns', 'finnum', 'fintext'];
46
- const greekScores = greekDatasets
47
- .filter(dataset => item.evaluations[dataset]?.normalized_score !== undefined)
48
- .map(dataset => item.evaluations[dataset].normalized_score);
49
-
50
- const greekAverage = greekScores.length > 0
51
- ? greekScores.reduce((a, b) => a + b, 0) / greekScores.length
52
- : null;
53
-
54
- // Add Greek average to evaluations object
55
- const enhancedEvaluations = {
56
- ...item.evaluations,
57
- greek_average: greekAverage
 
 
 
 
 
 
 
 
 
58
  };
59
 
60
- // Calculate average score for all visible evaluations (including greek_average, but excluding specific Greek datasets)
61
- const includedEvaluations = {};
62
- // Copy all non-Greek evaluation data
63
- Object.entries(item.evaluations).forEach(([key, value]) => {
64
- if (!greekDatasets.includes(key)) {
65
- includedEvaluations[key] = value;
66
- }
67
- });
68
- // Add Greek average
69
- if (greekAverage !== null) {
70
- includedEvaluations.greek_average = { normalized_score: greekAverage };
71
- }
72
 
73
- const evaluationScores = Object.entries(includedEvaluations)
74
- .filter(([key]) => {
75
- if (averageMode === "all") return true;
76
- return visibleColumns.includes(`evaluations.${key}.normalized_score`);
77
- })
78
- .map(([, value]) => value.normalized_score);
79
-
80
- const average =
81
- evaluationScores.length > 0
82
- ? evaluationScores.reduce((a, b) => a + b, 0) /
83
- evaluationScores.length
84
- : averageMode === "visible"
85
- ? null
86
- : 0;
87
-
88
- // Boolean standardization
89
- const standardizedFeatures = {
90
- ...item.features,
91
- is_moe: Boolean(item.features.is_moe),
92
- is_flagged: Boolean(item.features.is_flagged),
93
- is_highlighted_by_maintainer: Boolean(
94
- item.features.is_highlighted_by_maintainer
95
- ),
96
- is_merged: Boolean(item.features.is_merged),
97
- is_not_available_on_hub: Boolean(item.features.is_not_available_on_hub),
98
- };
99
-
100
- return {
101
- ...item,
102
- features: standardizedFeatures,
103
- evaluations: enhancedEvaluations, // Use enhanced evaluations
104
  model: {
105
- ...item.model,
106
- has_chat_template: Boolean(item.model.has_chat_template),
107
- average_score: average,
108
  },
109
- };
110
- });
111
-
112
- // Create mapping of existing models, check which ones are in the allowed list
113
- const existingModelsMap = {};
114
- const filteredModels = [];
115
-
116
- processed.forEach(model => {
117
- if (isModelAllowed(model.model.name)) {
118
- existingModelsMap[model.model.name] = model;
119
- filteredModels.push(model);
120
- }
 
121
  });
122
 
123
- // Add "missing" entries, create placeholders for models in the allowed list that don't exist
124
- ALLOWED_MODELS.forEach(allowedModelName => {
125
- // Check if a matching model already exists
126
- const modelExists = Object.keys(existingModelsMap).some(name =>
127
- name.toLowerCase().includes(allowedModelName.toLowerCase())
128
- );
129
-
130
- if (!modelExists) {
131
- // Create a "missing" placeholder
132
- filteredModels.push({
133
- id: `missing-${allowedModelName}`,
134
- model: {
135
- name: allowedModelName,
136
- average_score: null,
137
- type: "Unknown",
138
- },
139
- evaluations: {
140
- greek_average: null
141
- },
142
- features: {
143
- is_moe: false,
144
- is_flagged: false,
145
- is_highlighted_by_maintainer: false,
146
- is_merged: false,
147
- is_not_available_on_hub: true,
148
- },
149
- metadata: {
150
- submission_date: new Date().toISOString(),
151
- },
152
- isMissing: true, // Mark as missing
153
- });
154
- }
155
- });
156
-
157
- // Sort the results
158
- filteredModels.sort((a, b) => {
159
- // Place missing models at the end
160
- if (a.isMissing && !b.isMissing) return 1;
161
- if (!a.isMissing && b.isMissing) return -1;
162
-
163
- // If both are missing or both are not missing, sort by average score
164
  if (a.model.average_score === null && b.model.average_score === null)
165
  return 0;
166
  if (a.model.average_score === null) return 1;
167
  if (b.model.average_score === null) return -1;
168
  return b.model.average_score - a.model.average_score;
169
  });
170
-
171
- return filteredModels.map((item, index) => ({
 
172
  ...item,
173
  static_rank: index + 1,
174
  }));
175
  }, [data, averageMode, visibleColumns]);
176
  };
177
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  // Common filtering logic
179
  export const useFilteredData = (
180
  processedData,
@@ -188,179 +227,17 @@ export const useFilteredData = (
188
  isOfficialProviderActive = false
189
  ) => {
190
  return useMemo(() => {
191
- const pinnedData = processedData.filter((row) => {
192
- return pinnedModels.includes(row.id);
193
- });
194
- const unpinnedData = processedData.filter((row) => {
195
- return !pinnedModels.includes(row.id);
196
- });
197
-
198
- let filteredUnpinned = unpinnedData;
199
-
200
- // Filter by official providers
201
- if (isOfficialProviderActive) {
202
- filteredUnpinned = filteredUnpinned.filter(
203
- (row) =>
204
- row.features?.is_highlighted_by_maintainer ||
205
- row.metadata?.is_highlighted_by_maintainer
206
- );
207
- }
208
-
209
- // Filter by precision
210
- if (selectedPrecisions.length > 0) {
211
- filteredUnpinned = filteredUnpinned.filter((row) =>
212
- selectedPrecisions.includes(row.model.precision)
213
- );
214
- }
215
-
216
- // Filter by type
217
- if (selectedTypes.length > 0) {
218
- filteredUnpinned = filteredUnpinned.filter((row) => {
219
- const modelType = row.model.type?.toLowerCase().trim();
220
- return selectedTypes.some((type) => modelType?.includes(type));
221
- });
222
- }
223
-
224
- // Filter by parameters
225
- filteredUnpinned = filteredUnpinned.filter((row) => {
226
- // Skip parameter filtering if no filter is active
227
- if (paramsRange[0] === -1 && paramsRange[1] === 140) return true;
228
-
229
- const params =
230
- row.metadata?.params_billions || row.features?.params_billions;
231
- if (params === undefined || params === null) return false;
232
- return params >= paramsRange[0] && params < paramsRange[1];
233
- });
234
-
235
- // Filter by search
236
- if (searchValue) {
237
- const searchQueries = searchValue
238
- .split(";")
239
- .map((q) => q.trim())
240
- .filter((q) => q);
241
- if (searchQueries.length > 0) {
242
- filteredUnpinned = filteredUnpinned.filter((row) => {
243
- return searchQueries.some((query) => {
244
- const { specialSearches, textSearch } = parseSearchQuery(query);
245
-
246
- const specialSearchMatch = specialSearches.every(
247
- ({ field, value }) => {
248
- const fieldValue = getValueByPath(row, field)
249
- ?.toString()
250
- .toLowerCase();
251
- return fieldValue?.includes(value.toLowerCase());
252
- }
253
- );
254
-
255
- if (!specialSearchMatch) return false;
256
- if (!textSearch) return true;
257
-
258
- const modelName = row.model.name.toLowerCase();
259
- const searchLower = textSearch.toLowerCase();
260
-
261
- if (looksLikeRegex(textSearch)) {
262
- try {
263
- const regex = new RegExp(textSearch, "i");
264
- return regex.test(modelName);
265
- } catch (e) {
266
- return modelName.includes(searchLower);
267
- }
268
- } else {
269
- return modelName.includes(searchLower);
270
- }
271
- });
272
- });
273
- }
274
- }
275
-
276
- // Filter by booleans
277
- if (selectedBooleanFilters.length > 0) {
278
- filteredUnpinned = filteredUnpinned.filter((row) => {
279
- return selectedBooleanFilters.every((filter) => {
280
- const filterValue =
281
- typeof filter === "object" ? filter.value : filter;
282
-
283
- // Maintainer's Highlight keeps positive logic
284
- if (filterValue === "is_highlighted_by_maintainer") {
285
- return row.features[filterValue];
286
- }
287
-
288
- // For all other filters, invert the logic
289
- if (filterValue === "is_not_available_on_hub") {
290
- return row.features[filterValue];
291
- }
292
-
293
- return !row.features[filterValue];
294
- });
295
- });
296
- }
297
-
298
- // Create ordered array of pinned models respecting pinnedModels order
299
- const orderedPinnedData = pinnedModels
300
- .map((pinnedModelId) =>
301
- pinnedData.find((item) => item.id === pinnedModelId)
302
- )
303
- .filter(Boolean);
304
-
305
- // Combine all filtered data
306
- const allFilteredData = [...filteredUnpinned, ...orderedPinnedData];
307
-
308
- // Sort all data by average_score for dynamic_rank
309
- const sortedByScore = [...allFilteredData].sort((a, b) => {
310
- // If the average scores differ, sort by score
311
- if (a.model.average_score !== b.model.average_score) {
312
- if (a.model.average_score === null && b.model.average_score === null)
313
- return 0;
314
- if (a.model.average_score === null) return 1;
315
- if (b.model.average_score === null) return -1;
316
- return b.model.average_score - a.model.average_score;
317
- }
318
-
319
- // If the scores are equal, compare the model name and the submission date
320
- if (a.model.name === b.model.name) {
321
- // If the names match, sort by submission date (most recent first)
322
- const dateA = new Date(a.metadata?.submission_date || 0);
323
- const dateB = new Date(b.metadata?.submission_date || 0);
324
- return dateB - dateA;
325
- }
326
-
327
- // If the names differ, sort by name
328
- return a.model.name.localeCompare(b.model.name);
329
- });
330
-
331
- // Create Map to store dynamic_ranks
332
- const dynamicRankMap = new Map();
333
- sortedByScore.forEach((item, index) => {
334
- dynamicRankMap.set(item.id, index + 1);
335
- });
336
-
337
- // Add ranks to final data
338
- const finalData = [...orderedPinnedData, ...filteredUnpinned].map(
339
- (item) => {
340
- return {
341
- ...item,
342
- dynamic_rank: dynamicRankMap.get(item.id),
343
- rank: item.isPinned
344
- ? pinnedModels.indexOf(item.id) + 1
345
- : rankingMode === "static"
346
- ? item.static_rank
347
- : dynamicRankMap.get(item.id),
348
- isPinned: pinnedModels.includes(item.id),
349
- };
350
- }
351
- );
352
-
353
- return finalData;
354
  }, [
355
  processedData,
356
- selectedPrecisions,
357
- selectedTypes,
358
- paramsRange,
359
- searchValue,
360
- selectedBooleanFilters,
361
  rankingMode,
362
  pinnedModels,
363
- isOfficialProviderActive,
364
  ]);
365
  };
366
 
 
6
  } from "../utils/searchUtils";
7
  import { ALLOWED_MODELS, isModelAllowed } from "../constants/allowedModels";
8
 
9
+ // Hard-coded score dataset
10
+ const HARDCODED_SCORES = {
11
+ vision: {
12
+ "GPT-4o": 55.54, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 16.27,
13
+ "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 14.97, "google/gemma-3-27b-it": 25.57,
14
+ "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 24.97, "TheFinAI/finma-7b-full": 0.00,
15
+ "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00,
16
+ "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "Qwen-VL-MAX": 18.47,
17
+ "LLaVA-1.6 Vicuna-13B": 19.77, "Deepseek-VL-7B-Chat": 19.10, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
18
+ "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
19
+ },
20
+ audio: {
21
+ "GPT-4o": 55.56, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 0.00,
22
+ "meta-llama/Llama-3.1-70B-Instruct": 0.00, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00,
23
+ "Qwen/Qwen2.5-32B-Instruct": 0.00, "Qwen/Qwen2.5-Omni-7B": 48.22, "TheFinAI/finma-7b-full": 0.00,
24
+ "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 0.00, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 0.00,
25
+ "TheFinAI/FinMA-ES-Bilingual": 0.00, "TheFinAI/plutus-8B-instruct": 0.00, "Qwen-VL-MAX": 0.00,
26
+ "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 51.58, "Qwen2-Audio-7B": 48.02,
27
+ "Qwen2-Audio-7B-Instruct": 50.06, "SALMONN-7B": 24.24, "SALMONN-13B": 24.59
28
+ },
29
+ english: {
30
+ "GPT-4o": 42.18, "o3-Mini": 20.20, "Deepseek-V3": 18.04, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 24.16,
31
+ "meta-llama/Llama-3.1-70B-Instruct": 38.71, "google/gemma-3-4b-it": 16.13, "google/gemma-3-27b-it": 17.19,
32
+ "Qwen/Qwen2.5-32B-Instruct": 32.01, "Qwen/Qwen2.5-Omni-7B": 24.99, "TheFinAI/finma-7b-full": 28.89,
33
+ "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 29.39, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 26.38,
34
+ "TheFinAI/FinMA-ES-Bilingual": 31.72, "TheFinAI/plutus-8B-instruct": 27.82, "Qwen-VL-MAX": 0.00,
35
+ "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
36
+ "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
37
+ },
38
+ chinese: {
39
+ "GPT-4o": 60.34, "o3-Mini": 0.00, "Deepseek-V3": 60.94, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 64.51,
40
+ "meta-llama/Llama-3.1-70B-Instruct": 56.74, "google/gemma-3-4b-it": 26.23, "google/gemma-3-27b-it": 26.24,
41
+ "Qwen/Qwen2.5-32B-Instruct": 56.62, "Qwen/Qwen2.5-Omni-7B": 53.09, "TheFinAI/finma-7b-full": 24.42,
42
+ "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 23.04, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 13.18,
43
+ "TheFinAI/FinMA-ES-Bilingual": 21.50, "TheFinAI/plutus-8B-instruct": 31.04, "Qwen-VL-MAX": 0.00,
44
+ "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
45
+ "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
46
+ },
47
+ japanese: {
48
+ "GPT-4o": 0.00, "o3-Mini": 0.00, "Deepseek-V3": 0.00, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 48.43,
49
+ "meta-llama/Llama-3.1-70B-Instruct": 32.17, "google/gemma-3-4b-it": 8.98, "google/gemma-3-27b-it": 23.96,
50
+ "Qwen/Qwen2.5-32B-Instruct": 4.54, "Qwen/Qwen2.5-Omni-7B": 44.35, "TheFinAI/finma-7b-full": 46.94,
51
+ "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 47.59, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 23.96,
52
+ "TheFinAI/FinMA-ES-Bilingual": 57.36, "TheFinAI/plutus-8B-instruct": 34.62, "Qwen-VL-MAX": 0.00,
53
+ "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
54
+ "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
55
+ },
56
+ spanish: {
57
+ "GPT-4o": 29.80, "o3-Mini": 4.53, "Deepseek-V3": 25.49, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 47.90,
58
+ "meta-llama/Llama-3.1-70B-Instruct": 37.84, "google/gemma-3-4b-it": 27.66, "google/gemma-3-27b-it": 27.77,
59
+ "Qwen/Qwen2.5-32B-Instruct": 37.47, "Qwen/Qwen2.5-Omni-7B": 39.16, "TheFinAI/finma-7b-full": 27.04,
60
+ "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 42.86, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 28.01,
61
+ "TheFinAI/FinMA-ES-Bilingual": 38.69, "TheFinAI/plutus-8B-instruct": 40.16, "Qwen-VL-MAX": 0.00,
62
+ "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
63
+ "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
64
+ },
65
+ greek: {
66
+ "GPT-4o": 43.04, "o3-Mini": 9.48, "Deepseek-V3": 39.07, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 48.95,
67
+ "meta-llama/Llama-3.1-70B-Instruct": 43.60, "google/gemma-3-4b-it": 15.45, "google/gemma-3-27b-it": 15.44,
68
+ "Qwen/Qwen2.5-32B-Instruct": 44.32, "Qwen/Qwen2.5-Omni-7B": 23.45, "TheFinAI/finma-7b-full": 17.93,
69
+ "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 29.49, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 20.91,
70
+ "TheFinAI/FinMA-ES-Bilingual": 15.47, "TheFinAI/plutus-8B-instruct": 60.19, "Qwen-VL-MAX": 0.00,
71
+ "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
72
+ "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
73
+ },
74
+ bilingual: {
75
+ "GPT-4o": 92.29, "o3-Mini": 90.13, "Deepseek-V3": 86.26, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 89.17,
76
+ "meta-llama/Llama-3.1-70B-Instruct": 92.13, "google/gemma-3-4b-it": 35.92, "google/gemma-3-27b-it": 35.92,
77
+ "Qwen/Qwen2.5-32B-Instruct": 92.29, "Qwen/Qwen2.5-Omni-7B": 91.80, "TheFinAI/finma-7b-full": 69.24,
78
+ "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 91.60, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 71.81,
79
+ "TheFinAI/FinMA-ES-Bilingual": 66.57, "TheFinAI/plutus-8B-instruct": 91.59, "Qwen-VL-MAX": 0.00,
80
+ "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
81
+ "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
82
+ },
83
+ multilingual: {
84
+ "GPT-4o": 6.53, "o3-Mini": 7.80, "Deepseek-V3": 36.99, "meta-llama/Llama-4-Scout-17B-16E-Instruct": 13.52,
85
+ "meta-llama/Llama-3.1-70B-Instruct": 21.97, "google/gemma-3-4b-it": 0.00, "google/gemma-3-27b-it": 0.00,
86
+ "Qwen/Qwen2.5-32B-Instruct": 18.48, "Qwen/Qwen2.5-Omni-7B": 16.29, "TheFinAI/finma-7b-full": 3.10,
87
+ "Duxiaoman-DI/Llama3.1-XuanYuan-FinX1-Preview": 1.76, "cyberagent/DeepSeek-R1-Distill-Qwen-32B-Japanese": 10.25,
88
+ "TheFinAI/FinMA-ES-Bilingual": 0.35, "TheFinAI/plutus-8B-instruct": 7.24, "Qwen-VL-MAX": 0.00,
89
+ "LLaVA-1.6 Vicuna-13B": 0.00, "Deepseek-VL-7B-Chat": 0.00, "Whisper-V3": 0.00, "Qwen2-Audio-7B": 0.00,
90
+ "Qwen2-Audio-7B-Instruct": 0.00, "SALMONN-7B": 0.00, "SALMONN-13B": 0.00
91
+ }
92
+ };
93
+
94
  // Calculate min/max averages
95
  export const useAverageRange = (data) => {
96
  return useMemo(() => {
 
124
  // Process data with boolean standardization
125
  export const useProcessedData = (data, averageMode, visibleColumns) => {
126
  return useMemo(() => {
127
+ // Build the model list directly from the hard-coded data
128
+ const modelList = [];
129
+
130
+ // Collect every model name from HARDCODED_SCORES
131
+ const modelNames = new Set();
132
+ Object.values(HARDCODED_SCORES).forEach(categoryData => {
133
+ Object.entries(categoryData).forEach(([modelName, score]) => {
134
+ // Add every model, regardless of whether its score is 0
135
+ modelNames.add(modelName);
136
+ });
137
+ });
138
+
139
+ // Create an entry for each model
140
+ Array.from(modelNames).forEach((modelName, index) => {
141
+ // Build the hard-coded evaluation data
142
+ const hardcodedEvaluations = {
143
+ vision_average: getHardcodedScore(modelName, 'vision'),
144
+ audio_average: getHardcodedScore(modelName, 'audio'),
145
+ english_average: getHardcodedScore(modelName, 'english'),
146
+ chinese_average: getHardcodedScore(modelName, 'chinese'),
147
+ japanese_average: getHardcodedScore(modelName, 'japanese'),
148
+ spanish_average: getHardcodedScore(modelName, 'spanish'),
149
+ greek_average: getHardcodedScore(modelName, 'greek'),
150
+ bilingual_average: getHardcodedScore(modelName, 'bilingual'),
151
+ multilingual_average: getHardcodedScore(modelName, 'multilingual')
152
  };
153
 
154
+ // Compute the overall average score (categories with a score of 0 are included)
155
+ const scores = Object.values(hardcodedEvaluations).filter(score => score !== null);
156
+ const averageScore = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : null;
 
 
 
 
 
 
 
 
 
157
 
158
+ // Create the model record
159
+ modelList.push({
160
+ id: `model-${index}`,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  model: {
162
+ name: modelName,
163
+ average_score: averageScore,
164
+ type: "chat", // 统一设为chat类型
165
  },
166
+ evaluations: hardcodedEvaluations,
167
+ features: {
168
+ is_moe: false,
169
+ is_flagged: false,
170
+ is_highlighted_by_maintainer: false,
171
+ is_merged: false,
172
+ is_not_available_on_hub: false,
173
+ },
174
+ metadata: {
175
+ submission_date: new Date().toISOString(),
176
+ },
177
+ isMissing: false,
178
+ });
179
  });
180
 
181
+ // Sort by average score
182
+ modelList.sort((a, b) => {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  if (a.model.average_score === null && b.model.average_score === null)
184
  return 0;
185
  if (a.model.average_score === null) return 1;
186
  if (b.model.average_score === null) return -1;
187
  return b.model.average_score - a.model.average_score;
188
  });
189
+
190
+ // Add the rank
191
+ return modelList.map((item, index) => ({
192
  ...item,
193
  static_rank: index + 1,
194
  }));
195
  }, [data, averageMode, visibleColumns]);
196
  };
197
 
198
+ // Helper: look up a score in the hard-coded data
199
+ function getHardcodedScore(modelName, category) {
200
+ if (!HARDCODED_SCORES[category]) return null;
201
+
202
+ // Try an exact match first
203
+ if (HARDCODED_SCORES[category][modelName] !== undefined) {
204
+ return HARDCODED_SCORES[category][modelName];
205
+ }
206
+
207
+ // Fall back to a partial (substring) match
208
+ for (const key in HARDCODED_SCORES[category]) {
209
+ if (modelName.includes(key) || key.includes(modelName)) {
210
+ return HARDCODED_SCORES[category][key];
211
+ }
212
+ }
213
+
214
+ return null;
215
+ }
216
+
217
  // Common filtering logic
218
  export const useFilteredData = (
219
  processedData,
 
227
  isOfficialProviderActive = false
228
  ) => {
229
  return useMemo(() => {
230
+ // Because the data is hard-coded, return all rows here without applying any filters
231
+ return processedData.map((item, index) => ({
232
+ ...item,
233
+ dynamic_rank: index + 1,
234
+ rank: rankingMode === "static" ? item.static_rank : index + 1,
235
+ isPinned: pinnedModels.includes(item.id),
236
+ }));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  }, [
238
  processedData,
 
 
 
 
 
239
  rankingMode,
240
  pinnedModels,
 
241
  ]);
242
  };
243
 
frontend/src/pages/LeaderboardPage/components/Leaderboard/hooks/useLeaderboardData.js CHANGED
@@ -8,60 +8,11 @@ const CACHE_KEY = "leaderboardData";
8
  const CACHE_DURATION = 5 * 60 * 1000; // 5 minutes
9
 
10
  export const useLeaderboardData = () => {
11
- const queryClient = useQueryClient();
12
- const [searchParams] = useSearchParams();
13
- const isInitialLoadRef = useRef(true);
14
-
15
- const { data, isLoading, error } = useQuery({
16
- queryKey: ["leaderboard"],
17
- queryFn: async () => {
18
- try {
19
- const cachedData = localStorage.getItem(CACHE_KEY);
20
- if (cachedData) {
21
- const { data: cached, timestamp } = JSON.parse(cachedData);
22
- const age = Date.now() - timestamp;
23
- if (age < CACHE_DURATION) {
24
- return cached;
25
- }
26
- }
27
-
28
- const response = await fetch("/api/leaderboard/formatted");
29
- if (!response.ok) {
30
- throw new Error(`HTTP error! status: ${response.status}`);
31
- }
32
-
33
- const newData = await response.json();
34
- localStorage.setItem(
35
- CACHE_KEY,
36
- JSON.stringify({
37
- data: newData,
38
- timestamp: Date.now(),
39
- })
40
- );
41
-
42
- return newData;
43
- } catch (error) {
44
- console.error("Detailed error:", error);
45
- throw error;
46
- }
47
- },
48
- staleTime: CACHE_DURATION,
49
- cacheTime: CACHE_DURATION * 2,
50
- refetchOnWindowFocus: false,
51
- enabled: isInitialLoadRef.current || !!searchParams.toString(),
52
- });
53
-
54
- useMemo(() => {
55
- if (data && isInitialLoadRef.current) {
56
- isInitialLoadRef.current = false;
57
- }
58
- }, [data]);
59
-
60
  return {
61
- data,
62
- isLoading,
63
- error,
64
- refetch: () => queryClient.invalidateQueries(["leaderboard"]),
65
  };
66
  };
67
 
 
8
  const CACHE_DURATION = 5 * 60 * 1000; // 5 minutes
9
 
10
  export const useLeaderboardData = () => {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  return {
12
+ data: [], // 直接返回空数组,我们使用硬编码数据
13
+ isLoading: false,
14
+ error: null,
15
+ refetch: () => {}
16
  };
17
  };
18
 
frontend/src/pages/LeaderboardPage/components/Leaderboard/utils/columnUtils.js CHANGED
@@ -499,6 +499,67 @@ const createGreekLeaderboardHeader = (header) => (
499
  </Box>
500
  );
501
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
502
  export const createColumns = (
503
  getColorForValue,
504
  scoreDisplay = "normalized",
@@ -928,6 +989,142 @@ export const createColumns = (
928
  }),
929
  },
930
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
931
  ];
932
 
933
  const optionalColumns = [
 
499
  </Box>
500
  );
501
 
502
+ // Create a custom header component for each type of leaderboard
503
+ const createLeaderboardHeader = (label, tooltip, linkUrl) => (header) => (
504
+ <Box
505
+ className="header-content"
506
+ sx={{
507
+ display: "flex",
508
+ alignItems: "center",
509
+ width: "100%",
510
+ position: "relative",
511
+ }}
512
+ >
513
+ <HeaderLabel
514
+ label={`${label} Leaderboard`}
515
+ tooltip={tooltip}
516
+ className="header-label"
517
+ isSorted={header?.column?.getIsSorted()}
518
+ />
519
+
520
+ <Box
521
+ sx={{
522
+ display: "flex",
523
+ alignItems: "center",
524
+ gap: 0.5,
525
+ ml: "auto",
526
+ flexShrink: 0,
527
+ }}
528
+ >
529
+ <InfoIcon tooltip={tooltip} />
530
+ {linkUrl && (
531
+ <Link
532
+ href={linkUrl}
533
+ target="_blank"
534
+ rel="noopener noreferrer"
535
+ aria-label={`View ${label} Leaderboard`}
536
+ sx={{
537
+ color: "info.main",
538
+ display: "flex",
539
+ alignItems: "center",
540
+ ml: 0.5,
541
+ textDecoration: "none",
542
+ "&:hover": {
543
+ textDecoration: "underline",
544
+ "& svg": {
545
+ opacity: 0.8,
546
+ },
547
+ },
548
+ }}
549
+ >
550
+ <OpenInNewIcon
551
+ sx={{
552
+ fontSize: "1rem",
553
+ opacity: 0.6,
554
+ transition: "opacity 0.2s ease-in-out",
555
+ }}
556
+ />
557
+ </Link>
558
+ )}
559
+ </Box>
560
+ </Box>
561
+ );
562
+
563
  export const createColumns = (
564
  getColorForValue,
565
  scoreDisplay = "normalized",
 
989
  }),
990
  },
991
  },
992
+ {
993
+ accessorKey: "evaluations.vision_average",
994
+ header: createLeaderboardHeader("Vision", "Average performance on vision tasks", null),
995
+ cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.vision_average"),
996
+ size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
997
+ meta: {
998
+ headerStyle: {
999
+ backgroundColor: (theme) => alpha(theme.palette.primary.light, 0.05),
1000
+ },
1001
+ cellStyle: (value) => ({
1002
+ position: "relative",
1003
+ overflow: "hidden",
1004
+ padding: "8px 16px",
1005
+ backgroundColor: (theme) => alpha(theme.palette.primary.light, 0.05),
1006
+ }),
1007
+ },
1008
+ },
1009
+ {
1010
+ accessorKey: "evaluations.audio_average",
1011
+ header: createLeaderboardHeader("Audio", "Average performance on audio tasks", null),
1012
+ cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.audio_average"),
1013
+ size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
1014
+ meta: {
1015
+ headerStyle: {
1016
+ backgroundColor: (theme) => alpha(theme.palette.secondary.light, 0.05),
1017
+ },
1018
+ cellStyle: (value) => ({
1019
+ position: "relative",
1020
+ overflow: "hidden",
1021
+ padding: "8px 16px",
1022
+ backgroundColor: (theme) => alpha(theme.palette.secondary.light, 0.05),
1023
+ }),
1024
+ },
1025
+ },
1026
+ {
1027
+ accessorKey: "evaluations.english_average",
1028
+ header: createLeaderboardHeader("English", "Average performance on English language tasks", null),
1029
+ cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.english_average"),
1030
+ size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
1031
+ meta: {
1032
+ headerStyle: {
1033
+ backgroundColor: (theme) => alpha(theme.palette.success.light, 0.05),
1034
+ },
1035
+ cellStyle: (value) => ({
1036
+ position: "relative",
1037
+ overflow: "hidden",
1038
+ padding: "8px 16px",
1039
+ backgroundColor: (theme) => alpha(theme.palette.success.light, 0.05),
1040
+ }),
1041
+ },
1042
+ },
1043
+ {
1044
+ accessorKey: "evaluations.chinese_average",
1045
+ header: createLeaderboardHeader("Chinese", "Average performance on Chinese language tasks", null),
1046
+ cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.chinese_average"),
1047
+ size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
1048
+ meta: {
1049
+ headerStyle: {
1050
+ backgroundColor: (theme) => alpha(theme.palette.warning.light, 0.05),
1051
+ },
1052
+ cellStyle: (value) => ({
1053
+ position: "relative",
1054
+ overflow: "hidden",
1055
+ padding: "8px 16px",
1056
+ backgroundColor: (theme) => alpha(theme.palette.warning.light, 0.05),
1057
+ }),
1058
+ },
1059
+ },
1060
+ {
1061
+ accessorKey: "evaluations.japanese_average",
1062
+ header: createLeaderboardHeader("Japanese", "Average performance on Japanese language tasks", null),
1063
+ cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.japanese_average"),
1064
+ size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
1065
+ meta: {
1066
+ headerStyle: {
1067
+ backgroundColor: (theme) => alpha(theme.palette.error.light, 0.05),
1068
+ },
1069
+ cellStyle: (value) => ({
1070
+ position: "relative",
1071
+ overflow: "hidden",
1072
+ padding: "8px 16px",
1073
+ backgroundColor: (theme) => alpha(theme.palette.error.light, 0.05),
1074
+ }),
1075
+ },
1076
+ },
1077
+ {
1078
+ accessorKey: "evaluations.spanish_average",
1079
+ header: createLeaderboardHeader("Spanish", "Average performance on Spanish language tasks", null),
1080
+ cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.spanish_average"),
1081
+ size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
1082
+ meta: {
1083
+ headerStyle: {
1084
+ backgroundColor: (theme) => alpha(theme.palette.info.main, 0.05),
1085
+ },
1086
+ cellStyle: (value) => ({
1087
+ position: "relative",
1088
+ overflow: "hidden",
1089
+ padding: "8px 16px",
1090
+ backgroundColor: (theme) => alpha(theme.palette.info.main, 0.05),
1091
+ }),
1092
+ },
1093
+ },
1094
+ {
1095
+ accessorKey: "evaluations.bilingual_average",
1096
+ header: createLeaderboardHeader("Bilingual", "Average performance on bilingual tasks", null),
1097
+ cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.bilingual_average"),
1098
+ size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
1099
+ meta: {
1100
+ headerStyle: {
1101
+ backgroundColor: (theme) => alpha(theme.palette.primary.main, 0.05),
1102
+ },
1103
+ cellStyle: (value) => ({
1104
+ position: "relative",
1105
+ overflow: "hidden",
1106
+ padding: "8px 16px",
1107
+ backgroundColor: (theme) => alpha(theme.palette.primary.main, 0.05),
1108
+ }),
1109
+ },
1110
+ },
1111
+ {
1112
+ accessorKey: "evaluations.multilingual_average",
1113
+ header: createLeaderboardHeader("Multilingual", "Average performance on multilingual tasks", null),
1114
+ cell: ({ row, getValue }) => createScoreCell(getValue, row, "evaluations.multilingual_average"),
1115
+ size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES["model.average_score"] || 100,
1116
+ meta: {
1117
+ headerStyle: {
1118
+ backgroundColor: (theme) => alpha(theme.palette.secondary.main, 0.05),
1119
+ },
1120
+ cellStyle: (value) => ({
1121
+ position: "relative",
1122
+ overflow: "hidden",
1123
+ padding: "8px 16px",
1124
+ backgroundColor: (theme) => alpha(theme.palette.secondary.main, 0.05),
1125
+ }),
1126
+ },
1127
+ }
1128
  ];
1129
 
1130
  const optionalColumns = [