Clémentine committed
Commit 014d36a
1 Parent(s): 1de2d20

updated tooltips

frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/defaults.js CHANGED
@@ -44,13 +44,13 @@ const FILTERS = {
       hide: true,
     },
     {
-      value: "is_flagged",
-      label: "Potentially contaminated model",
+      value: "is_merged",
+      label: "Merged model",
       hide: true,
     },
     {
-      value: "is_merged",
-      label: "Merged model",
+      value: "is_flagged",
+      label: "Potentially contaminated model",
       hide: true,
     },
     {
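
For orientation, a minimal sketch of the two reordered boolean-filter entries as they would read after the swap; only value, label, and hide appear in the diff, so the array name and any surrounding structure are assumptions for illustration.

// Hypothetical excerpt of the reordered filter entries after this commit.
const REORDERED_FILTER_ENTRIES = [
  {
    value: "is_merged",
    label: "Merged model",
    hide: true,
  },
  {
    value: "is_flagged",
    label: "Potentially contaminated model",
    hide: true,
  },
];
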
frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/quickFilters.js CHANGED
@@ -11,8 +11,8 @@ export const QUICK_FILTER_PRESETS = [
   },
   {
     id: 'small_models',
-    label: 'For consumers',
-    shortDescription: 'Smol-LMs: Up to 7B parameters',
+    label: 'For Consumers',
+    shortDescription: 'Smol-LMs: 3-7B parameters',
     description: 'Lightweight models optimized for consumer hardware with up to one GPU. Ideal for private consumer hardware.',
     filters: {
       paramsRange: [3, 7],
@@ -21,7 +21,7 @@ export const QUICK_FILTER_PRESETS = [
   },
   {
     id: 'medium_models',
-    label: 'For production',
+    label: 'Mid-range',
     shortDescription: 'Medium-sized models: 7B-65B parameters',
     description: 'Overall balance between performance and required resources.',
     filters: {
@@ -33,7 +33,7 @@ export const QUICK_FILTER_PRESETS = [
     id: 'large_models',
     label: 'For the GPU-rich',
     shortDescription: 'Large models: 65B+ parameters',
-    description: 'Large-scale models offering (in theory) the best performance but requiring significant resources. Requires adapted infrastructure.',
+    description: 'Large-scale models offering (in theory) the best performance but requiring significant resources. Require adapted infrastructure.',
     filters: {
       paramsRange: [65, 140],
       selectedBooleanFilters: []
@@ -43,7 +43,7 @@ export const QUICK_FILTER_PRESETS = [
     id: 'official_providers',
     label: 'Only Official Providers',
     shortDescription: 'Officially provided models',
-    description: 'Models that are officially provided and maintained by their original creators or organizations.',
+    description: 'Models that are officially provided and maintained by official creators or organizations.',
     filters: {
       selectedBooleanFilters: ['is_highlighted_by_maintainer']
     }
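
Put together, one updated preset entry would plausibly read like the sketch below; field names follow the visible diff context, and the selectedBooleanFilters default is an assumption based on the other presets.

// Hypothetical shape of the 'small_models' preset after this commit.
const smallModelsPreset = {
  id: 'small_models',
  label: 'For Consumers',
  shortDescription: 'Smol-LMs: 3-7B parameters',
  description:
    'Lightweight models optimized for consumer hardware with up to one GPU. Ideal for private consumer hardware.',
  filters: {
    paramsRange: [3, 7],        // parameter count range, in billions
    selectedBooleanFilters: [], // assumed default, as in the other presets
  },
};
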
frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/tooltips.js CHANGED
@@ -48,15 +48,15 @@ export const COLUMN_TOOLTIPS = {
       subItems: ["Instruction following", "Formatting", "Generation"],
     },
     {
-      label: "Scoring",
-      description: "Accuracy: was the format asked for strictly respected.",
+      label: "Scoring: Accuracy",
+      description: "Was the format asked for strictly respected.",
     },
   ]),
 
   BBH: createTooltipContent("Big Bench Hard (BBH):", [
     {
       label: "Overview",
-      description: "Collection of challenging for LLM tasks across domains",
+      description: "Collection of challenging for LLM tasks across domains, for example",
       subItems: [
         "Language understanding",
         "Mathematical reasoning",
@@ -64,9 +64,9 @@ export const COLUMN_TOOLTIPS = {
       ],
     },
     {
-      label: "Scoring",
+      label: "Scoring: Accuracy",
       description:
-        "Accuracy: was the correct choice selected among the options.",
+        "Was the correct choice selected among the options.",
     },
   ]),
 
@@ -79,9 +79,9 @@ export const COLUMN_TOOLTIPS = {
       subItems: ["Complex algebra", "Geometry problems", "Advanced calculus"],
     },
     {
-      label: "Evaluation",
+      label: "Scoring: Exact match",
       description:
-        "Accuracy: is the solution generated correct and in the expected format",
+        "Was the solution generated correct and in the expected format",
     },
   ]
   ),
@@ -91,15 +91,15 @@ export const COLUMN_TOOLTIPS = {
       label: "Focus",
       description: "PhD-level knowledge multiple choice questions in science",
       subItems: [
-        "PhD-level chemistry",
-        "PhD-level biology",
-        "PhD-level physics",
+        "Chemistry",
+        "Biology",
+        "Physics",
       ],
     },
     {
-      label: "Methodology",
+      label: "Scoring: Accuracy",
       description:
-        "Accuracy: was the correct choice selected among the options.",
+        "Was the correct choice selected among the options.",
     },
   ]),
 
@@ -114,9 +114,9 @@ export const COLUMN_TOOLTIPS = {
       ],
     },
     {
-      label: "Scoring",
+      label: "Scoring: Accuracy",
       description:
-        "Accuracy: was the correct choice selected among the options.",
+        "Was the correct choice selected among the options.",
     },
   ]),
 
@@ -125,7 +125,7 @@ export const COLUMN_TOOLTIPS = {
     [
       {
         label: "Coverage",
-        description: "Expertly reviewed multichoice questions across domains",
+        description: "Expertly reviewed multichoice questions across domains, for example:",
         subItems: [
           "Medicine and healthcare",
           "Law and ethics",
@@ -134,9 +134,9 @@ export const COLUMN_TOOLTIPS = {
       ],
     },
     {
-      label: "Evaluation",
+      label: "Scoring: Accuracy",
       description:
-        "Accuracy: was the correct choice selected among the options.",
+        "Was the correct choice selected among the options.",
     },
   ]
   ),
@@ -146,19 +146,21 @@ export const COLUMN_TOOLTIPS = {
       label: "Definition",
       description: "The fundamental structure and design of the model",
       subItems: [
-        "Base architecture type (e.g., Llama, Mistral, GPT-J)",
-        "Specific architectural innovations and improvements",
-        "Model family and version information",
-        "Core design principles and techniques used",
+        "Pretrained: Foundational models, initially trained on large datasets without task-specific tuning, serving as a versatile base for further development.",
+        "Continuously Pretrained: Base models trained with a data mix evolving as the model is trained, with the addition of specialized data during the last training steps.",
+        "Fine-tuned: Base models, fine-tuned on specialised domain data (legal, medical, ...), and optimized for particular tasks.",
+        "Chat: Models fine-tuned with IFT, RLHF, DPO, and other techniques, to handle conversational contexts effectively.",
+        "Merged: Combining multiple models through weights averaging or similar methods.",
+        "Multimodal: Models which can handle several modalities (text & image/audio/video/...). We only evaluate the text capabilities.",
       ],
     },
     {
       label: "Impact",
       description: "How architecture affects model capabilities",
       subItems: [
-        "Influences model's learning capacity and efficiency",
-        "Determines hardware compatibility and requirements",
-        "Affects inference speed and memory usage",
+        "Base models are expected to perform less well on instruction following evaluations, like IFEval.",
+        "Fine-tuned and chat models can be more verbose and more chatty than base models.",
+        "Merged models tend to exhibit good performance on benchmarks, which do not translate to real-world situations.",
       ],
     },
   ]),
@@ -169,10 +171,10 @@ export const COLUMN_TOOLTIPS = {
       description:
         "Data format used to store model weights and perform computations",
       subItems: [
-        "BFloat16: Brain Float format, good for training stability",
-        "Float16: Half precision, balances accuracy and speed",
-        "Int8/Int4: Quantized formats for efficiency",
-        "GPTQ/AWQ: Advanced quantization techniques",
+        "bfloat16: Half precision (Brain Float format), good for stability",
+        "float16: Half precision",
+        "8bit/4bit: Quantized formats, for efficiency",
+        "GPTQ/AWQ: Quantized methods",
       ],
     },
     {
@@ -181,40 +183,28 @@ export const COLUMN_TOOLTIPS = {
       subItems: [
         "Higher precision = better accuracy but more memory usage",
         "Lower precision = faster inference and smaller size",
-        "Different hardware compatibility requirements",
         "Trade-off between model quality and resource usage",
       ],
     },
-    {
-      label: "Use Cases",
-      description: "Choosing the right precision format",
-      subItems: [
-        "Production deployment optimization",
-        "Resource-constrained environments",
-        "High-performance computing scenarios",
-      ],
-    },
   ]),
 
   FLAGS: createTooltipContent("Model Flags and Special Features:", [
     {
-      label: "Purpose",
-      description: "Special indicators and capabilities of the model",
+      label: "Filters",
       subItems: [
-        "Safeguards and content filtering features",
-        "Specialized training techniques used",
-        "Hardware optimization flags",
-        "Deployment-specific configurations",
+        "Mixture of Expert: Uses a MoE architecture",
+        "Merged models: Created by averaging other models",
+        "Contaminated: Flagged by users from the community for (possibly accidental) cheating",
+        "Unavailable: No longer on the hub (private, deleted) or missing a license tag",
      ],
     },
     {
-      label: "Common Flags",
-      description: "Frequently used model indicators",
+      label: "Purpose",
+      description: "Why do people want to hide these models?",
       subItems: [
-        "RLHF: Reinforcement Learning from Human Feedback",
-        "DPO: Direct Preference Optimization",
-        "MoE: Mixture of Experts architecture",
-        "Flash Attention: Optimized attention implementation",
+        "Mixture of Experts: These models can be too parameter heavy",
+        "Merged models: Performance on benchmarks tend to be inflated compared to real life usage",
+        "Contaminated: Performance on benchmarks is inflated and not reflecting real life usage",
       ],
     },
   ]),
@@ -260,7 +250,6 @@ export const COLUMN_TOOLTIPS = {
       subItems: [
         "Large models can have significant carbon footprints",
         "Helps make informed choices about model selection",
-        "Promotes awareness of AI's environmental impact",
       ],
     },
     {
@@ -332,12 +321,12 @@ export const UI_TOOLTIPS = {
         "Efficient models for edge devices, optimized for blazing fast inference.",
     },
     {
-      label: "Smol Models (1.7B-7B)",
+      label: "Smol Models (3B-7B)",
       description:
-        "Efficient models for consumer hardware and edge devices, optimized for fast inference.",
+        "Efficient models for consumer hardware, optimized for fast inference.",
     },
     {
-      label: "Middle ground models (7B-65B)",
+      label: "Mid-range models (7B-65B)",
       description:
         "A bit of everything here, with overall balanced performance and resource usage around 30B.",
     },
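
The diff only shows call sites of createTooltipContent; the helper itself is not part of this commit. A minimal sketch, assuming it simply packages a title with a list of { label, description, subItems } items; the function body, the example key, and its title are assumptions for illustration only.

// Assumed helper signature and data shape, not the actual implementation.
function createTooltipContent(title, items) {
  // items: array of { label, description?, subItems? }
  return { title, items };
}

// Example call mirroring the updated "Scoring: Accuracy" pattern from the diff.
const EXAMPLE_TOOLTIP = createTooltipContent("Example benchmark:", [
  {
    label: "Scoring: Accuracy",
    description: "Was the correct choice selected among the options.",
  },
]);
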