David Pomerenke committed on
Commit
8274634
·
1 Parent(s): 9051509

Run on 100 languages, adjust display

Browse files
evals/backend.py CHANGED
@@ -20,6 +20,7 @@ models = pd.DataFrame(results["models"])
20
  def mean(lst):
21
  return sum(lst) / len(lst) if lst else None
22
 
 
23
 
24
  def make_model_table(df, models):
25
  df = (
@@ -29,7 +30,6 @@ def make_model_table(df, models):
29
  )
30
  df["task_metric"] = df["task"] + "_" + df["metric"]
31
  df = df.drop(columns=["task", "metric"])
32
- task_metrics = df["task_metric"].unique()
33
  df = df.pivot(index="model", columns="task_metric", values="score").fillna(0)
34
  df["average"] = df[task_metrics].mean(axis=1)
35
  df = df.sort_values(by="average", ascending=False).reset_index()
@@ -59,7 +59,6 @@ def make_language_table(df, languages):
59
  )
60
  df["task_metric"] = df["task"] + "_" + df["metric"]
61
  df = df.drop(columns=["task", "metric"])
62
- task_metrics = df["task_metric"].unique()
63
  df = (
64
  df.pivot(index="bcp_47", columns="task_metric", values="score")
65
  .fillna(0)
 
20
  def mean(lst):
21
  return sum(lst) / len(lst) if lst else None
22
 
23
+ task_metrics = ["translation_bleu", "classification_accuracy"]
24
 
25
  def make_model_table(df, models):
26
  df = (
 
30
  )
31
  df["task_metric"] = df["task"] + "_" + df["metric"]
32
  df = df.drop(columns=["task", "metric"])
 
33
  df = df.pivot(index="model", columns="task_metric", values="score").fillna(0)
34
  df["average"] = df[task_metrics].mean(axis=1)
35
  df = df.sort_values(by="average", ascending=False).reset_index()
 
59
  )
60
  df["task_metric"] = df["task"] + "_" + df["metric"]
61
  df = df.drop(columns=["task", "metric"])
 
62
  df = (
63
  df.pivot(index="bcp_47", columns="task_metric", values="score")
64
  .fillna(0)
evals/main.py CHANGED
@@ -6,16 +6,12 @@ import pandas as pd
6
  from tqdm.asyncio import tqdm_asyncio
7
 
8
  from languages import languages
9
- from models import model_fast, models
10
  from tasks import tasks
11
 
12
  # ===== config =====
13
 
14
- n_sentences = 30
15
- langs_eval = languages.iloc[:30]
16
- langs_eval_detailed = languages.iloc[:2]
17
- transcription_langs_eval = languages.iloc[:10]
18
- transcription_langs_eval_detailed = languages.iloc[:5]
19
 
20
  # ===== run evaluation and aggregate results =====
21
 
@@ -23,16 +19,12 @@ transcription_langs_eval_detailed = languages.iloc[:5]
23
  async def evaluate():
24
  print("running evaluations")
25
  results = [
26
- task(model, original_language.bcp_47, i)
27
  for task in tasks
28
  for i in range(n_sentences)
29
- for original_language in langs_eval.itertuples()
30
  for model in models["id"]
31
- if original_language.in_benchmark
32
- and (
33
- model == model_fast
34
- or original_language.bcp_47 in langs_eval_detailed.bcp_47.values
35
- )
36
  ]
37
  return await tqdm_asyncio.gather(*results, miniters=1)
38
 
 
6
  from tqdm.asyncio import tqdm_asyncio
7
 
8
  from languages import languages
9
+ from models import models
10
  from tasks import tasks
11
 
12
  # ===== config =====
13
 
14
+ n_sentences = 10
 
 
 
 
15
 
16
  # ===== run evaluation and aggregate results =====
17
 
 
19
  async def evaluate():
20
  print("running evaluations")
21
  results = [
22
+ task(model, lang.bcp_47, i)
23
  for task in tasks
24
  for i in range(n_sentences)
25
+ for lang in languages.iloc[:100].itertuples()
26
  for model in models["id"]
27
+ if lang.in_benchmark
 
 
 
 
28
  ]
29
  return await tqdm_asyncio.gather(*results, miniters=1)
30
 
evals/models.py CHANGED
@@ -14,23 +14,24 @@ from requests import HTTPError
14
  models = [
15
  "openai/gpt-4o-mini", # 0.6$/M tokens
16
  # "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive for dev
 
17
  "meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
18
  "meta-llama/llama-3.1-70b-instruct", # 0.3$/M tokens
19
  "meta-llama/llama-3-70b-instruct", # 0.4$/M tokens
20
- "mistralai/mistral-small-24b-instruct-2501", # 0.14$/M tokens
21
- "mistralai/mistral-nemo",
 
22
  "google/gemini-2.0-flash-001", # 0.4$/M tokens
23
- "google/gemini-2.0-flash-lite-001", # 0.3$/M tokens
24
  "google/gemma-3-27b-it", # 0.2$/M tokens
25
  # "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
26
- "qwen/qwq-32b",
27
- # "deepseek/deepseek-chat", # 1.3$/M tokens
28
  # "microsoft/phi-4", # 0.07$/M tokens; only 16k tokens context
29
- "microsoft/phi-4-multimodal-instruct",
30
  "amazon/nova-micro-v1", # 0.09$/M tokens
31
  # "openGPT-X/Teuken-7B-instruct-research-v0.4", # not on OpenRouter
32
  ]
33
- model_fast = "meta-llama/llama-3.3-70b-instruct"
34
 
35
  transcription_models = [
36
  "elevenlabs/scribe_v1",
@@ -38,7 +39,6 @@ transcription_models = [
38
  # "openai/whisper-small",
39
  # "facebook/seamless-m4t-v2-large",
40
  ]
41
- transcription_model_fast = "elevenlabs/scribe_v1"
42
 
43
  load_dotenv()
44
  client = AsyncOpenAI(
 
14
  models = [
15
  "openai/gpt-4o-mini", # 0.6$/M tokens
16
  # "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive for dev
17
+ "meta-llama/llama-4-maverick", # 0.6$/M tokens
18
  "meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
19
  "meta-llama/llama-3.1-70b-instruct", # 0.3$/M tokens
20
  "meta-llama/llama-3-70b-instruct", # 0.4$/M tokens
21
+ "mistralai/mistral-small-3.1-24b-instruct", # 0.3$/M tokens
22
+ # "mistralai/mistral-saba", # 0.6$/M tokens
23
+ # "mistralai/mistral-nemo", # 0.08$/M tokens
24
  "google/gemini-2.0-flash-001", # 0.4$/M tokens
25
+ # "google/gemini-2.0-flash-lite-001", # 0.3$/M tokens
26
  "google/gemma-3-27b-it", # 0.2$/M tokens
27
  # "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
28
+ "qwen/qwq-32b", # 0.2$/M tokens
29
+ "deepseek/deepseek-chat-v3-0324", # 1.1$/M tokens
30
  # "microsoft/phi-4", # 0.07$/M tokens; only 16k tokens context
31
+ "microsoft/phi-4-multimodal-instruct", # 0.1$/M tokens
32
  "amazon/nova-micro-v1", # 0.09$/M tokens
33
  # "openGPT-X/Teuken-7B-instruct-research-v0.4", # not on OpenRouter
34
  ]
 
35
 
36
  transcription_models = [
37
  "elevenlabs/scribe_v1",
 
39
  # "openai/whisper-small",
40
  # "facebook/seamless-m4t-v2-large",
41
  ]
 
42
 
43
  load_dotenv()
44
  client = AsyncOpenAI(
evals/tasks.py CHANGED
@@ -212,6 +212,6 @@ async def transcribe_and_evaluate(model, language_bcp_47, nr):
212
  tasks = [
213
  translate_and_evaluate,
214
  classify_and_evaluate,
215
- mlm_and_evaluate,
216
  # transcribe_and_evaluate,
217
  ]
 
212
  tasks = [
213
  translate_and_evaluate,
214
  classify_and_evaluate,
215
+ # mlm_and_evaluate,
216
  # transcribe_and_evaluate,
217
  ]
frontend/src/components/LanguageTable.js CHANGED
@@ -174,12 +174,12 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages }) => {
174
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
175
  />
176
  <Column
177
- field='translation_chrf'
178
  header='Translation'
179
  sortable
180
- body={scoreBodyTemplate('translation_chrf', {
181
- minScore: 0.3,
182
- maxScore: 0.6
183
  })}
184
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
185
  />
@@ -188,12 +188,12 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages }) => {
188
  header='Classification'
189
  sortable
190
  body={scoreBodyTemplate('classification_accuracy', {
191
- minScore: 0.3,
192
- maxScore: 0.7
193
  })}
194
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
195
  />
196
- <Column
197
  field='language_modeling_chrf'
198
  header='Language Modeling'
199
  sortable
@@ -202,7 +202,7 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages }) => {
202
  maxScore: 1
203
  })}
204
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
205
- />
206
  </DataTable>
207
  )
208
  }
 
174
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
175
  />
176
  <Column
177
+ field='translation_bleu'
178
  header='Translation'
179
  sortable
180
+ body={scoreBodyTemplate('translation_bleu', {
181
+ minScore: 0,
182
+ maxScore: 0.5
183
  })}
184
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
185
  />
 
188
  header='Classification'
189
  sortable
190
  body={scoreBodyTemplate('classification_accuracy', {
191
+ minScore: 0,
192
+ maxScore: 0.5
193
  })}
194
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
195
  />
196
+ {/* <Column
197
  field='language_modeling_chrf'
198
  header='Language Modeling'
199
  sortable
 
202
  maxScore: 1
203
  })}
204
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
205
+ /> */}
206
  </DataTable>
207
  )
208
  }
frontend/src/components/ModelTable.js CHANGED
@@ -103,18 +103,29 @@ const ModelTable = ({ data }) => {
103
  return <div style={{ textAlign: 'center' }}>{sizeStr}</div>
104
  }
105
 
106
- const capitalize = s => String(s).charAt(0).toUpperCase() + String(s).slice(1)
 
 
 
 
 
107
 
108
  const providerBodyTemplate = rowData => {
109
- const providerName = rowData.model.split('/')[0].split('-').map(capitalize).join(' ')
 
 
 
 
110
  return providerName
111
  }
112
 
113
  const modelBodyTemplate = rowData => {
114
- const modelName = rowData.model.split('/')[1].split('-').map(capitalize).join(' ')
115
- return (
116
- <div style={{ fontWeight: 'bold', height: '100%' }}>{modelName}</div>
117
- )
 
 
118
  }
119
 
120
  const typeBodyTemplate = rowData => {
@@ -148,7 +159,12 @@ const ModelTable = ({ data }) => {
148
  style={{ width: '800px', minHeight: '650px' }}
149
  >
150
  <Column field='rank' body={rankBodyTemplate} />
151
- <Column field='provider' header='Provider' style={{ minWidth: '5rem' }} body={providerBodyTemplate} />
 
 
 
 
 
152
  <Column
153
  field='model'
154
  header='Model'
@@ -178,16 +194,16 @@ const ModelTable = ({ data }) => {
178
  field='average'
179
  header='Average'
180
  sortable
181
- body={scoreBodyTemplate('average', { minScore: 0.3, maxScore: 0.6 })}
182
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
183
  />
184
  <Column
185
- field='translation_chrf'
186
  header='Translation'
187
  sortable
188
- body={scoreBodyTemplate('translation_chrf', {
189
- minScore: 0.3,
190
- maxScore: 0.7
191
  })}
192
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
193
  />
@@ -196,12 +212,12 @@ const ModelTable = ({ data }) => {
196
  header='Classification'
197
  sortable
198
  body={scoreBodyTemplate('classification_accuracy', {
199
- minScore: 0.3,
200
- maxScore: 0.8
201
  })}
202
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
203
  />
204
- <Column
205
  field='language_modeling_chrf'
206
  header='Language Modeling'
207
  sortable
@@ -210,7 +226,7 @@ const ModelTable = ({ data }) => {
210
  maxScore: 1
211
  })}
212
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
213
- />
214
  </DataTable>
215
  )
216
  }
 
103
  return <div style={{ textAlign: 'center' }}>{sizeStr}</div>
104
  }
105
 
106
+ const capitalize = s =>
107
+ (String(s).charAt(0).toUpperCase() + String(s).slice(1))
108
+ .replace(/gpt/i, 'GPT')
109
+ .replace(/qwq/i, 'QwQ')
110
+ .replace(/deepseek/i, 'DeepSeek')
111
+ .replace(/openai/i, 'OpenAI')
112
 
113
  const providerBodyTemplate = rowData => {
114
+ const providerName = rowData.model
115
+ .split('/')[0]
116
+ .split('-')
117
+ .map(capitalize)
118
+ .join(' ')
119
  return providerName
120
  }
121
 
122
  const modelBodyTemplate = rowData => {
123
+ const modelName = rowData.model
124
+ .split('/')[1]
125
+ .split('-')
126
+ .map(capitalize)
127
+ .join(' ')
128
+ return <div style={{ fontWeight: 'bold', height: '100%' }}>{modelName}</div>
129
  }
130
 
131
  const typeBodyTemplate = rowData => {
 
159
  style={{ width: '800px', minHeight: '650px' }}
160
  >
161
  <Column field='rank' body={rankBodyTemplate} />
162
+ <Column
163
+ field='provider'
164
+ header='Provider'
165
+ style={{ minWidth: '7rem' }}
166
+ body={providerBodyTemplate}
167
+ />
168
  <Column
169
  field='model'
170
  header='Model'
 
194
  field='average'
195
  header='Average'
196
  sortable
197
+ body={scoreBodyTemplate('average', { minScore: 0, maxScore: 0.6 })}
198
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
199
  />
200
  <Column
201
+ field='translation_bleu'
202
  header='Translation'
203
  sortable
204
+ body={scoreBodyTemplate('translation_bleu', {
205
+ minScore: 0,
206
+ maxScore: 0.3
207
  })}
208
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
209
  />
 
212
  header='Classification'
213
  sortable
214
  body={scoreBodyTemplate('classification_accuracy', {
215
+ minScore: 0,
216
+ maxScore: 0.9
217
  })}
218
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
219
  />
220
+ {/* <Column
221
  field='language_modeling_chrf'
222
  header='Language Modeling'
223
  sortable
 
226
  maxScore: 1
227
  })}
228
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
229
+ /> */}
230
  </DataTable>
231
  )
232
  }
results.json CHANGED
The diff for this file is too large to render. See raw diff