David Pomerenke
committed on
Commit
·
8274634
1
Parent(s):
9051509
Run on 100 languages, adjust display
Browse files
- evals/backend.py +1 -2
- evals/main.py +5 -13
- evals/models.py +8 -8
- evals/tasks.py +1 -1
- frontend/src/components/LanguageTable.js +8 -8
- frontend/src/components/ModelTable.js +32 -16
- results.json +0 -0
evals/backend.py
CHANGED
@@ -20,6 +20,7 @@ models = pd.DataFrame(results["models"])
|
|
20 |
def mean(lst):
|
21 |
return sum(lst) / len(lst) if lst else None
|
22 |
|
|
|
23 |
|
24 |
def make_model_table(df, models):
|
25 |
df = (
|
@@ -29,7 +30,6 @@ def make_model_table(df, models):
|
|
29 |
)
|
30 |
df["task_metric"] = df["task"] + "_" + df["metric"]
|
31 |
df = df.drop(columns=["task", "metric"])
|
32 |
-
task_metrics = df["task_metric"].unique()
|
33 |
df = df.pivot(index="model", columns="task_metric", values="score").fillna(0)
|
34 |
df["average"] = df[task_metrics].mean(axis=1)
|
35 |
df = df.sort_values(by="average", ascending=False).reset_index()
|
@@ -59,7 +59,6 @@ def make_language_table(df, languages):
|
|
59 |
)
|
60 |
df["task_metric"] = df["task"] + "_" + df["metric"]
|
61 |
df = df.drop(columns=["task", "metric"])
|
62 |
-
task_metrics = df["task_metric"].unique()
|
63 |
df = (
|
64 |
df.pivot(index="bcp_47", columns="task_metric", values="score")
|
65 |
.fillna(0)
|
|
|
20 |
def mean(lst):
|
21 |
return sum(lst) / len(lst) if lst else None
|
22 |
|
23 |
+
task_metrics = ["translation_bleu", "classification_accuracy"]
|
24 |
|
25 |
def make_model_table(df, models):
|
26 |
df = (
|
|
|
30 |
)
|
31 |
df["task_metric"] = df["task"] + "_" + df["metric"]
|
32 |
df = df.drop(columns=["task", "metric"])
|
|
|
33 |
df = df.pivot(index="model", columns="task_metric", values="score").fillna(0)
|
34 |
df["average"] = df[task_metrics].mean(axis=1)
|
35 |
df = df.sort_values(by="average", ascending=False).reset_index()
|
|
|
59 |
)
|
60 |
df["task_metric"] = df["task"] + "_" + df["metric"]
|
61 |
df = df.drop(columns=["task", "metric"])
|
|
|
62 |
df = (
|
63 |
df.pivot(index="bcp_47", columns="task_metric", values="score")
|
64 |
.fillna(0)
|
evals/main.py
CHANGED
@@ -6,16 +6,12 @@ import pandas as pd
|
|
6 |
from tqdm.asyncio import tqdm_asyncio
|
7 |
|
8 |
from languages import languages
|
9 |
-
from models import
|
10 |
from tasks import tasks
|
11 |
|
12 |
# ===== config =====
|
13 |
|
14 |
-
n_sentences =
|
15 |
-
langs_eval = languages.iloc[:30]
|
16 |
-
langs_eval_detailed = languages.iloc[:2]
|
17 |
-
transcription_langs_eval = languages.iloc[:10]
|
18 |
-
transcription_langs_eval_detailed = languages.iloc[:5]
|
19 |
|
20 |
# ===== run evaluation and aggregate results =====
|
21 |
|
@@ -23,16 +19,12 @@ transcription_langs_eval_detailed = languages.iloc[:5]
|
|
23 |
async def evaluate():
|
24 |
print("running evaluations")
|
25 |
results = [
|
26 |
-
task(model,
|
27 |
for task in tasks
|
28 |
for i in range(n_sentences)
|
29 |
-
for
|
30 |
for model in models["id"]
|
31 |
-
if
|
32 |
-
and (
|
33 |
-
model == model_fast
|
34 |
-
or original_language.bcp_47 in langs_eval_detailed.bcp_47.values
|
35 |
-
)
|
36 |
]
|
37 |
return await tqdm_asyncio.gather(*results, miniters=1)
|
38 |
|
|
|
6 |
from tqdm.asyncio import tqdm_asyncio
|
7 |
|
8 |
from languages import languages
|
9 |
+
from models import models
|
10 |
from tasks import tasks
|
11 |
|
12 |
# ===== config =====
|
13 |
|
14 |
+
n_sentences = 10
|
|
|
|
|
|
|
|
|
15 |
|
16 |
# ===== run evaluation and aggregate results =====
|
17 |
|
|
|
19 |
async def evaluate():
|
20 |
print("running evaluations")
|
21 |
results = [
|
22 |
+
task(model, lang.bcp_47, i)
|
23 |
for task in tasks
|
24 |
for i in range(n_sentences)
|
25 |
+
for lang in languages.iloc[:100].itertuples()
|
26 |
for model in models["id"]
|
27 |
+
if lang.in_benchmark
|
|
|
|
|
|
|
|
|
28 |
]
|
29 |
return await tqdm_asyncio.gather(*results, miniters=1)
|
30 |
|
evals/models.py
CHANGED
@@ -14,23 +14,24 @@ from requests import HTTPError
|
|
14 |
models = [
|
15 |
"openai/gpt-4o-mini", # 0.6$/M tokens
|
16 |
# "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive for dev
|
|
|
17 |
"meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
|
18 |
"meta-llama/llama-3.1-70b-instruct", # 0.3$/M tokens
|
19 |
"meta-llama/llama-3-70b-instruct", # 0.4$/M tokens
|
20 |
-
"mistralai/mistral-small-24b-instruct
|
21 |
-
"mistralai/mistral-
|
|
|
22 |
"google/gemini-2.0-flash-001", # 0.4$/M tokens
|
23 |
-
"google/gemini-2.0-flash-lite-001", # 0.3$/M tokens
|
24 |
"google/gemma-3-27b-it", # 0.2$/M tokens
|
25 |
# "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
|
26 |
-
"qwen/qwq-32b",
|
27 |
-
|
28 |
# "microsoft/phi-4", # 0.07$/M tokens; only 16k tokens context
|
29 |
-
"microsoft/phi-4-multimodal-instruct",
|
30 |
"amazon/nova-micro-v1", # 0.09$/M tokens
|
31 |
# "openGPT-X/Teuken-7B-instruct-research-v0.4", # not on OpenRouter
|
32 |
]
|
33 |
-
model_fast = "meta-llama/llama-3.3-70b-instruct"
|
34 |
|
35 |
transcription_models = [
|
36 |
"elevenlabs/scribe_v1",
|
@@ -38,7 +39,6 @@ transcription_models = [
|
|
38 |
# "openai/whisper-small",
|
39 |
# "facebook/seamless-m4t-v2-large",
|
40 |
]
|
41 |
-
transcription_model_fast = "elevenlabs/scribe_v1"
|
42 |
|
43 |
load_dotenv()
|
44 |
client = AsyncOpenAI(
|
|
|
14 |
models = [
|
15 |
"openai/gpt-4o-mini", # 0.6$/M tokens
|
16 |
# "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive for dev
|
17 |
+
"meta-llama/llama-4-maverick", # 0.6$/M tokens
|
18 |
"meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
|
19 |
"meta-llama/llama-3.1-70b-instruct", # 0.3$/M tokens
|
20 |
"meta-llama/llama-3-70b-instruct", # 0.4$/M tokens
|
21 |
+
"mistralai/mistral-small-3.1-24b-instruct", # 0.3$/M tokens
|
22 |
+
# "mistralai/mistral-saba", # 0.6$/M tokens
|
23 |
+
# "mistralai/mistral-nemo", # 0.08$/M tokens
|
24 |
"google/gemini-2.0-flash-001", # 0.4$/M tokens
|
25 |
+
# "google/gemini-2.0-flash-lite-001", # 0.3$/M tokens
|
26 |
"google/gemma-3-27b-it", # 0.2$/M tokens
|
27 |
# "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
|
28 |
+
"qwen/qwq-32b", # 0.2$/M tokens
|
29 |
+
"deepseek/deepseek-chat-v3-0324", # 1.1$/M tokens
|
30 |
# "microsoft/phi-4", # 0.07$/M tokens; only 16k tokens context
|
31 |
+
"microsoft/phi-4-multimodal-instruct", # 0.1$/M tokens
|
32 |
"amazon/nova-micro-v1", # 0.09$/M tokens
|
33 |
# "openGPT-X/Teuken-7B-instruct-research-v0.4", # not on OpenRouter
|
34 |
]
|
|
|
35 |
|
36 |
transcription_models = [
|
37 |
"elevenlabs/scribe_v1",
|
|
|
39 |
# "openai/whisper-small",
|
40 |
# "facebook/seamless-m4t-v2-large",
|
41 |
]
|
|
|
42 |
|
43 |
load_dotenv()
|
44 |
client = AsyncOpenAI(
|
evals/tasks.py
CHANGED
@@ -212,6 +212,6 @@ async def transcribe_and_evaluate(model, language_bcp_47, nr):
|
|
212 |
tasks = [
|
213 |
translate_and_evaluate,
|
214 |
classify_and_evaluate,
|
215 |
-
mlm_and_evaluate,
|
216 |
# transcribe_and_evaluate,
|
217 |
]
|
|
|
212 |
tasks = [
|
213 |
translate_and_evaluate,
|
214 |
classify_and_evaluate,
|
215 |
+
# mlm_and_evaluate,
|
216 |
# transcribe_and_evaluate,
|
217 |
]
|
frontend/src/components/LanguageTable.js
CHANGED
@@ -174,12 +174,12 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages }) => {
|
|
174 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
175 |
/>
|
176 |
<Column
|
177 |
-
field='
|
178 |
header='Translation'
|
179 |
sortable
|
180 |
-
body={scoreBodyTemplate('
|
181 |
-
minScore: 0
|
182 |
-
maxScore: 0.
|
183 |
})}
|
184 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
185 |
/>
|
@@ -188,12 +188,12 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages }) => {
|
|
188 |
header='Classification'
|
189 |
sortable
|
190 |
body={scoreBodyTemplate('classification_accuracy', {
|
191 |
-
minScore: 0
|
192 |
-
maxScore: 0.
|
193 |
})}
|
194 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
195 |
/>
|
196 |
-
<Column
|
197 |
field='language_modeling_chrf'
|
198 |
header='Language Modeling'
|
199 |
sortable
|
@@ -202,7 +202,7 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages }) => {
|
|
202 |
maxScore: 1
|
203 |
})}
|
204 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
205 |
-
/>
|
206 |
</DataTable>
|
207 |
)
|
208 |
}
|
|
|
174 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
175 |
/>
|
176 |
<Column
|
177 |
+
field='translation_bleu'
|
178 |
header='Translation'
|
179 |
sortable
|
180 |
+
body={scoreBodyTemplate('translation_bleu', {
|
181 |
+
minScore: 0,
|
182 |
+
maxScore: 0.5
|
183 |
})}
|
184 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
185 |
/>
|
|
|
188 |
header='Classification'
|
189 |
sortable
|
190 |
body={scoreBodyTemplate('classification_accuracy', {
|
191 |
+
minScore: 0,
|
192 |
+
maxScore: 0.5
|
193 |
})}
|
194 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
195 |
/>
|
196 |
+
{/* <Column
|
197 |
field='language_modeling_chrf'
|
198 |
header='Language Modeling'
|
199 |
sortable
|
|
|
202 |
maxScore: 1
|
203 |
})}
|
204 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
205 |
+
/> */}
|
206 |
</DataTable>
|
207 |
)
|
208 |
}
|
frontend/src/components/ModelTable.js
CHANGED
@@ -103,18 +103,29 @@ const ModelTable = ({ data }) => {
|
|
103 |
return <div style={{ textAlign: 'center' }}>{sizeStr}</div>
|
104 |
}
|
105 |
|
106 |
-
const capitalize = s =>
|
|
|
|
|
|
|
|
|
|
|
107 |
|
108 |
const providerBodyTemplate = rowData => {
|
109 |
-
const providerName = rowData.model
|
|
|
|
|
|
|
|
|
110 |
return providerName
|
111 |
}
|
112 |
|
113 |
const modelBodyTemplate = rowData => {
|
114 |
-
const modelName = rowData.model
|
115 |
-
|
116 |
-
|
117 |
-
|
|
|
|
|
118 |
}
|
119 |
|
120 |
const typeBodyTemplate = rowData => {
|
@@ -148,7 +159,12 @@ const ModelTable = ({ data }) => {
|
|
148 |
style={{ width: '800px', minHeight: '650px' }}
|
149 |
>
|
150 |
<Column field='rank' body={rankBodyTemplate} />
|
151 |
-
<Column
|
|
|
|
|
|
|
|
|
|
|
152 |
<Column
|
153 |
field='model'
|
154 |
header='Model'
|
@@ -178,16 +194,16 @@ const ModelTable = ({ data }) => {
|
|
178 |
field='average'
|
179 |
header='Average'
|
180 |
sortable
|
181 |
-
body={scoreBodyTemplate('average', { minScore: 0
|
182 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
183 |
/>
|
184 |
<Column
|
185 |
-
field='
|
186 |
header='Translation'
|
187 |
sortable
|
188 |
-
body={scoreBodyTemplate('
|
189 |
-
minScore: 0
|
190 |
-
maxScore: 0.
|
191 |
})}
|
192 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
193 |
/>
|
@@ -196,12 +212,12 @@ const ModelTable = ({ data }) => {
|
|
196 |
header='Classification'
|
197 |
sortable
|
198 |
body={scoreBodyTemplate('classification_accuracy', {
|
199 |
-
minScore: 0
|
200 |
-
maxScore: 0.
|
201 |
})}
|
202 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
203 |
/>
|
204 |
-
<Column
|
205 |
field='language_modeling_chrf'
|
206 |
header='Language Modeling'
|
207 |
sortable
|
@@ -210,7 +226,7 @@ const ModelTable = ({ data }) => {
|
|
210 |
maxScore: 1
|
211 |
})}
|
212 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
213 |
-
/>
|
214 |
</DataTable>
|
215 |
)
|
216 |
}
|
|
|
103 |
return <div style={{ textAlign: 'center' }}>{sizeStr}</div>
|
104 |
}
|
105 |
|
106 |
+
const capitalize = s =>
|
107 |
+
(String(s).charAt(0).toUpperCase() + String(s).slice(1))
|
108 |
+
.replace(/gpt/i, 'GPT')
|
109 |
+
.replace(/qwq/i, 'QwQ')
|
110 |
+
.replace(/deepseek/i, 'DeepSeek')
|
111 |
+
.replace(/openai/i, 'OpenAI')
|
112 |
|
113 |
const providerBodyTemplate = rowData => {
|
114 |
+
const providerName = rowData.model
|
115 |
+
.split('/')[0]
|
116 |
+
.split('-')
|
117 |
+
.map(capitalize)
|
118 |
+
.join(' ')
|
119 |
return providerName
|
120 |
}
|
121 |
|
122 |
const modelBodyTemplate = rowData => {
|
123 |
+
const modelName = rowData.model
|
124 |
+
.split('/')[1]
|
125 |
+
.split('-')
|
126 |
+
.map(capitalize)
|
127 |
+
.join(' ')
|
128 |
+
return <div style={{ fontWeight: 'bold', height: '100%' }}>{modelName}</div>
|
129 |
}
|
130 |
|
131 |
const typeBodyTemplate = rowData => {
|
|
|
159 |
style={{ width: '800px', minHeight: '650px' }}
|
160 |
>
|
161 |
<Column field='rank' body={rankBodyTemplate} />
|
162 |
+
<Column
|
163 |
+
field='provider'
|
164 |
+
header='Provider'
|
165 |
+
style={{ minWidth: '7rem' }}
|
166 |
+
body={providerBodyTemplate}
|
167 |
+
/>
|
168 |
<Column
|
169 |
field='model'
|
170 |
header='Model'
|
|
|
194 |
field='average'
|
195 |
header='Average'
|
196 |
sortable
|
197 |
+
body={scoreBodyTemplate('average', { minScore: 0, maxScore: 0.6 })}
|
198 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
199 |
/>
|
200 |
<Column
|
201 |
+
field='translation_bleu'
|
202 |
header='Translation'
|
203 |
sortable
|
204 |
+
body={scoreBodyTemplate('translation_bleu', {
|
205 |
+
minScore: 0,
|
206 |
+
maxScore: 0.3
|
207 |
})}
|
208 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
209 |
/>
|
|
|
212 |
header='Classification'
|
213 |
sortable
|
214 |
body={scoreBodyTemplate('classification_accuracy', {
|
215 |
+
minScore: 0,
|
216 |
+
maxScore: 0.9
|
217 |
})}
|
218 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
219 |
/>
|
220 |
+
{/* <Column
|
221 |
field='language_modeling_chrf'
|
222 |
header='Language Modeling'
|
223 |
sortable
|
|
|
226 |
maxScore: 1
|
227 |
})}
|
228 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
229 |
+
/> */}
|
230 |
</DataTable>
|
231 |
)
|
232 |
}
|
results.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|