Upload from GitHub Actions: Add math benchmarks
- datasets.json +44 -43
- evals/backend.py +1 -0
- evals/datasets_/mgsm.py +45 -0
- evals/main.py +7 -4
- evals/tasks.py +41 -0
- frontend/src/components/ModelTable.js +0 -1
- frontend/src/components/ScoreColumns.js +13 -2
- results.json +0 -0
datasets.json
CHANGED
@@ -249,6 +249,50 @@
     "implemented": false,
     "group": "Multitask Language Understanding"
   },
+  {
+    "name": "MGSM",
+    "author": "Google",
+    "author_url": "https://google.com",
+    "url": "https://huggingface.co/datasets/juletxara/mgsm",
+    "n_languages": 10,
+    "tasks": [
+      "math"
+    ],
+    "parallel": true,
+    "base": "MGSM",
+    "implemented": true,
+    "group": "Grade School Math"
+  },
+  {
+    "name": "AfriMGSM",
+    "author": "Masakhane",
+    "author_url": "https://www.masakhane.io",
+    "url": "https://huggingface.co/datasets/masakhane/afrimgsm",
+    "n_languages": 18,
+    "tasks": [
+      "math"
+    ],
+    "parallel": true,
+    "translation": "human",
+    "base": "MGSM",
+    "implemented": true,
+    "group": "Grade School Math"
+  },
+  {
+    "name": "GSM8K-X",
+    "author": "OpenGPT-X",
+    "author_url": "https://opengpt-x.de",
+    "url": "https://huggingface.co/datasets/openGPT-X/gsm8kx",
+    "n_languages": 20,
+    "tasks": [
+      "math"
+    ],
+    "parallel": true,
+    "translation": "machine",
+    "base": "MGSM",
+    "implemented": true,
+    "group": "Grade School Math"
+  },
   {
     "name": "FLEURS",
     "author": "Meta",
@@ -477,49 +521,6 @@
     "implemented": false,
     "group": "Adversarial Language Modelling"
   },
-  {
-    "name": "MGSM",
-    "author": "Google",
-    "author_url": "https://google.com",
-    "url": "https://huggingface.co/datasets/juletxara/mgsm",
-    "n_languages": 10,
-    "tasks": [
-      "math"
-    ],
-    "parallel": true,
-    "base": "MGSM",
-    "group": "Grade School Math"
-  },
-  {
-    "name": "AfriMGSM",
-    "author": "Masakhane",
-    "author_url": "https://www.masakhane.io",
-    "url": "https://huggingface.co/datasets/masakhane/afrimgsm",
-    "n_languages": 18,
-    "tasks": [
-      "math"
-    ],
-    "parallel": true,
-    "translation": "human",
-    "base": "MGSM",
-    "implemented": false,
-    "group": "Grade School Math"
-  },
-  {
-    "name": "GSM8K-X",
-    "author": "OpenGPT-X",
-    "author_url": "https://opengpt-x.de",
-    "url": "https://huggingface.co/datasets/openGPT-X/gsm8kx",
-    "n_languages": 20,
-    "tasks": [
-      "math"
-    ],
-    "parallel": true,
-    "translation": "machine",
-    "base": "MGSM",
-    "implemented": false,
-    "group": "Grade School Math"
-  },
   {
     "name": "WikiANN / PAN-X",
     "author": "Academic",
evals/backend.py
CHANGED
@@ -25,6 +25,7 @@ task_metrics = [
     "translation_to_bleu",
     "classification_accuracy",
     "mmlu_accuracy",
+    "mgsm_accuracy",
 ]
evals/datasets_/mgsm.py
ADDED
@@ -0,0 +1,45 @@
+from datasets_.util import _get_dataset_config_names, _load_dataset
+from langcodes import Language, standardize_tag
+
+slug_mgsm = "juletxara/mgsm"
+tags_mgsm = {
+    standardize_tag(a, macro=True): a for a in _get_dataset_config_names(slug_mgsm)
+}
+slug_afrimgsm = "masakhane/afrimgsm"
+tags_afrimgsm = {
+    standardize_tag(a, macro=True): a for a in _get_dataset_config_names(slug_afrimgsm)
+}
+slug_gsm8kx = "Eurolingua/gsm8kx"
+tags_gsm8kx = {
+    standardize_tag(a, macro=True): a
+    for a in _get_dataset_config_names(slug_gsm8kx, trust_remote_code=True)
+}
+
+def parse_number(i):
+    if isinstance(i, int):
+        return i
+    try:
+        return int(i.replace(",", "").replace(".", ""))
+    except ValueError:
+        return None
+
+def load_mgsm(language_bcp_47, nr):
+    if language_bcp_47 in tags_mgsm.keys():
+        ds = _load_dataset(slug_mgsm, subset=tags_mgsm[language_bcp_47], split="test")
+        return slug_mgsm, ds[nr]
+    elif language_bcp_47 in tags_afrimgsm.keys():
+        ds = _load_dataset(
+            slug_afrimgsm, subset=tags_afrimgsm[language_bcp_47], split="test"
+        )
+        return slug_afrimgsm, ds[nr]
+    elif language_bcp_47 in tags_gsm8kx.keys():
+        row = _load_dataset(
+            slug_gsm8kx,
+            subset=tags_gsm8kx[language_bcp_47],
+            split="test",
+            trust_remote_code=True,
+        )[nr]
+        row["answer_number"] = row["answer"].split("####")[1].strip()
+        return slug_gsm8kx, row
+    else:
+        return None, None
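For orientation, a minimal usage sketch of the new module (the calls are hypothetical; it assumes the Hugging Face datasets are reachable and that "de" resolves to an MGSM config, with row fields matching the usage in evals/tasks.py below):

    from datasets_.mgsm import load_mgsm, parse_number

    # load_mgsm returns (dataset_slug, row) for a covered language,
    # checking MGSM, then AfriMGSM, then GSM8K-X; otherwise (None, None).
    slug, row = load_mgsm("de", 0)
    if row is not None:
        print(slug, row["question"], row["answer_number"])

    # parse_number strips "," and "." before converting, so localized
    # thousands separators parse to the same integer; non-numeric
    # strings yield None.
    assert parse_number("1,000") == 1000
    assert parse_number("1.000") == 1000
    assert parse_number("seven") is None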
evals/main.py
CHANGED
@@ -16,12 +16,9 @@ n_models = 35
 
 
 async def evaluate():
-    # save up-to-date info on models and languages
-    args = dict(orient="records", indent=2, force_ascii=False)
-    pd.DataFrame(models).to_json("models.json", **args)
-    pd.DataFrame(languages).to_json("languages.json", **args)
     print("running evaluations")
     old_results = pd.read_json("results.json")
+    old_models = pd.read_json("models.json")
     # get all combinations of model, language and task
     combis = [
         (model, lang.bcp_47, task_name)
@@ -41,6 +38,7 @@ async def evaluate():
     ]
     results = await tqdm_asyncio.gather(*results, miniters=1)
     results = [r for group in results for r in group]
+    args = dict(orient="records", indent=2, force_ascii=False)
     if results:
         # aggregate results
         results = pd.DataFrame(results)
@@ -53,6 +51,11 @@ async def evaluate():
         results = pd.concat([old_results, results])
         results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
         results.to_json("results.json", **args)
+        # save up-to-date info on models and languages
+        all_models = pd.concat([old_models, pd.DataFrame(models)])
+        all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
+        all_models.to_json("models.json", **args)
+        pd.DataFrame(languages).to_json("languages.json", **args)
 
 
 if __name__ == "__main__":
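A small self-contained illustration of the new models.json merge (hypothetical rows; note that drop_duplicates keeps the first occurrence, so an entry already present in models.json wins over freshly fetched metadata with the same id):

    import pandas as pd

    old_models = pd.DataFrame([{"id": "org/model-a", "cost": 1.0}])
    fresh = pd.DataFrame(
        [{"id": "org/model-a", "cost": 2.0}, {"id": "org/model-b", "cost": 3.0}]
    )
    merged = pd.concat([old_models, fresh])
    merged = merged.drop_duplicates(subset=["id"]).sort_values(by=["id"])
    print(merged)
    #            id  cost
    #   org/model-a   1.0   <- old entry kept, fresh cost 2.0 discarded
    #   org/model-b   3.0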
evals/tasks.py
CHANGED
@@ -1,10 +1,12 @@
 import random
 from functools import partial
+from textwrap import dedent
 
 import evaluate
 import pandas as pd
 import sentencepiece as spm
 from datasets_.flores import flores_sentences
+from datasets_.mgsm import load_mgsm, parse_number
 from datasets_.mmlu import load_mmlu
 from languages import languages, script_name
 from models import complete, transcribe
@@ -247,6 +249,44 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
     ]
 
 
+async def mgsm_and_evaluate(model, language_bcp_47, nr):
+    system_prompt = """
+    Solve the math problem. Use reasoning, and finally give the answer as a number.
+    Response format: <reasoning> #### <number>
+    """
+    system_prompt = dedent(system_prompt).strip()
+    ds_slug, question = load_mgsm(language_bcp_47, nr)
+    if not question:
+        return []
+    response = await complete(
+        model=model,
+        messages=[
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": question["question"]},
+        ],
+        temperature=0,
+        max_tokens=1024,
+    )
+    number = response.split("####")
+    if len(number) == 2:
+        accuracy = int(
+            parse_number(number[1].strip()) == parse_number(question["answer_number"])
+        )
+    else:
+        accuracy = 0
+
+    return [
+        {
+            "model": model,
+            "bcp_47": language_bcp_47,
+            "task": "mgsm",
+            "metric": "accuracy",
+            "score": accuracy,
+            "sentence_nr": nr,
+        }
+    ]
+
+
 async def transcribe_and_evaluate(model, language_bcp_47, nr):
     language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
     fleurs = pd.read_csv(
@@ -284,5 +324,6 @@ tasks = {
     "classification": classify_and_evaluate,
     # "mlm": mlm_and_evaluate,
     "mmlu": mmlu_and_evaluate,
+    "mgsm": mgsm_and_evaluate,
     # "asr": transcribe_and_evaluate,
 }
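To make the scoring rule concrete, a standalone sketch of the answer extraction (the score helper is hypothetical; it mirrors the split-on-"####" logic in mgsm_and_evaluate above):

    from datasets_.mgsm import parse_number

    def score(response, answer_number):
        # Exactly one "####" separator is required; otherwise score 0.
        parts = response.split("####")
        if len(parts) != 2:
            return 0
        return int(parse_number(parts[1].strip()) == parse_number(answer_number))

    print(score("3 + 4 = 7 apples in total. #### 7", "7"))  # 1
    print(score("The answer is 7.", "7"))                   # 0 (no separator)
    print(score("... #### 1,000", "1000"))                  # 1 (separator-tolerant)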
frontend/src/components/ModelTable.js
CHANGED
@@ -5,7 +5,6 @@ import { MultiSelect } from 'primereact/multiselect'
 import { useState, useEffect } from 'react'
 import Medal from './Medal'
 import { Slider } from 'primereact/slider'
-import ScoreField from './ScoreField'
 import ScoreColumns from './ScoreColumns'
 const ModelTable = ({ data }) => {
   const [filters, setFilters] = useState({
frontend/src/components/ScoreColumns.js
CHANGED
@@ -64,7 +64,7 @@ const ScoreColumns = [
   // />,
   <Column
     field='mmlu_accuracy'
-    header='
+    header='Q&A'
     headerTooltip='Question Answering performance (accuracy on a sample of multilingual versions of the MMLU benchmark)'
     sortable
     body={scoreBodyTemplate('mmlu_accuracy', {
@@ -72,7 +72,18 @@ const ScoreColumns = [
       maxScore: 1
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
-  />
+  />,
+  <Column
+    field='mgsm_accuracy'
+    header='Math'
+    headerTooltip='Math Problem Solving performance (accuracy on a sample of the MGSM benchmark)'
+    sortable
+    body={scoreBodyTemplate('mgsm_accuracy', {
+      minScore: 0,
+      maxScore: 1
+    })}
+    style={{ minWidth: '5rem', maxWidth: '10rem' }}
+  />,
 ]
 
 export default ScoreColumns
results.json
CHANGED
(diff too large to render)