davidpomerenke committed · verified
Commit 549360a · Parent(s): 52abc5b

Upload from GitHub Actions: Add math benchmarks

datasets.json CHANGED
@@ -249,6 +249,50 @@
     "implemented": false,
     "group": "Multitask Language Understanding"
   },
+  {
+    "name": "MGSM",
+    "author": "Google",
+    "author_url": "https://google.com",
+    "url": "https://huggingface.co/datasets/juletxara/mgsm",
+    "n_languages": 10,
+    "tasks": [
+      "math"
+    ],
+    "parallel": true,
+    "base": "MGSM",
+    "implemented": true,
+    "group": "Grade School Math"
+  },
+  {
+    "name": "AfriMGSM",
+    "author": "Masakhane",
+    "author_url": "https://www.masakhane.io",
+    "url": "https://huggingface.co/datasets/masakhane/afrimgsm",
+    "n_languages": 18,
+    "tasks": [
+      "math"
+    ],
+    "parallel": true,
+    "translation": "human",
+    "base": "MGSM",
+    "implemented": true,
+    "group": "Grade School Math"
+  },
+  {
+    "name": "GSM8K-X",
+    "author": "OpenGPT-X",
+    "author_url": "https://opengpt-x.de",
+    "url": "https://huggingface.co/datasets/openGPT-X/gsm8kx",
+    "n_languages": 20,
+    "tasks": [
+      "math"
+    ],
+    "parallel": true,
+    "translation": "machine",
+    "base": "MGSM",
+    "implemented": true,
+    "group": "Grade School Math"
+  },
   {
     "name": "FLEURS",
     "author": "Meta",
@@ -477,49 +521,6 @@
     "implemented": false,
     "group": "Adversarial Language Modelling"
   },
-  {
-    "name": "MGSM",
-    "author": "Google",
-    "author_url": "https://google.com",
-    "url": "https://huggingface.co/datasets/juletxara/mgsm",
-    "n_languages": 10,
-    "tasks": [
-      "math"
-    ],
-    "parallel": true,
-    "base": "MGSM",
-    "group": "Grade School Math"
-  },
-  {
-    "name": "AfriMGSM",
-    "author": "Masakhane",
-    "author_url": "https://www.masakhane.io",
-    "url": "https://huggingface.co/datasets/masakhane/afrimgsm",
-    "n_languages": 18,
-    "tasks": [
-      "math"
-    ],
-    "parallel": true,
-    "translation": "human",
-    "base": "MGSM",
-    "implemented": false,
-    "group": "Grade School Math"
-  },
-  {
-    "name": "GSM8K-X",
-    "author": "OpenGPT-X",
-    "author_url": "https://opengpt-x.de",
-    "url": "https://huggingface.co/datasets/openGPT-X/gsm8kx",
-    "n_languages": 20,
-    "tasks": [
-      "math"
-    ],
-    "parallel": true,
-    "translation": "machine",
-    "base": "MGSM",
-    "implemented": false,
-    "group": "Grade School Math"
-  },
   {
     "name": "WikiANN / PAN-X",
     "author": "Academic",
evals/backend.py CHANGED
@@ -25,6 +25,7 @@ task_metrics = [
     "translation_to_bleu",
     "classification_accuracy",
     "mmlu_accuracy",
+    "mgsm_accuracy",
 ]
 
 
evals/datasets_/mgsm.py ADDED
@@ -0,0 +1,45 @@
+from datasets_.util import _get_dataset_config_names, _load_dataset
+from langcodes import Language, standardize_tag
+
+slug_mgsm = "juletxara/mgsm"
+tags_mgsm = {
+    standardize_tag(a, macro=True): a for a in _get_dataset_config_names(slug_mgsm)
+}
+slug_afrimgsm = "masakhane/afrimgsm"
+tags_afrimgsm = {
+    standardize_tag(a, macro=True): a for a in _get_dataset_config_names(slug_afrimgsm)
+}
+slug_gsm8kx = "Eurolingua/gsm8kx"
+tags_gsm8kx = {
+    standardize_tag(a, macro=True): a
+    for a in _get_dataset_config_names(slug_gsm8kx, trust_remote_code=True)
+}
+
+def parse_number(i):
+    if isinstance(i, int):
+        return i
+    try:
+        return int(i.replace(",", "").replace(".", ""))
+    except ValueError:
+        return None
+
+def load_mgsm(language_bcp_47, nr):
+    if language_bcp_47 in tags_mgsm.keys():
+        ds = _load_dataset(slug_mgsm, subset=tags_mgsm[language_bcp_47], split="test")
+        return slug_mgsm, ds[nr]
+    elif language_bcp_47 in tags_afrimgsm.keys():
+        ds = _load_dataset(
+            slug_afrimgsm, subset=tags_afrimgsm[language_bcp_47], split="test"
+        )
+        return slug_afrimgsm, ds[nr]
+    elif language_bcp_47 in tags_gsm8kx.keys():
+        row = _load_dataset(
+            slug_gsm8kx,
+            subset=tags_gsm8kx[language_bcp_47],
+            split="test",
+            trust_remote_code=True,
+        )[nr]
+        row["answer_number"] = row["answer"].split("####")[1].strip()
+        return slug_gsm8kx, row
+    else:
+        return None, None
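
For orientation, a minimal usage sketch of the new helpers (not part of the commit): it assumes the three datasets above are reachable, that "de" resolves to one of the MGSM subsets, and that index 0 simply picks the first test item.

from datasets_.mgsm import load_mgsm, parse_number

# Look up the German subset across MGSM / AfriMGSM / GSM8K-X and fetch item 0.
slug, row = load_mgsm("de", 0)
if row is not None:
    print(slug)                                # which dataset served the item, e.g. "juletxara/mgsm"
    print(row["question"])                     # localized grade-school word problem
    print(parse_number(row["answer_number"]))  # reference answer normalized to an int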
evals/main.py CHANGED
@@ -16,12 +16,9 @@ n_models = 35
 
 
 async def evaluate():
-    # save up-to-date info on models and languages
-    args = dict(orient="records", indent=2, force_ascii=False)
-    pd.DataFrame(models).to_json("models.json", **args)
-    pd.DataFrame(languages).to_json("languages.json", **args)
     print("running evaluations")
     old_results = pd.read_json("results.json")
+    old_models = pd.read_json("models.json")
     # get all combinations of model, language and task
     combis = [
         (model, lang.bcp_47, task_name)
@@ -41,6 +38,7 @@ async def evaluate():
     ]
     results = await tqdm_asyncio.gather(*results, miniters=1)
     results = [r for group in results for r in group]
+    args = dict(orient="records", indent=2, force_ascii=False)
    if results:
         # aggregate results
         results = pd.DataFrame(results)
@@ -53,6 +51,11 @@
         results = pd.concat([old_results, results])
         results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
         results.to_json("results.json", **args)
+    # save up-to-date info on models and languages
+    all_models = pd.concat([old_models, pd.DataFrame(models)])
+    all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
+    all_models.to_json("models.json", **args)
+    pd.DataFrame(languages).to_json("languages.json", **args)
 
 
 if __name__ == "__main__":
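
The models.json bookkeeping change amounts to a concat-then-deduplicate on the model id. A standalone sketch with invented records (column names follow the diff; the ids are placeholders):

import pandas as pd

old_models = pd.DataFrame([{"id": "meta-llama/llama-3.1-8b"}])
models = [{"id": "meta-llama/llama-3.1-8b"}, {"id": "openai/gpt-4o-mini"}]

# As in the diff: previously saved entries win on duplicate ids, output stays sorted by id.
all_models = pd.concat([old_models, pd.DataFrame(models)])
all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
print(all_models["id"].tolist())  # ['meta-llama/llama-3.1-8b', 'openai/gpt-4o-mini']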
evals/tasks.py CHANGED
@@ -1,10 +1,12 @@
 import random
 from functools import partial
+from textwrap import dedent
 
 import evaluate
 import pandas as pd
 import sentencepiece as spm
 from datasets_.flores import flores_sentences
+from datasets_.mgsm import load_mgsm, parse_number
 from datasets_.mmlu import load_mmlu
 from languages import languages, script_name
 from models import complete, transcribe
@@ -247,6 +249,44 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
     ]
 
 
+async def mgsm_and_evaluate(model, language_bcp_47, nr):
+    system_prompt = """
+    Solve the math problem. Use reasoning, and finally give the answer as a number.
+    Response format: <reasoning> #### <number>
+    """
+    system_prompt = dedent(system_prompt).strip()
+    ds_slug, question = load_mgsm(language_bcp_47, nr)
+    if not question:
+        return []
+    response = await complete(
+        model=model,
+        messages=[
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": question["question"]},
+        ],
+        temperature=0,
+        max_tokens=1024,
+    )
+    number = response.split("####")
+    if len(number) == 2:
+        accuracy = int(
+            parse_number(number[1].strip()) == parse_number(question["answer_number"])
+        )
+    else:
+        accuracy = 0
+
+    return [
+        {
+            "model": model,
+            "bcp_47": language_bcp_47,
+            "task": "mgsm",
+            "metric": "accuracy",
+            "score": accuracy,
+            "sentence_nr": nr,
+        }
+    ]
+
+
 async def transcribe_and_evaluate(model, language_bcp_47, nr):
     language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
     fleurs = pd.read_csv(
@@ -284,5 +324,6 @@ tasks = {
     "classification": classify_and_evaluate,
     # "mlm": mlm_and_evaluate,
     "mmlu": mmlu_and_evaluate,
+    "mgsm": mgsm_and_evaluate,
     # "asr": transcribe_and_evaluate,
 }
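
To make the scoring rule in mgsm_and_evaluate concrete, here is a self-contained sketch of the "####" extraction applied to an invented model response (parse_number is copied from the diff; the response text and reference value are made up for illustration):

def parse_number(i):
    if isinstance(i, int):
        return i
    try:
        return int(i.replace(",", "").replace(".", ""))
    except ValueError:
        return None

response = "5 boxes with 4 apples each gives 5 * 4 = 20 apples. #### 20"
reference_answer = "20"  # corresponds to question["answer_number"] in the real task

parts = response.split("####")
if len(parts) == 2:
    accuracy = int(parse_number(parts[1].strip()) == parse_number(reference_answer))
else:
    accuracy = 0
print(accuracy)  # 1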
frontend/src/components/ModelTable.js CHANGED
@@ -5,7 +5,6 @@ import { MultiSelect } from 'primereact/multiselect'
 import { useState, useEffect } from 'react'
 import Medal from './Medal'
 import { Slider } from 'primereact/slider'
-import ScoreField from './ScoreField'
 import ScoreColumns from './ScoreColumns'
 const ModelTable = ({ data }) => {
   const [filters, setFilters] = useState({
frontend/src/components/ScoreColumns.js CHANGED
@@ -64,7 +64,7 @@ const ScoreColumns = [
   // />,
   <Column
     field='mmlu_accuracy'
-    header='MMLU'
+    header='Q&A'
     headerTooltip='Question Answering performance (accuracy on a sample of multilingual versions of the MMLU benchmark)'
     sortable
     body={scoreBodyTemplate('mmlu_accuracy', {
@@ -72,7 +72,18 @@ const ScoreColumns = [
       maxScore: 1
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
-  />
+  />,
+  <Column
+    field='mgsm_accuracy'
+    header='Math'
+    headerTooltip='Math Problem Solving performance (accuracy on a sample of the MGSM benchmark)'
+    sortable
+    body={scoreBodyTemplate('mgsm_accuracy', {
+      minScore: 0,
+      maxScore: 1
+    })}
+    style={{ minWidth: '5rem', maxWidth: '10rem' }}
+  />,
 ]
 
 export default ScoreColumns
results.json CHANGED
The diff for this file is too large to render. See raw diff