File size: 2,183 Bytes
da6e1bc
 
4d13673
da6e1bc
8274634
da6e1bc
b311dd5
da6e1bc
 
 
8274634
52abc5b
 
da6e1bc
 
 
3ed02d5
da6e1bc
 
2f9dee1
549360a
b311dd5
 
 
2f9dee1
ce2acb0
 
b311dd5
 
 
f840423
 
b311dd5
 
 
 
 
da6e1bc
2f9dee1
d91b022
549360a
b311dd5
 
 
 
 
 
 
 
 
f840423
b311dd5
 
549360a
 
 
 
 
da6e1bc
 
 
2f9dee1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import asyncio

import pandas as pd
from languages import languages
from models import models
from tasks import tasks
from tqdm.asyncio import tqdm_asyncio

# ===== config =====

# Number of sentence indices evaluated per (model, language, task) combination
# (each i in range(n_sentences) becomes one task coroutine).
n_sentences = 10
# Evaluate only the first N rows of `languages`.
n_languages = 20
# Evaluate only the first N ids from `models["id"]`.
n_models = 35

# ===== run evaluation and aggregate results =====


async def evaluate():
    """Run all not-yet-evaluated (model, language, task) combinations and persist results.

    Reads previous results from ``results.json`` and model metadata from
    ``models.json``, determines which combinations have no recorded metric yet,
    runs those evaluations concurrently, aggregates the new scores, and writes
    the merged data back to ``results.json``, ``models.json`` and
    ``languages.json``.

    Returns:
        pd.DataFrame: the merged results (old plus newly aggregated scores);
        equal to the previously saved results when nothing new was evaluated.
    """
    print("running evaluations")
    old_results = pd.read_json("results.json")
    old_models = pd.read_json("models.json")
    # get all combinations of model, language and task
    combis = [
        (model, lang.bcp_47, task_name)
        for task_name, task in tasks.items()
        for lang in languages.iloc[:n_languages].itertuples()
        for model in models["id"].iloc[:n_models]
    ]
    # filter out combinations that have already been evaluated: a left merge
    # against old results leaves "metric" NaN exactly for the unseen ones
    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
    combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
    combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
    # one coroutine per (combination, sentence index); run them all concurrently
    coros = [
        tasks[task_name](model, bcp_47, i)
        for i in range(n_sentences)
        for model, bcp_47, task_name in combis.itertuples(index=False)
    ]
    groups = await tqdm_asyncio.gather(*coros, miniters=1)
    # each task coroutine yields a list of result records; flatten them
    records = [r for group in groups for r in group]
    args = dict(orient="records", indent=2, force_ascii=False)
    merged = old_results
    if records:
        # average scores over sentence indices per (model, language, task, metric)
        new_results = pd.DataFrame(records)
        new_results = (
            new_results.groupby(["model", "bcp_47", "task", "metric"])
            .agg({"score": "mean"})
            .reset_index()
        )
        # append to previous results and save
        merged = pd.concat([old_results, new_results])
        merged = merged.sort_values(by=["model", "bcp_47", "task", "metric"])
        merged.to_json("results.json", **args)
    # save up-to-date info on models and languages
    all_models = pd.concat([old_models, pd.DataFrame(models)])
    all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
    all_models.to_json("models.json", **args)
    pd.DataFrame(languages).to_json("languages.json", **args)
    # return the merged results so callers (e.g. asyncio.run(evaluate()))
    # receive a useful value instead of None
    return merged


if __name__ == "__main__":
    # Entry point: run the full evaluation pipeline. The original bound the
    # return value to an unused module-level name; that dead binding is dropped.
    asyncio.run(evaluate())