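"""Evaluation runner: build all (model, language, task) combinations,
evaluate the ones not yet present in results.json, and aggregate the
scores back into results.json."""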
import asyncio

import pandas as pd
from languages import languages
from models import models
from tasks import tasks
from tqdm.asyncio import tqdm_asyncio

# ===== config =====

n_sentences = 10
n_languages = 20
n_models = 30

# ===== run evaluation and aggregate results =====


async def evaluate():
    # save up-to-date info on models and languages
    args = dict(orient="records", indent=2, force_ascii=False)
    pd.DataFrame(models).to_json("models.json", **args)
    pd.DataFrame(languages).to_json("languages.json", **args)
    print("running evaluations")
    old_results = pd.read_json("results.json")
    # get all combinations of model, language and task
    combis = [
        (model, lang.bcp_47, task_name)
        for task_name, task in tasks.items()
        for lang in languages.iloc[:n_languages].itertuples()
        for model in models["id"].iloc[:n_models]
    ]
    # filter out combinations that have already been evaluated
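    # (a left join against the old results leaves "metric" empty for rows not yet evaluated)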
    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
    combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
    combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
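    # models that still have pending evaluations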
    print(combis["model"].unique())
    # run evaluations
    results = [
        tasks[task_name](model, bcp_47, i)
        for i in range(n_sentences)
        for model, bcp_47, task_name in combis.itertuples(index=False)
    ]
    results = await tqdm_asyncio.gather(*results, miniters=1)
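    # each task call returns a list of per-metric score records; flatten them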
    results = [r for group in results for r in group]
    if results:
        # aggregate results
        results = pd.DataFrame(results)
        results = (
            results.groupby(["model", "bcp_47", "task", "metric"])
            .agg({"score": "mean"})
            .reset_index()
        )
        # save results
        results = pd.concat([old_results, results])
        results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
        results.to_json("results.json", **args)


if __name__ == "__main__":
    asyncio.run(evaluate())