import asyncio
import pandas as pd
from languages import languages
from models import models
from tasks import tasks
from tqdm.asyncio import tqdm_asyncio
# ===== config =====
# number of sentence indices (0..n-1) each (model, language, task) combination
# is evaluated on — passed as the last argument to the task coroutine
n_sentences = 10
# evaluate only the first n languages from the languages table
n_languages = 20
# evaluate only the first n model ids from the models table
n_models = 30
# ===== run evaluation and aggregate results =====
async def evaluate():
    """Run all not-yet-evaluated (model, language, task) combinations,
    aggregate the per-sentence scores, and persist them to results.json.

    Also refreshes models.json and languages.json as a side effect.

    Returns:
        The merged results DataFrame (previous + newly aggregated scores),
        or the previous results unchanged when nothing new was evaluated.
    """
    # save up-to-date info on models and languages
    args = dict(orient="records", indent=2, force_ascii=False)
    pd.DataFrame(models).to_json("models.json", **args)
    pd.DataFrame(languages).to_json("languages.json", **args)
    print("running evaluations")
    # load previous results; on a first run (file missing) or an unreadable
    # file, start from an empty frame with the expected columns so the
    # merge below still works instead of crashing
    try:
        old_results = pd.read_json("results.json")
    except (FileNotFoundError, ValueError):
        old_results = pd.DataFrame(
            columns=["model", "bcp_47", "task", "metric", "score"]
        )
    # get all combinations of model, language and task
    combis = [
        (model, lang.bcp_47, task_name)
        for task_name, task in tasks.items()
        for lang in languages.iloc[:n_languages].itertuples()
        for model in models["id"].iloc[:n_models]
    ]
    # filter out combinations that have already been evaluated: a left
    # merge leaves "metric" NaN exactly for the not-yet-scored rows
    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
    combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
    combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
    print(combis["model"].unique())
    # run evaluations concurrently; each task coroutine yields a list of
    # per-sentence result records, flattened below
    coros = [
        tasks[task_name](model, bcp_47, i)
        for i in range(n_sentences)
        for model, bcp_47, task_name in combis.itertuples(index=False)
    ]
    results = await tqdm_asyncio.gather(*coros, miniters=1)
    results = [r for group in results for r in group]
    if not results:
        # nothing new was evaluated; leave results.json untouched
        return old_results
    # aggregate: mean score per (model, language, task, metric)
    results = pd.DataFrame(results)
    results = (
        results.groupby(["model", "bcp_47", "task", "metric"])
        .agg({"score": "mean"})
        .reset_index()
    )
    # merge with previous results and persist
    results = pd.concat([old_results, results])
    results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
    results.to_json("results.json", **args)
    return results
if __name__ == "__main__":
    # the module-level binding of the return value was never read; just run
    asyncio.run(evaluate())