import asyncio
import pandas as pd
from languages import languages
from models import models
from tasks import tasks
from tqdm.asyncio import tqdm_asyncio

# ===== config =====

n_sentences = 10
n_languages = 20
n_models = 35
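# note: these caps bound the evaluation grid -- n_sentences sentence indices per
# model/language/task combination, the first n_languages rows of `languages`,
# and the first n_models model ids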
# ===== run evaluation and aggregate results =====


async def evaluate():
    print("running evaluations")
    old_results = pd.read_json("results.json")
    old_models = pd.read_json("models.json")
    # get all combinations of model, language and task
    combis = [
        (model, lang.bcp_47, task_name)
        for task_name, task in tasks.items()
        for lang in languages.iloc[:n_languages].itertuples()
        for model in models["id"].iloc[:n_models]
    ]
    # filter out combinations that have already been evaluated
    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
    combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
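    # after the left merge, rows with no matching entry in old_results carry a
    # NaN metric, so keeping only those rows leaves the not-yet-evaluated combinations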
    combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
    # run evaluations
    results = [
        tasks[task_name](model, bcp_47, i)
        for i in range(n_sentences)
        for model, bcp_47, task_name in combis.itertuples(index=False)
    ]
    results = await tqdm_asyncio.gather(*results, miniters=1)
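    # each task coroutine returns a group of result records (e.g. one per metric),
    # so flatten the gathered groups into a single list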
    results = [r for group in results for r in group]
    args = dict(orient="records", indent=2, force_ascii=False)
    if results:
        # aggregate results
        results = pd.DataFrame(results)
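        # average the per-sentence scores for each model/language/task/metric cell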
        results = (
            results.groupby(["model", "bcp_47", "task", "metric"])
            .agg({"score": "mean"})
            .reset_index()
        )
        # save results
        results = pd.concat([old_results, results])
        results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
        results.to_json("results.json", **args)
    # save up-to-date info on models and languages
    all_models = pd.concat([old_models, pd.DataFrame(models)])
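    # drop_duplicates keeps the first occurrence per id, so entries already in
    # models.json take precedence over the current rows from `models`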
    all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
    all_models.to_json("models.json", **args)
    pd.DataFrame(languages).to_json("languages.json", **args)


if __name__ == "__main__":
    asyncio.run(evaluate())