from pathlib import Path
import pytest
from src.models import EvalResult, FullEvalResult

cur_fp = Path(__file__)

# Ref: https://github.com/AIR-Bench/AIR-Bench/blob/4b27b8a8f2047a963805fcf6fb9d74be51ec440c/docs/available_tasks.md
# 24.05
# | Task | dev | test |
# | ---- | --- | ---- |
# | Long-Doc | 4 | 11 |
# | QA | 54 | 53 |
#
# 24.04
# | Task | test |
# | ---- | ---- |
# | Long-Doc | 15 |
# | QA | 13 |
NUM_QA_BENCHMARKS_24_05 = 53
NUM_DOC_BENCHMARKS_24_05 = 11
NUM_QA_BENCHMARKS_24_04 = 13
NUM_DOC_BENCHMARKS_24_04 = 15


def test_eval_result():
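    # A minimal, well-formed submission record should be accepted without raising.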
EvalResult(
eval_name="eval_name",
retrieval_model="bge-m3",
reranking_model="NoReranking",
results=[{"domain": "law", "lang": "en", "dataset": "lex_files_500K-600K", "value": 0.45723}],
task="qa",
metric="ndcg_at_3",
timestamp="2024-05-14T03:09:08Z",
revision="1e243f14bd295ccdea7a118fe847399d",
is_anonymous=True,
)


@pytest.mark.parametrize(
"file_path",
[
"AIR-Bench_24.04/bge-m3/jina-reranker-v2-base-multilingual/results.json",
"AIR-Bench_24.05/bge-m3/NoReranker/results.json",
],
)
def test_full_eval_result_init_from_json_file(file_path):
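    # Fixture files are resolved relative to this test module.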
json_fp = cur_fp.parents[1] / "toydata/eval_results/" / file_path
full_eval_result = FullEvalResult.init_from_json_file(json_fp)
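    # The directory layout encodes the models: .../<retrieval_model>/<reranking_model>/results.json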
assert json_fp.parents[0].stem == full_eval_result.reranking_model
assert json_fp.parents[1].stem == full_eval_result.retrieval_model
assert len(full_eval_result.results) == 70


@pytest.mark.parametrize(
"file_path, task, expected_num_results",
[
("AIR-Bench_24.04/bge-m3/jina-reranker-v2-base-multilingual/results.json", "qa", NUM_QA_BENCHMARKS_24_04),
(
"AIR-Bench_24.04/bge-m3/jina-reranker-v2-base-multilingual/results.json",
"long-doc",
NUM_DOC_BENCHMARKS_24_04,
),
("AIR-Bench_24.05/bge-m3/NoReranker/results.json", "qa", NUM_QA_BENCHMARKS_24_05),
("AIR-Bench_24.05/bge-m3/NoReranker/results.json", "long-doc", NUM_DOC_BENCHMARKS_24_05),
],
)
def test_full_eval_result_to_dict(file_path, task, expected_num_results):
json_fp = cur_fp.parents[1] / "toydata/eval_results/" / file_path
full_eval_result = FullEvalResult.init_from_json_file(json_fp)
result_dict_list = full_eval_result.to_dict(task)
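    # to_dict should yield exactly one leaderboard row for the requested task.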
assert len(result_dict_list) == 1
result = result_dict_list[0]
attr_list = frozenset(
[
"eval_name",
"Retrieval Method",
"Reranking Model",
"Retrieval Model LINK",
"Reranking Model LINK",
"Revision",
"Submission Date",
"Anonymous Submission",
]
)
result_cols = list(result.keys())
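    # Each row should contain one column per benchmark plus the fixed metadata columns.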
assert len(result_cols) == (expected_num_results + len(attr_list))