# Tests for the leaderboard column helpers in src.columns.
import pytest

from src.benchmarks import LongDocBenchmarks, QABenchmarks
from src.columns import (
    COL_NAME_AVG,
    COL_NAME_RANK,
    COL_NAME_RERANKING_MODEL,
    COL_NAME_RETRIEVAL_MODEL,
    COL_NAME_REVISION,
    COL_NAME_TIMESTAMP,
    get_default_auto_eval_column_dict,
    get_default_col_names_and_types,
    get_fixed_col_names_and_types,
    make_autoevalcolumn,
)
# Ref: https://github.com/AIR-Bench/AIR-Bench/blob/4b27b8a8f2047a963805fcf6fb9d74be51ec440c/docs/available_tasks.md
# 24.05
# | Task | dev | test |
# | ---- | --- | ---- |
# | Long-Doc | 4 | 11 |
# | QA | 54 | 53 |
#
# 24.04
# | Task | test |
# | ---- | ---- |
# | Long-Doc | 15 |
# | QA | 13 |
@pytest.fixture
def expected_col_names():
    """Ordered names of the nine default leaderboard columns.

    The tests below request this by parameter name, so it must be a pytest
    fixture — the decorator was missing, which makes collection fail with
    "fixture 'expected_col_names' not found".
    """
    return [
        "rank",
        "retrieval_model",
        "reranking_model",
        "revision",
        "timestamp",
        "average",
        "retrieval_model_link",
        "reranking_model_link",
        "is_anonymous",
    ]
@pytest.fixture
def expected_hidden_col_names():
    """Names of the default columns that are hidden from the displayed table.

    Must be a pytest fixture (it is injected by parameter name into the
    tests below); the decorator was missing.
    """
    return [
        "retrieval_model_link",
        "reranking_model_link",
        "is_anonymous",
    ]
def test_get_default_auto_eval_column_dict(expected_col_names, expected_hidden_col_names):
    """The default column dict yields the nine columns in order, flagging the hidden ones."""
    columns = get_default_auto_eval_column_dict()
    assert len(columns) == 9
    # Each entry is a (name, _, metadata) triple; compare names pairwise
    # against the expected ordering and collect the hidden subset.
    pairs = list(zip(columns, expected_col_names))
    for (name, _, _), expected_name in pairs:
        assert name == expected_name
    hidden = [name for (name, _, meta), _ in pairs if meta.hidden]
    assert hidden == expected_hidden_col_names
def test_get_fixed_col_names_and_types():
    """The six fixed (always-shown) columns carry the expected names and gradio types."""
    names, types = get_fixed_col_names_and_types()
    assert len(names) == 6
    assert len(types) == 6
    expected = [
        (COL_NAME_RANK, "number"),
        (COL_NAME_RETRIEVAL_MODEL, "markdown"),
        (COL_NAME_RERANKING_MODEL, "markdown"),
        (COL_NAME_REVISION, "markdown"),
        (COL_NAME_TIMESTAMP, "date"),
        (COL_NAME_AVG, "number"),
    ]
    # Lengths are pinned above, so pairwise comparison == whole-list equality.
    assert list(zip(names, types)) == expected
def test_make_autoevalcolumn(benchmarks, expected_benchmark_len, expected_col_names):
    """Generated eval-column classes expose the default attrs plus one per benchmark task."""
    default_attrs = frozenset(expected_col_names)
    for benchmark in benchmarks:
        generated_cls = make_autoevalcolumn("TestEvalColumn", benchmark)
        # Class-level attributes, excluding dunders such as __module__.
        attrs = frozenset(k for k in vars(generated_cls) if not k.startswith("__"))
        assert default_attrs <= attrs
        extra_attrs = attrs - default_attrs
        assert len(extra_attrs) == expected_benchmark_len[benchmark.name]
def test_get_default_col_names_and_types(
    benchmarks, expected_benchmark_len, expected_col_names, expected_hidden_col_names
):
    """Per-benchmark column lists = benchmark columns + visible (non-hidden) defaults."""
    visible_default_count = len(expected_col_names) - len(expected_hidden_col_names)
    for benchmark in benchmarks:
        names, _types = get_default_col_names_and_types(benchmark)
        assert len(names) == expected_benchmark_len[benchmark.name] + visible_default_count