import pytest
from src.benchmarks import LongDocBenchmarks, QABenchmarks
from src.columns import (
COL_NAME_AVG,
COL_NAME_RANK,
COL_NAME_RERANKING_MODEL,
COL_NAME_RETRIEVAL_MODEL,
COL_NAME_REVISION,
COL_NAME_TIMESTAMP,
get_default_auto_eval_column_dict,
get_default_col_names_and_types,
get_fixed_col_names_and_types,
make_autoevalcolumn,
)

# Ref: https://github.com/AIR-Bench/AIR-Bench/blob/4b27b8a8f2047a963805fcf6fb9d74be51ec440c/docs/available_tasks.md
# 24.05
# | Task | dev | test |
# | ---- | --- | ---- |
# | Long-Doc | 4 | 11 |
# | QA | 54 | 53 |
#
# 24.04
# | Task | test |
# | ---- | ---- |
# | Long-Doc | 15 |
# | QA | 13 |
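#
# The parametrized tests below iterate over QABenchmarks / LongDocBenchmarks
# and look up the expected task counts by `benchmark.name`. A minimal sketch
# of the shape this assumes (hypothetical, for illustration only; the real
# definitions live in src.benchmarks):
#
#     from enum import Enum
#
#     class QABenchmarks(Enum):
#         air_bench_2404 = ...  # 13 QA test tasks
#         air_bench_2405 = ...  # 53 QA test tasks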


@pytest.fixture()
def expected_col_names():
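    """Default leaderboard column names, in the order emitted by
    get_default_auto_eval_column_dict()."""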
return [
"rank",
"retrieval_model",
"reranking_model",
"revision",
"timestamp",
"average",
"retrieval_model_link",
"reranking_model_link",
"is_anonymous",
]


@pytest.fixture()
def expected_hidden_col_names():
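    """The subset of default columns whose definitions are flagged as hidden."""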
return [
"retrieval_model_link",
"reranking_model_link",
"is_anonymous",
]


def test_get_default_auto_eval_column_dict(expected_col_names, expected_hidden_col_names):
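    """Each of the nine entries is a (name, _, definition) tuple; the names and
    hidden flags must match the fixtures above."""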
col_list = get_default_auto_eval_column_dict()
assert len(col_list) == 9
hidden_cols = []
for col_tuple, expected_col in zip(col_list, expected_col_names):
col, _, col_content = col_tuple
assert col == expected_col
if col_content.hidden:
hidden_cols.append(col)
assert hidden_cols == expected_hidden_col_names


def test_get_fixed_col_names_and_types():
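    """The six fixed columns are returned as parallel name/type lists in a
    stable order."""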
col_names, col_types = get_fixed_col_names_and_types()
assert len(col_names) == 6
assert len(col_types) == 6
expected_col_and_type = [
(COL_NAME_RANK, "number"),
(COL_NAME_RETRIEVAL_MODEL, "markdown"),
(COL_NAME_RERANKING_MODEL, "markdown"),
(COL_NAME_REVISION, "markdown"),
(COL_NAME_TIMESTAMP, "date"),
(COL_NAME_AVG, "number"),
]
for col_name, col_type, (c_name, c_type) in zip(col_names, col_types, expected_col_and_type):
assert col_name == c_name
assert col_type == c_type


@pytest.mark.parametrize(
"benchmarks, expected_benchmark_len",
[
(QABenchmarks, {"air_bench_2404": 13, "air_bench_2405": 53}),
(LongDocBenchmarks, {"air_bench_2404": 15, "air_bench_2405": 11}),
],
)
def test_make_autoevalcolumn(benchmarks, expected_benchmark_len, expected_col_names):
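    """make_autoevalcolumn() should expose every default column as a class
    attribute, plus one attribute per benchmark task in the given version."""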
expected_default_attrs = frozenset(expected_col_names)
for benchmark in benchmarks:
TestEvalColumn = make_autoevalcolumn("TestEvalColumn", benchmark)
attrs = []
for k, v in TestEvalColumn.__dict__.items():
if not k.startswith("__"):
attrs.append(k)
attrs = frozenset(attrs)
assert expected_default_attrs.issubset(attrs)
benchmark_attrs = attrs.difference(expected_default_attrs)
assert len(benchmark_attrs) == expected_benchmark_len[benchmark.name]


@pytest.mark.parametrize(
"benchmarks, expected_benchmark_len",
[
(QABenchmarks, {"air_bench_2404": 13, "air_bench_2405": 53}),
(LongDocBenchmarks, {"air_bench_2404": 15, "air_bench_2405": 11}),
],
)
def test_get_default_col_names_and_types(
benchmarks, expected_benchmark_len, expected_col_names, expected_hidden_col_names
):
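    """The visible column count is the per-version benchmark task count plus
    the default columns, minus the hidden ones."""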
default_col_len = len(expected_col_names)
hidden_col_len = len(expected_hidden_col_names)
for benchmark in benchmarks:
col_names, col_types = get_default_col_names_and_types(benchmark)
assert len(col_names) == expected_benchmark_len[benchmark.name] + default_col_len - hidden_col_len
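
# This module can be run on its own with, e.g., `pytest -q` from the repo
# root (assuming pytest is installed and the src package is importable).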