Spaces:
AIR-Bench
/
Running on CPU Upgrade

File size: 3,721 Bytes
729aa2a
 
1f17567
 
 
 
 
 
 
 
 
 
 
 
 
729aa2a
 
 
 
 
 
 
 
 
 
 
 
 
 
1f17567
729aa2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1f17567
729aa2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1f17567
729aa2a
 
1f17567
 
729aa2a
1f17567
729aa2a
 
 
 
 
 
 
 
 
 
 
 
 
 
1f17567
729aa2a
 
1f17567
 
729aa2a
 
1f17567
 
729aa2a
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import pytest

from src.benchmarks import LongDocBenchmarks, QABenchmarks
from src.columns import (
    COL_NAME_AVG,
    COL_NAME_RANK,
    COL_NAME_RERANKING_MODEL,
    COL_NAME_RETRIEVAL_MODEL,
    COL_NAME_REVISION,
    COL_NAME_TIMESTAMP,
    get_default_auto_eval_column_dict,
    get_default_col_names_and_types,
    get_fixed_col_names_and_types,
    make_autoevalcolumn,
)

# Ref: https://github.com/AIR-Bench/AIR-Bench/blob/4b27b8a8f2047a963805fcf6fb9d74be51ec440c/docs/available_tasks.md
# 24.05
# | Task | dev | test |
# | ---- | --- | ---- |
# | Long-Doc | 4 | 11 |
# | QA | 54 | 53 |
#
# 24.04
# | Task | test |
# | ---- | ---- |
# | Long-Doc | 15 |
# | QA | 13 |


@pytest.fixture()
def expected_col_names():
    """All default column names, in spec order: six visible, then three hidden."""
    visible = [
        "rank",
        "retrieval_model",
        "reranking_model",
        "revision",
        "timestamp",
        "average",
    ]
    hidden = [
        "retrieval_model_link",
        "reranking_model_link",
        "is_anonymous",
    ]
    return visible + hidden


@pytest.fixture()
def expected_hidden_col_names():
    """The trailing default columns that are flagged hidden in the column spec."""
    return [
        "retrieval_model_link",
        "reranking_model_link",
        "is_anonymous",
    ]


def test_get_default_auto_eval_column_dict(expected_col_names, expected_hidden_col_names):
    """Default column spec lists all nine columns in order and flags the hidden ones."""
    col_list = get_default_auto_eval_column_dict()
    assert len(col_list) == 9
    # Each entry is a (name, <type>, content) triple; compare names positionally.
    names = [entry[0] for entry in col_list]
    assert names == expected_col_names
    # Columns whose content object is marked hidden must match the expected tail.
    hidden = [entry[0] for entry in col_list if entry[2].hidden]
    assert hidden == expected_hidden_col_names


def test_get_fixed_col_names_and_types():
    """Fixed (always-shown) columns come back as six parallel name/type lists."""
    col_names, col_types = get_fixed_col_names_and_types()
    expected_pairs = [
        (COL_NAME_RANK, "number"),
        (COL_NAME_RETRIEVAL_MODEL, "markdown"),
        (COL_NAME_RERANKING_MODEL, "markdown"),
        (COL_NAME_REVISION, "markdown"),
        (COL_NAME_TIMESTAMP, "date"),
        (COL_NAME_AVG, "number"),
    ]
    assert len(col_names) == 6
    assert len(col_types) == 6
    # Zip the two parallel lists back into pairs and compare in one shot.
    assert list(zip(col_names, col_types)) == expected_pairs


@pytest.mark.parametrize(
    "benchmarks, expected_benchmark_len",
    [
        (QABenchmarks, {"air_bench_2404": 13, "air_bench_2405": 53}),
        (LongDocBenchmarks, {"air_bench_2404": 15, "air_bench_2405": 11}),
    ],
)
def test_make_autoevalcolumn(benchmarks, expected_benchmark_len, expected_col_names):
    """Generated eval-column classes carry the default attrs plus one attr per benchmark task."""
    default_attrs = frozenset(expected_col_names)
    for benchmark in benchmarks:
        generated_cls = make_autoevalcolumn("TestEvalColumn", benchmark)
        # Collect every non-dunder attribute defined on the generated class.
        attrs = frozenset(name for name in generated_cls.__dict__ if not name.startswith("__"))
        assert default_attrs <= attrs
        # Whatever is left over must be exactly the benchmark-specific columns.
        extra_attrs = attrs - default_attrs
        assert len(extra_attrs) == expected_benchmark_len[benchmark.name]


@pytest.mark.parametrize(
    "benchmarks, expected_benchmark_len",
    [
        (QABenchmarks, {"air_bench_2404": 13, "air_bench_2405": 53}),
        (LongDocBenchmarks, {"air_bench_2404": 15, "air_bench_2405": 11}),
    ],
)
def test_get_default_col_names_and_types(
    benchmarks, expected_benchmark_len, expected_col_names, expected_hidden_col_names
):
    """Default columns for a benchmark = visible defaults + one column per benchmark task.

    Hidden defaults are excluded from the returned lists, hence the subtraction.
    """
    visible_default_len = len(expected_col_names) - len(expected_hidden_col_names)
    for benchmark in benchmarks:
        col_names, col_types = get_default_col_names_and_types(benchmark)
        expected_len = expected_benchmark_len[benchmark.name] + visible_default_len
        assert len(col_names) == expected_len
        # Fix: col_types was unpacked but never checked; the lists are parallel
        # (cf. test_get_fixed_col_names_and_types), so their lengths must agree.
        assert len(col_types) == expected_len