import os
from typing import Dict, List

import pandas as pd

from src.envs import DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC, COL_NAME_REVISION, COL_NAME_TIMESTAMP, \
    COL_NAME_IS_ANONYMOUS, BENCHMARK_VERSION_LIST
from src.models import FullEvalResult, LeaderboardDataStore
from src.utils import get_default_cols, get_leaderboard_df

pd.options.mode.copy_on_write = True


def load_raw_eval_results(results_path: str) -> List[FullEvalResult]:
    """
    Load evaluation results from the results*.json files found under results_path
    """
    model_result_filepaths = []
    for root, dirs, files in os.walk(results_path):
        if len(files) == 0:
            continue

        # gather every results*.json file; results sharing a timestamp are deduplicated below
        for file in files:
            if not (file.startswith("results") and file.endswith(".json")):
                print(f'skip {file}')
                continue
            model_result_filepaths.append(os.path.join(root, file))

    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        # parse each file into a FullEvalResult; skip files that cannot be decoded
        try:
            eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
        except UnicodeDecodeError:
            print(f"loading file failed: {model_result_filepath}")
            continue
        print(f'file loaded: {model_result_filepath}')
        timestamp = eval_result.timestamp
        eval_results[timestamp] = eval_result

    results = []
    for k, v in eval_results.items():
        try:
            # to_dict() doubles as a validation pass; malformed results raise KeyError
            v.to_dict()
            results.append(v)
        except KeyError:
            print(f"loading failed: {k}")
            continue
    return results
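
# Expected on-disk layout (hypothetical example -- only the "results*.json"
# file-name pattern matters to the walker above, not the directory nesting):
#
#     <results_path>/
#         some_model/some_reranker/results_2024-05-01T12-00-00.json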


def get_safe_name(name: str) -> str:
    """Return a lowercased name containing only alphanumerics and underscores."""
    name = name.replace('-', '_')
    return ''.join(
        character.lower()
        for character in name
        if (character.isalnum() or character == '_'))
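
# Illustrative only (assumes a version label such as "AIR-Bench_24.04"):
#     get_safe_name("AIR-Bench_24.04")  # -> "air_bench_2404"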


def load_leaderboard_datastore(file_path: str, version: str) -> LeaderboardDataStore:
    """Build the datastore for one benchmark version: raw results plus the QA and long-doc leaderboard tables."""
    # use the last four characters of the sanitized version name as a short slug
    slug = get_safe_name(version)[-4:]
    lb_data_store = LeaderboardDataStore(version, slug, None, None, None, None, None, None, None, None)
    lb_data_store.raw_data = load_raw_eval_results(file_path)
    print(f'raw data: {len(lb_data_store.raw_data)}')

    lb_data_store.raw_df_qa = get_leaderboard_df(
        lb_data_store, task='qa', metric=DEFAULT_METRIC_QA)
    print(f'QA data loaded: {lb_data_store.raw_df_qa.shape}')
    lb_data_store.leaderboard_df_qa = lb_data_store.raw_df_qa.copy()
    shown_columns_qa, types_qa = get_default_cols('qa', lb_data_store.slug, add_fix_cols=True)
    lb_data_store.types_qa = types_qa
    lb_data_store.leaderboard_df_qa = \
        lb_data_store.leaderboard_df_qa[~lb_data_store.leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
    lb_data_store.leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)

    lb_data_store.raw_df_long_doc = get_leaderboard_df(
        lb_data_store, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
    print(f'Long-Doc data loaded: {lb_data_store.raw_df_long_doc.shape}')
    lb_data_store.leaderboard_df_long_doc = lb_data_store.raw_df_long_doc.copy()
    shown_columns_long_doc, types_long_doc = get_default_cols(
        'long-doc', lb_data_store.slug, add_fix_cols=True)
    lb_data_store.types_long_doc = types_long_doc
    lb_data_store.leaderboard_df_long_doc = \
        lb_data_store.leaderboard_df_long_doc[
            ~lb_data_store.leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][shown_columns_long_doc]
    lb_data_store.leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)

    lb_data_store.reranking_models = sorted(
        {eval_result.reranking_model for eval_result in lb_data_store.raw_data})
    return lb_data_store


def load_eval_results(file_path: str) -> Dict[str, LeaderboardDataStore]:
    output = {}
    for version in BENCHMARK_VERSION_LIST:
        fn = os.path.join(file_path, version)
        output[version] = load_leaderboard_datastore(fn, version)
    return output
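

# Minimal usage sketch (assumptions: results have already been synced to a
# local "./eval_results" directory; the real Space resolves its paths via src.envs).
if __name__ == "__main__":
    datastores = load_eval_results("./eval_results")
    for version, store in datastores.items():
        print(version, store.leaderboard_df_qa.shape)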