leaderboard/src/loaders.py
import os.path
from typing import List

import pandas as pd

from src.envs import DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC, COL_NAME_REVISION, COL_NAME_TIMESTAMP, \
    COL_NAME_IS_ANONYMOUS, BENCHMARK_VERSION_LIST
from src.models import FullEvalResult, LeaderboardDataStore
from src.utils import get_default_cols, get_leaderboard_df

pd.options.mode.copy_on_write = True


def load_raw_eval_results(results_path: str) -> List[FullEvalResult]:
    """
    Load the evaluation results from the JSON files under `results_path`
    """
    model_result_filepaths = []
    for root, dirs, files in os.walk(results_path):
        if len(files) == 0:
            continue
        # collect every results*.json file found under results_path
        for file in files:
            if not (file.startswith("results") and file.endswith(".json")):
                print(f'skip {file}')
                continue
            model_result_filepaths.append(os.path.join(root, file))

    # parse each file; results are keyed by timestamp, so files sharing a
    # timestamp are deduplicated and the last one parsed wins
    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        try:
            eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
        except UnicodeDecodeError:
            print(f"loading file failed: {model_result_filepath}")
            continue
        print(f'file loaded: {model_result_filepath}')
        eval_results[eval_result.timestamp] = eval_result

    # keep only the results that can be converted to a dict without errors
    results = []
    for k, v in eval_results.items():
        try:
            v.to_dict()
            results.append(v)
        except KeyError:
            print(f"loading failed: {k}")
            continue
    return results
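

# Usage sketch (the path below is illustrative, not part of the original module):
#   raw_results = load_raw_eval_results("eval-results/AIR-Bench_24.04")
# Each entry is a FullEvalResult that later feeds get_leaderboard_df().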


def get_safe_name(name: str) -> str:
    """Get RFC 1123 compatible safe name"""
    name = name.replace('-', '_')
    return ''.join(
        character.lower()
        for character in name
        if (character.isalnum() or character == '_'))
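

# Example (sketch): assuming a version string such as "AIR-Bench_24.04",
# get_safe_name("AIR-Bench_24.04") returns "air_bench_2404"; the datastore
# below keeps only its last four characters ("2404") as the version slug.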


def load_leaderboard_datastore(file_path, version) -> LeaderboardDataStore:
    slug = get_safe_name(version)[-4:]
    lb_data_store = LeaderboardDataStore(version, slug, None, None, None, None, None, None, None, None)
    lb_data_store.raw_data = load_raw_eval_results(file_path)
    print(f'raw data: {len(lb_data_store.raw_data)}')

    # QA leaderboard: build the raw frame, keep the default columns, and drop
    # anonymous submissions along with the revision/timestamp columns
    lb_data_store.raw_df_qa = get_leaderboard_df(
        lb_data_store, task='qa', metric=DEFAULT_METRIC_QA)
    print(f'QA data loaded: {lb_data_store.raw_df_qa.shape}')
    lb_data_store.leaderboard_df_qa = lb_data_store.raw_df_qa.copy()
    shown_columns_qa, types_qa = get_default_cols('qa', lb_data_store.slug, add_fix_cols=True)
    lb_data_store.types_qa = types_qa
    lb_data_store.leaderboard_df_qa = \
        lb_data_store.leaderboard_df_qa[~lb_data_store.leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
    lb_data_store.leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)

    # Long-Doc leaderboard: same steps as above for the long-doc task
    lb_data_store.raw_df_long_doc = get_leaderboard_df(
        lb_data_store, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
    print(f'Long-Doc data loaded: {len(lb_data_store.raw_df_long_doc)}')
    lb_data_store.leaderboard_df_long_doc = lb_data_store.raw_df_long_doc.copy()
    shown_columns_long_doc, types_long_doc = get_default_cols(
        'long-doc', lb_data_store.slug, add_fix_cols=True)
    lb_data_store.types_long_doc = types_long_doc
    lb_data_store.leaderboard_df_long_doc = \
        lb_data_store.leaderboard_df_long_doc[
            ~lb_data_store.leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][shown_columns_long_doc]
    lb_data_store.leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)

    lb_data_store.reranking_models = sorted(
        frozenset(eval_result.reranking_model for eval_result in lb_data_store.raw_data))
    return lb_data_store


def load_eval_results(file_path: str):
    """Load one LeaderboardDataStore per benchmark version found under `file_path`"""
    output = {}
    for version in BENCHMARK_VERSION_LIST:
        fn = f"{file_path}/{version}"
        output[version] = load_leaderboard_datastore(fn, version)
    return output
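

# Minimal usage sketch, not part of the original module; the "eval-results"
# directory name is an assumption and must contain one sub-directory per entry
# in BENCHMARK_VERSION_LIST.
if __name__ == "__main__":
    datastores = load_eval_results("eval-results")
    for ver, store in datastores.items():
        print(ver, store.leaderboard_df_qa.shape, store.leaderboard_df_long_doc.shape)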