Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 3,954 Bytes
649e0fb ec8e2d4 649e0fb ec8e2d4 649e0fb ec8e2d4 649e0fb ec8e2d4 649e0fb 1a22df4 7845083 ec8e2d4 7845083 1a22df4 7845083 649e0fb ec8e2d4 649e0fb ec8e2d4 5e03e4a ec8e2d4 649e0fb ec8e2d4 649e0fb ec8e2d4 bf586e3 ec8e2d4 bf586e3 ec8e2d4 bf586e3 649e0fb ec8e2d4 649e0fb 7845083 649e0fb 7845083 649e0fb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
import os.path
from typing import List
import pandas as pd
from src.envs import (
BENCHMARK_VERSION_LIST,
COL_NAME_IS_ANONYMOUS,
COL_NAME_REVISION,
COL_NAME_TIMESTAMP,
DEFAULT_METRIC_LONG_DOC,
DEFAULT_METRIC_QA,
)
from src.models import FullEvalResult, LeaderboardDataStore
from src.utils import get_default_cols, get_leaderboard_df
pd.options.mode.copy_on_write = True
def load_raw_eval_results(results_path: str) -> List[FullEvalResult]:
    """
    Recursively collect ``results*.json`` files under ``results_path`` and
    load them as ``FullEvalResult`` objects.

    Files that fail to decode are skipped with a message. Results are
    de-duplicated by timestamp (the last file loaded for a given timestamp
    wins — NOTE(review): os.walk order is arbitrary, so "last" is not
    necessarily "latest"; confirm if latest-wins is intended). Results whose
    payload raises ``KeyError`` from ``to_dict()`` are dropped as well.

    Returns:
        List of successfully loaded and validated FullEvalResult objects.
    """
    model_result_filepaths = []
    for root, _dirs, files in os.walk(results_path):
        for file in files:
            # only files named like "results*.json" are evaluation outputs
            if not (file.startswith("results") and file.endswith(".json")):
                print(f"skip {file}")
                continue
            model_result_filepaths.append(os.path.join(root, file))

    # de-duplicate by timestamp: a later file overwrites an earlier one
    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        try:
            eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
        except UnicodeDecodeError:
            print(f"loading file failed. {model_result_filepath}")
            continue
        print(f"file loaded: {model_result_filepath}")
        eval_results[eval_result.timestamp] = eval_result

    # keep only results that serialize without missing keys
    results = []
    for timestamp, eval_result in eval_results.items():
        try:
            eval_result.to_dict()  # validation only; the return value is discarded
            results.append(eval_result)
        except KeyError:
            print(f"loading failed: {timestamp}")
    return results
def get_safe_name(name: str):
    """Return an RFC 1123 compatible safe name.

    Hyphens become underscores, letters are lower-cased, and every
    character that is neither alphanumeric nor an underscore is dropped.
    """
    normalized = name.replace("-", "_")
    kept = [ch.lower() for ch in normalized if ch.isalnum() or ch == "_"]
    return "".join(kept)
def load_leaderboard_datastore(file_path, version) -> LeaderboardDataStore:
    """
    Build the LeaderboardDataStore for one benchmark version.

    Loads the raw evaluation results from ``file_path``, derives the QA and
    Long-Doc leaderboard DataFrames (anonymous submissions removed, default
    columns selected, revision/timestamp columns dropped), and collects the
    sorted set of reranking model names.
    """
    # slug is the last 4 chars of the safe name, e.g. "air_bench_2404" -> "2404"
    slug = get_safe_name(version)[-4:]
    lb_data_store = LeaderboardDataStore(version, slug, None, None, None, None, None, None, None, None)
    lb_data_store.raw_data = load_raw_eval_results(file_path)
    print(f"raw data: {len(lb_data_store.raw_data)}")

    lb_data_store.raw_df_qa = get_leaderboard_df(lb_data_store, task="qa", metric=DEFAULT_METRIC_QA)
    print(f"QA data loaded: {lb_data_store.raw_df_qa.shape}")
    lb_data_store.leaderboard_df_qa, lb_data_store.types_qa = _prepare_display_df(
        lb_data_store.raw_df_qa, "qa", slug
    )

    lb_data_store.raw_df_long_doc = get_leaderboard_df(lb_data_store, task="long-doc", metric=DEFAULT_METRIC_LONG_DOC)
    print(f"Long-Doc data loaded: {len(lb_data_store.raw_df_long_doc)}")
    lb_data_store.leaderboard_df_long_doc, lb_data_store.types_long_doc = _prepare_display_df(
        lb_data_store.raw_df_long_doc, "long-doc", slug
    )

    # unique reranking models across all raw results, sorted for stable display
    lb_data_store.reranking_models = sorted({result.reranking_model for result in lb_data_store.raw_data})
    return lb_data_store


def _prepare_display_df(raw_df, task, slug):
    """Filter a raw leaderboard DataFrame for display.

    Drops anonymous submissions, keeps only the default columns for ``task``,
    and removes the revision/timestamp columns. Returns the filtered copy
    together with the column types reported by ``get_default_cols``.
    """
    shown_columns, types = get_default_cols(task, slug, add_fix_cols=True)
    display_df = raw_df.copy()
    display_df = display_df[~display_df[COL_NAME_IS_ANONYMOUS]][shown_columns]
    display_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
    return display_df, types
def load_eval_results(file_path: str):
    """Load a LeaderboardDataStore for every benchmark version.

    Each version's results are expected in the ``{file_path}/{version}``
    subdirectory. Returns a dict mapping version name to its datastore.
    """
    return {
        version: load_leaderboard_datastore(f"{file_path}/{version}", version)
        for version in BENCHMARK_VERSION_LIST
    }
|