import os
from typing import Dict, List

import pandas as pd

from src.envs import DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC
from src.display.columns import COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
from src.models import FullEvalResult, LeaderboardDataStore
from src.utils import get_default_cols, get_leaderboard_df

pd.options.mode.copy_on_write = True


def load_raw_eval_results(results_path: str) -> List[FullEvalResult]:
    """Load the evaluation results from the `results*.json` files found under `results_path`."""
    model_result_filepaths = []
    for root, _dirs, files in os.walk(results_path):
        if len(files) == 0:
            continue
        # only keep result files named `results*.json`
        for file in files:
            if not (file.startswith("results") and file.endswith(".json")):
                print(f"skip {file}")
                continue
            model_result_filepaths.append(os.path.join(root, file))

    # parse each file; results are keyed by timestamp, so later files with the
    # same timestamp overwrite earlier ones
    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        try:
            eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
        except UnicodeDecodeError as e:
            print(f"loading file failed: {model_result_filepath} ({e})")
            continue
        print(f"file loaded: {model_result_filepath}")
        eval_results[eval_result.timestamp] = eval_result

    # drop results that cannot be converted to a dict (e.g. missing keys)
    results = []
    for timestamp, eval_result in eval_results.items():
        try:
            eval_result.to_dict()
            results.append(eval_result)
        except KeyError:
            print(f"loading failed: {timestamp}")
            continue
    return results


def load_leaderboard_datastore(file_path: str) -> LeaderboardDataStore:
    """Build the QA and Long-Doc leaderboard DataFrames for a single benchmark version."""
    lb_data_store = LeaderboardDataStore(None, None, None, None, None, None, None, None)
    lb_data_store.raw_data = load_raw_eval_results(file_path)
    print(f"raw data: {len(lb_data_store.raw_data)}")

    # QA leaderboard: keep the default columns, hide anonymous submissions,
    # and drop the revision/timestamp columns from the displayed table
    lb_data_store.raw_df_qa = get_leaderboard_df(
        lb_data_store.raw_data, task="qa", metric=DEFAULT_METRIC_QA)
    print(f"QA data loaded: {lb_data_store.raw_df_qa.shape}")
    lb_data_store.leaderboard_df_qa = lb_data_store.raw_df_qa.copy()
    shown_columns_qa, types_qa = get_default_cols(
        "qa", lb_data_store.leaderboard_df_qa.columns, add_fix_cols=True)
    lb_data_store.types_qa = types_qa
    lb_data_store.leaderboard_df_qa = \
        lb_data_store.leaderboard_df_qa[~lb_data_store.leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
    lb_data_store.leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)

    # Long-Doc leaderboard: same filtering as above
    lb_data_store.raw_df_long_doc = get_leaderboard_df(
        lb_data_store.raw_data, task="long-doc", metric=DEFAULT_METRIC_LONG_DOC)
    print(f"Long-Doc data loaded: {len(lb_data_store.raw_df_long_doc)}")
    lb_data_store.leaderboard_df_long_doc = lb_data_store.raw_df_long_doc.copy()
    shown_columns_long_doc, types_long_doc = get_default_cols(
        "long-doc", lb_data_store.leaderboard_df_long_doc.columns, add_fix_cols=True)
    lb_data_store.types_long_doc = types_long_doc
    lb_data_store.leaderboard_df_long_doc = \
        lb_data_store.leaderboard_df_long_doc[
            ~lb_data_store.leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][shown_columns_long_doc]
    lb_data_store.leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)

    # unique, sorted list of reranking models present in the raw results
    lb_data_store.reranking_models = sorted(
        {eval_result.reranking_model for eval_result in lb_data_store.raw_data})
    return lb_data_store


def load_eval_results(file_path: str) -> Dict[str, LeaderboardDataStore]:
    """Load one LeaderboardDataStore per benchmark version found under `file_path`."""
    output = {}
    versions = ("AIR-Bench_24.04",)
    for version in versions:
        fn = f"{file_path}/{version}"
        output[version] = load_leaderboard_datastore(fn)
    return output
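

# Illustrative usage sketch only, not part of the loader itself: "./eval-results"
# is an assumed local directory containing an "AIR-Bench_24.04" subfolder with
# results*.json files; the real path comes from the application's configuration.
if __name__ == "__main__":
    datastores = load_eval_results("./eval-results")
    for version, store in datastores.items():
        print(version, store.leaderboard_df_qa.shape, store.leaderboard_df_long_doc.shape)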