"""A gradio app that renders a static leaderboard. This is used for Hugging Face Space.""" # import ast # import argparse # import glob # import pickle import gradio as gr import numpy as np import pandas as pd import os from collections import defaultdict from matplotlib.colors import LinearSegmentedColormap def make_default_md(): leaderboard_md = f""" # 🔎📚🪡📚❓ BABILong Leaderboard 🏆 [![Dataset on HF](https://huggingface.co/datasets/huggingface/badges/resolve/main/dataset-on-hf-lg.svg)](https://huggingface.co/datasets/booydar/babilong) | [GitHub](https://github.com/booydar/babilong) | [Paper](https://arxiv.org/abs/2406.10149) | [HF Dataset](https://huggingface.co/datasets/booydar/babilong) | [HF Dataset 1k samples per task](https://huggingface.co/datasets/RMT-team/babilong-1k-samples) | """ return leaderboard_md def make_arena_leaderboard_md(total_models): leaderboard_md = f"""Total #models: **{total_models}**. Last updated: July 29, 2024.""" return leaderboard_md def make_model_desc_md(f_len): desc_md = make_arena_leaderboard_md(f_len) models = next(os.walk('info'))[2] for model in models: model_name = model.split('.md')[0] with open(os.path.join('info', model), 'r') as f: description = f.read() desc_md += f"\n\n### {model_name}\n{description}" return desc_md def model_hyperlink(model_name, link): return f'{model_name}' def load_model(folders, tab_name, msg_lengths): results = defaultdict(list) class NA(): def __repr__(self) -> str: return '-' def __float__(self): return 0.0 mean_score = [] for i, folder in enumerate(folders): model_name = folder.split('/')[-1] if 'fine-tune' in model_name: model_name += ' 🛠️' if 'rag' in model_name.lower() or 'retrieve' in model_name.lower(): model_name += ' 🔎' results['Model'].append(model_name) for task in msg_lengths: if not os.path.isfile(f'{folder}/{tab_name}/{task}.csv'): results[msg_lengths[task]].append(NA()) else: df = pd.read_csv(f'{folder}/{tab_name}/{task}.csv') results[msg_lengths[task]].append(int(df['result'].sum() / len(df) * 100)) mean_score.append(-np.mean([float(results[msg_lengths[task]][i]) for task in list(msg_lengths.keys())[:10]])) res_df = pd.DataFrame(results) lengths = list(msg_lengths.values()) res_df['mean_score'] = mean_score res_df['num_lengths'] = -(res_df[lengths].astype(float) > 0).sum(axis=1) res_df = res_df[res_df.num_lengths != 0] res_df.sort_values(['num_lengths', 'mean_score'], inplace=True) res_df['Rank'] = range(1, res_df.shape[0] + 1) res_df['Avg ≤32k'] = res_df[lengths[:5]].astype(float).fillna(0).mean(axis=1).astype(int) res_df['Avg ≤128k'] = res_df[lengths[:7]].astype(float).fillna(0).mean(axis=1).astype(int) ordered_columns = ['Rank', 'Model', 'Avg ≤32k', 'Avg ≤128k'] + lengths res_df = res_df[ordered_columns] return res_df def build_leaderboard_tab(folders): default_md = make_default_md() md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown") msg_lengths = { '0': '0k', '4000': '4k', '8000': '8k', '16000': '16k', '32000': '32k', '64000': '64k', '128000': '128k', '500000': '500k', '1000000': '1M', '10000000': '10M' } with gr.Tabs() as tabs: for tab_id, tab_name in enumerate(['avg', 'qa1','qa2', 'qa3', 'qa4', 'qa5']): df = load_model(folders, tab_name, msg_lengths) cmap = LinearSegmentedColormap.from_list('ryg', ["red", "yellow", "green"], N=256) # df = df.style.background_gradient(cmap=cmap, vmin=0, vmax=100, subset=list(msg_lengths.values())) df = df.style.background_gradient(cmap=cmap, vmin=0, vmax=100, subset=df.columns[2:]) # arena table with gr.Tab(tab_name, id=tab_id): md = make_arena_leaderboard_md(len(folders)) gr.Markdown(md, elem_id="leaderboard_markdown") gr.Dataframe( headers=[ "Rank", "Model", ] + list(msg_lengths.values()) + ['Avg ≤32k', 'Avg ≤128k'], datatype=[ "str", "markdown", "str", "str", "str", "str", "str", "str", "str", "str", "str", ], value=df, elem_id="arena_leaderboard_dataframe", height=700, column_widths=[20, 150] + [30] * 2 + [20] * len(msg_lengths), wrap=True, ) with gr.Tab("Description", id=tab_id + 1): desc_md = make_model_desc_md(len(folders)) gr.Markdown(desc_md, elem_id="leaderboard_markdown") return [md_1] block_css = """ #notice_markdown { font-size: 104% } #notice_markdown th { display: none; } #notice_markdown td { padding-top: 6px; padding-bottom: 6px; } #leaderboard_markdown { font-size: 104% } #leaderboard_markdown td { padding-top: 6px; padding-bottom: 6px; } #leaderboard_dataframe td { line-height: 0.1em; } footer { display:none !important } .image-container { display: flex; align-items: center; padding: 1px; } .image-container img { margin: 0 30px; height: 20px; max-height: 100%; width: auto; max-width: 20%; } """ def build_demo(folders): text_size = gr.themes.sizes.text_lg with gr.Blocks( title="Babilong leaderboard", theme=gr.themes.Base(text_size=text_size), css=block_css, ) as demo: leader_components = build_leaderboard_tab(folders) return demo if __name__ == "__main__": folders = [f'results/{folders}' for folders in os.listdir('results')] demo = build_demo(folders) demo.launch(share=False)