dongsheng's picture
Update app.py
b06c3f3 verified
raw
history blame
10.1 kB
import gradio as gr
import json
import pandas as pd
from urllib.request import urlopen, URLError
import re
from datetime import datetime
# Constants
# BibTeX entry displayed in the "Citation" accordion of the UI.
CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
author={OpenCompass Contributors},
howpublished = {\url{https://github.com/open-compass/opencompass}},
year={2023}
}"""
# Label shown above the citation textbox.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# Development environment
# DATA_URL_BASE = "http://opencompass.oss-cn-shanghai.aliyuncs.com/dev-assets/research-rank/research-data.REALTIME."
# Production environment
# URL prefix of the dated data files; "<YYYYMMDD>.json" is appended when probing.
DATA_URL_BASE = "http://opencompass.oss-cn-shanghai.aliyuncs.com/assets/research-rank/research-data.REALTIME."
def find_latest_data_url():
    """Find the most recent reachable data file by probing dated URLs.

    Starting with today's date and stepping back one day at a time (365
    candidates in total), build ``{DATA_URL_BASE}{YYYYMMDD}.json`` and return
    the first URL that can be opened.

    Returns:
        tuple: ``(url, date_str)`` for the first reachable file, where
        ``date_str`` is formatted as ``YYYYMMDD``; ``(None, None)`` when none
        of the candidate URLs can be opened.
    """
    # Imported locally because the module only imports ``datetime`` itself.
    from datetime import timedelta

    today = datetime.now()
    for days_back in range(365):
        # timedelta arithmetic rolls over month and year boundaries.
        # ``today.replace(day=today.day - i)`` raised ValueError as soon as
        # the computed day number dropped below 1.
        date_str = (today - timedelta(days=days_back)).strftime("%Y%m%d")
        url = f"{DATA_URL_BASE}{date_str}.json"
        try:
            # Only reachability matters here; the context manager closes the
            # response instead of leaking the connection.
            with urlopen(url):
                return url, date_str
        except URLError:
            continue
    return None, None
def get_latest_data():
    """Return the newest data URL together with a display-ready date.

    Returns:
        tuple: ``(data_url, update_time)`` where ``update_time`` is the
        ``YYYYMMDD`` stamp of the file re-formatted as ``YYYY-MM-DD``.

    Raises:
        Exception: if ``find_latest_data_url`` reports no usable URL.
    """
    located_url, raw_stamp = find_latest_data_url()
    if not located_url:
        raise Exception("Could not find valid data URL")
    # Round-trip through datetime to convert YYYYMMDD into YYYY-MM-DD.
    parsed_stamp = datetime.strptime(raw_stamp, "%Y%m%d")
    return located_url, parsed_stamp.strftime("%Y-%m-%d")
def get_leaderboard_title(update_time):
    """Build the Markdown H1 heading that carries the last-updated stamp."""
    heading_template = "# CompassAcademic Leaderboard (Last Updated: {})"
    return heading_template.format(update_time)
# Markdown blurb rendered at the top of the main leaderboard tab.
MAIN_LEADERBOARD_DESCRIPTION = """## Main Evaluation Results
The CompassAcademic currently focuses on the comprehensive reasoning abilities of LLMs.
- The datasets selected so far include General Knowledge Reasoning (MMLU-Pro/GPQA-Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Completion (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
- Currently, the evaluation primarily targets chat models, with updates featuring the latest community models at irregular intervals.
- Prompts and reproduction scripts can be found in [**OpenCompass**: A Toolkit for Evaluation of LLMs](https://github.com/open-compass/opencompass)🏆.
"""
# Choices for the "Model Size" checkbox group; filter_table matches these labels literally.
MODEL_SIZE = ['<10B', '10B-70B', '>70B', 'Unknown']
# Choices for the "Model Type" checkbox group; filter_table matches these labels literally.
MODEL_TYPE = ['API', 'OpenSource']
def load_data(data_url):
    """Download and decode the JSON document at *data_url*.

    Args:
        data_url: URL understood by ``urllib.request.urlopen``.

    Returns:
        The parsed JSON payload (the body is decoded as UTF-8).

    Raises:
        URLError: if the URL cannot be opened.
        json.JSONDecodeError: if the body is not valid JSON.
    """
    # The context manager closes the response even if reading or parsing
    # fails; previously the connection was never closed explicitly.
    with urlopen(data_url) as response:
        return json.loads(response.read().decode('utf-8'))
def build_main_table(data):
    """Turn the raw leaderboard payload into the table shown in the UI.

    Args:
        data: Parsed payload. ``data['globalData']['OverallTable']`` supplies
            the rows and ``data['models'][<model>]['release']`` tells whether
            a model is open source.

    Returns:
        pandas.DataFrame: one row per model with display column names, plus a
        derived ``OpenSource`` (``'Yes'``/``'No'``) column and an integer
        ``Rank`` column (1 = highest ``Average``; ties share the lowest rank).
    """
    table = pd.DataFrame(data['globalData']['OverallTable'])
    release_info = data['models']

    def open_source_flag(model_name):
        # Any release value other than 'OpenSource' is reported as 'No'.
        if release_info[model_name]['release'] == 'OpenSource':
            return 'Yes'
        return 'No'

    table['OpenSource'] = table['model'].apply(open_source_flag)

    ranks = table['Average'].rank(ascending=False, method='min')
    table['Rank'] = ranks.astype(int)

    # Source column -> display column; the dict order is the column order.
    # NOTE(review): 'GQPA-Diamond' differs from the 'GPQA-Diamond' spelling
    # used in the page description; presumably it mirrors the key in the
    # upstream JSON, so confirm before renaming.
    display_names = {
        'Rank': 'Rank',
        'model': 'Model',
        'org': 'Organization',
        'num': 'Parameters',
        'OpenSource': 'OpenSource',
        'Average': 'Average Score',
        'BBH': 'BBH',
        'Math-500': 'Math-500',
        'AIME': 'AIME',
        'MMLU-Pro': 'MMLU-Pro',
        'LiveCodeBench': 'LiveCodeBench',
        'HumanEval': 'HumanEval',
        'GQPA-Diamond': 'GQPA-Diamond',
        'IFEval': 'IFEval',
    }
    selected = table[list(display_names)]
    return selected.rename(columns=display_names)
def filter_table(df, size_ranges, model_types):
    """Return the rows of *df* that match the selected size and type filters.

    Args:
        df: Leaderboard table with a ``Parameters`` column (strings such as
            ``'7B'`` or ``'N/A'``) and an ``OpenSource`` column
            (``'Yes'``/``'No'``).
        size_ranges: Selected size buckets, any of ``'<10B'``, ``'10B-70B'``,
            ``'>70B'``, ``'Unknown'``. An empty or ``None`` selection disables
            size filtering.
        model_types: Selected model types, any of ``'API'`` (rows whose
            ``OpenSource`` is ``'No'``) and ``'OpenSource'`` (``'Yes'``). An
            empty or ``None`` selection disables type filtering.

    Returns:
        pandas.DataFrame: a filtered copy with the same columns as *df*;
        *df* itself is not modified.
    """

    def size_in_b(param):
        # Parse a 'Parameters' cell such as '7B' into a float (billions).
        # Anything that cannot be parsed is NaN and lands in the 'Unknown'
        # bucket.
        if param == 'N/A':
            return float('nan')
        try:
            return float(param.replace('B', ''))
        except (AttributeError, TypeError, ValueError):
            # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            return float('nan')

    filtered_df = df.copy()

    if size_ranges:
        # Keep the parsed sizes in a standalone Series rather than adding a
        # temporary column and dropping it in place from a filtered slice,
        # which can trigger pandas' SettingWithCopyWarning.
        sizes = filtered_df['Parameters'].apply(size_in_b).astype(float)
        bucket_masks = {
            # Comparisons against NaN are False, so unparsable sizes only
            # ever match 'Unknown'.
            '<10B': sizes < 10,
            '10B-70B': (sizes >= 10) & (sizes < 70),
            '>70B': sizes >= 70,
            'Unknown': sizes.isna(),
        }
        mask = pd.Series(False, index=filtered_df.index)
        for size_range in size_ranges:
            if size_range in bucket_masks:  # unrecognised labels match nothing
                mask |= bucket_masks[size_range]
        filtered_df = filtered_df[mask]

    if model_types:
        # Translate the UI labels into the values stored in 'OpenSource'.
        wanted_flags = []
        if 'API' in model_types:
            wanted_flags.append('No')
        if 'OpenSource' in model_types:
            wanted_flags.append('Yes')
        filtered_df = filtered_df[filtered_df['OpenSource'].isin(wanted_flags)]

    return filtered_df
def calculate_column_widths(df):
    """Estimate a pixel width for every column of *df*.

    The estimate is the larger of 10 px per header character and 8 px per
    character of the longest stringified cell, plus 20 px, then clamped to
    the range 160-400.

    Returns:
        list: one width per column, in column order.
    """

    def width_for(column):
        longest_cell = df[column].astype(str).map(len).max()
        estimate = max(len(str(column)) * 10, longest_cell * 8) + 20
        # Clamp so short columns stay readable and long ones do not dominate.
        return max(160, min(400, estimate))

    return [width_for(column) for column in df.columns]
class DataState:
    """Mutable holder for the most recently built leaderboard table."""

    def __init__(self) -> None:
        # Full, unfiltered table; None until something assigns a DataFrame.
        self.current_df = None


# Module-level instance shared by the UI callbacks.
data_state = DataState()
def create_interface():
    """Build the Gradio Blocks UI for the CompassAcademic leaderboard.

    The data is loaded once while the interface is being built (falling back
    to an empty table and an error title on failure). The UI offers size and
    type filters, a manual "Refresh Data" button and a periodic refresh that
    is registered on page load.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    empty_df = pd.DataFrame(columns=[
        'Rank', 'Model', 'Organization', 'Parameters', 'OpenSource',
        'Average Score', 'BBH', 'Math-500', 'AIME', 'MMLU-Pro',
        'LiveCodeBench', 'HumanEval', 'GQPA-Diamond', 'IFEval'
    ])

    def timestamp():
        # Wall-clock stamp used in the status line.
        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def fetch_and_filter(size_ranges, model_types):
        # Shared pipeline for the initial load and every refresh: download,
        # build the table, cache the unfiltered table, then filter and sort.
        data_url, update_time = get_latest_data()
        data = load_data(data_url)
        new_df = build_main_table(data)
        data_state.current_df = new_df
        filtered_df = filter_table(new_df, size_ranges, model_types)
        return (
            get_leaderboard_title(update_time),
            filtered_df.sort_values("Average Score", ascending=False),
        )

    def load_initial_data():
        # Build-time load; never raises so the app can still start offline.
        try:
            return fetch_and_filter(MODEL_SIZE, MODEL_TYPE)
        except Exception as e:
            print(f"Error loading initial data: {e}")
            return "# CompassAcademic Leaderboard (Error loading data)", empty_df

    def refresh_data(size_ranges, model_types):
        # Returns (title, table), or (None, None) when the refresh failed.
        try:
            return fetch_and_filter(size_ranges, model_types)
        except Exception as e:
            print(f"Error refreshing data: {e}")
            return None, None

    def refresh_outputs(label, size_ranges, model_types):
        # Produce the (title, table, status) triple for a refresh event.
        title, data = refresh_data(size_ranges, model_types)
        if title is None or data is None:
            # Returning None would blank the title and the table; an empty
            # gr.update() leaves the components as they are. The status line
            # reports the failure instead of claiming a successful update.
            return (
                gr.update(),
                gr.update(),
                f"Last {label} update failed: {timestamp()}",
            )
        return title, data, f"Last {label} update: {timestamp()}"

    def auto_refresh():
        """Single refresh function for automatic updates"""
        # NOTE(review): as before, the periodic refresh renders the table
        # with the default (all) filters, not the user's current selection.
        return refresh_outputs("auto", MODEL_SIZE, MODEL_TYPE)

    def refresh_and_update(size_ranges, model_types):
        # Manual refresh keeps whatever filters are currently ticked.
        return refresh_outputs("manual", size_ranges, model_types)

    def update_table(size_ranges, model_types):
        # Re-filter the cached table when a checkbox group changes.
        if data_state.current_df is None:
            return empty_df
        filtered_df = filter_table(data_state.current_df, size_ranges, model_types)
        return filtered_df.sort_values("Average Score", ascending=False)

    initial_title, initial_data = load_initial_data()

    with gr.Blocks() as demo:
        title_comp = gr.Markdown(initial_title)

        with gr.Tabs() as tabs:
            with gr.TabItem("🏅 Main Leaderboard", elem_id='main'):
                gr.Markdown(MAIN_LEADERBOARD_DESCRIPTION)

                with gr.Row():
                    with gr.Column():
                        size_filter = gr.CheckboxGroup(
                            choices=MODEL_SIZE,
                            value=MODEL_SIZE,
                            label='Model Size',
                            interactive=True,
                        )
                    with gr.Column():
                        type_filter = gr.CheckboxGroup(
                            choices=MODEL_TYPE,
                            value=MODEL_TYPE,
                            label='Model Type',
                            interactive=True,
                        )

                with gr.Column():
                    table = gr.DataFrame(
                        value=initial_data,
                        interactive=False,
                        wrap=False,
                        column_widths=calculate_column_widths(initial_data),
                    )

                refresh_button = gr.Button("Refresh Data")
                update_status = gr.Markdown("Last update: " + timestamp())

                refresh_button.click(
                    fn=refresh_and_update,
                    inputs=[size_filter, type_filter],
                    outputs=[title_comp, table, update_status],
                )

                # Automatic refresh
                demo.load(
                    fn=auto_refresh,
                    outputs=[title_comp, table, update_status],
                    every=10  # refresh every 10 seconds
                )

                size_filter.change(
                    fn=update_table,
                    inputs=[size_filter, type_filter],
                    outputs=table,
                )
                type_filter.change(
                    fn=update_table,
                    inputs=[size_filter, type_filter],
                    outputs=table,
                )

        with gr.Row():
            with gr.Accordion("Citation", open=False):
                citation_button = gr.Textbox(
                    value=CITATION_BUTTON_TEXT,
                    label=CITATION_BUTTON_LABEL,
                    elem_id='citation-button',
                )

    return demo
# Script entry point: build the UI, enable Gradio's queue, then launch the
# server bound to 0.0.0.0.
if __name__ == '__main__':
    demo = create_interface()
    demo.queue()
    demo.launch(server_name='0.0.0.0')