import json
import re
from urllib.request import urlopen

import gradio as gr
import pandas as pd

# Constants
CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
    title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
    author={OpenCompass Contributors},
    howpublished = {\url{https://github.com/open-compass/opencompass}},
    year={2023}
}"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
OPENCOMPASS_README = (
    'https://raw.githubusercontent.com/open-compass/opencompass/main/README.md'
)
GITHUB_REPO = 'https://github.com/open-compass/opencompass'
GITHUB_RAW = 'https://raw.githubusercontent.com/open-compass/opencompass'
GITHUB_BLOB = 'https://github.com/open-compass/opencompass/blob'

# URL for the JSON data
DATA_URL = "http://opencompass.oss-cn-shanghai.aliyuncs.com/assets/research-rank/research-data.24-12.20241205.json"

# Markdown content
MAIN_LEADERBOARD_TITLE = "# CompassAcademic Leaderboard"

MAIN_LEADERBOARD_DESCRIPTION = """## Main Evaluation Results

The CompassAcademic leaderboard currently focuses on the comprehensive reasoning abilities of LLMs.

- The datasets selected so far include General Knowledge Reasoning (MMLU-Pro/GPQA-Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Completion (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
- Currently, the evaluation primarily targets chat models, with updates featuring the latest community models at irregular intervals.
- Prompts and reproduction scripts can be found in [**OpenCompass**: A Toolkit for Evaluation of LLMs](https://github.com/open-compass/opencompass)πŸ†.
"""


def fix_image_urls(content):
    """Rewrite relative image paths in markdown content to absolute GitHub URLs."""
    # Handle the specific logo.svg path
    content = content.replace(
        'docs/en/_static/image/logo.svg',
        'https://raw.githubusercontent.com/open-compass/opencompass/main/docs/en/_static/image/logo.svg',
    )
    # Replace other relative image paths with absolute GitHub raw URLs,
    # keeping the original alt text (group 1) and rewriting the path (group 2).
    content = re.sub(
        r'!\[([^\]]*)\]\((?!http)([^)]+)\)',
        lambda m: f'![{m.group(1)}]({GITHUB_RAW}/main/{m.group(2)})',
        content,
    )
    return content


MODEL_SIZE = ['<10B', '10B-70B', '>70B', 'Unknown']
MODEL_TYPE = ['API', 'OpenSource']


def load_data():
    """Fetch and parse the leaderboard JSON from DATA_URL."""
    response = urlopen(DATA_URL)
    data = json.loads(response.read().decode('utf-8'))
    return data


def build_main_table(data):
    """Build the main leaderboard DataFrame from the raw JSON payload."""
    df = pd.DataFrame(data['globalData']['OverallTable'])

    # Add OpenSource column based on models data
    models_data = data['models']
    df['OpenSource'] = df['model'].apply(
        lambda x: 'Yes' if models_data[x]['release'] == 'OpenSource' else 'No'
    )

    # Add Rank column based on Average Score
    df['Rank'] = df['Average'].rank(ascending=False, method='min').astype(int)

    columns = {
        'Rank': 'Rank',
        'model': 'Model',
        'org': 'Organization',
        'num': 'Parameters',
        'OpenSource': 'OpenSource',
        'Average': 'Average Score',
        'BBH': 'BBH',
        'Math-500': 'Math-500',
        'AIME': 'AIME',
        'MMLU-Pro': 'MMLU-Pro',
        'LiveCodeBench': 'LiveCodeBench',
        'HumanEval': 'HumanEval',
        # NOTE: the source JSON spells this key 'GQPA-Diamond'; display it
        # under the standard benchmark name.
        'GQPA-Diamond': 'GPQA-Diamond',
        'IFEval': 'IFEval',
    }
    df = df[list(columns.keys())].rename(columns=columns)
    return df
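
# The remote payload at DATA_URL is not documented here; the sketch below is
# reconstructed from the access patterns in load_data/build_main_table above
# and filter_table below. Field values are illustrative only, and keys this
# app does not read are omitted:
#
#   {
#       "globalData": {
#           "OverallTable": [
#               {
#                   "model": "<model name>", "org": "<organization>",
#                   "num": "7B",  # parameter-count string, or 'N/A'
#                   "Average": 55.0, "BBH": 60.1, "Math-500": 50.2,
#                   "AIME": 10.0, "MMLU-Pro": 58.3, "LiveCodeBench": 20.4,
#                   "HumanEval": 85.5, "GQPA-Diamond": 35.6, "IFEval": 70.7,
#               },
#               ...
#           ]
#       },
#       "models": {
#           # 'release' == 'OpenSource' marks open-source models; any other
#           # value is treated as an API model.
#           "<model name>": {"release": "OpenSource"},
#       },
#   }
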
def filter_table(df, size_ranges, model_types):
    """Filter the leaderboard by parameter-size ranges and model types."""
    filtered_df = df.copy()

    # Filter by size
    if size_ranges:

        def get_size_in_B(param):
            """Parse a parameter string such as '7B' into a float; None if unknown."""
            if param == 'N/A':
                return None
            try:
                return float(param.replace('B', ''))
            except (ValueError, AttributeError):
                return None

        filtered_df['size_in_B'] = filtered_df['Parameters'].apply(
            get_size_in_B
        )
        mask = pd.Series(False, index=filtered_df.index)
        for size_range in size_ranges:
            if size_range == '<10B':
                mask |= (filtered_df['size_in_B'] < 10) & (
                    filtered_df['size_in_B'].notna()
                )
            elif size_range == '10B-70B':
                mask |= (filtered_df['size_in_B'] >= 10) & (
                    filtered_df['size_in_B'] < 70
                )
            elif size_range == '>70B':
                mask |= filtered_df['size_in_B'] >= 70
            elif size_range == 'Unknown':
                mask |= filtered_df['size_in_B'].isna()
        filtered_df = filtered_df[mask]
        filtered_df.drop('size_in_B', axis=1, inplace=True)

    # Filter by model type
    if model_types:
        type_mask = pd.Series(False, index=filtered_df.index)
        for model_type in model_types:
            if model_type == 'API':
                type_mask |= filtered_df['OpenSource'] == 'No'
            elif model_type == 'OpenSource':
                type_mask |= filtered_df['OpenSource'] == 'Yes'
        filtered_df = filtered_df[type_mask]

    # Return the filtered DataFrame
    return filtered_df


def calculate_column_widths(df):
    """Dynamically calculate column widths based on content length."""
    column_widths = []
    for column in df.columns:
        # Get max length of column name and values
        header_length = len(str(column))
        max_content_length = df[column].astype(str).map(len).max()
        # Use the larger of header or content width: roughly 8 pixels per
        # content character (10 per header character so headers always fit),
        # plus 20 pixels of padding.
        width = max(header_length * 10, max_content_length * 8) + 20
        # Clamp between 160 and 400 pixels to avoid extremely narrow or
        # extremely wide columns.
        width = max(160, width)
        width = min(400, width)
        column_widths.append(width)
    return column_widths


def create_interface():
    """Build the Gradio Blocks app for the leaderboard."""
    data = load_data()
    df = build_main_table(data)

    with gr.Blocks() as demo:
        gr.Markdown(MAIN_LEADERBOARD_TITLE)
        with gr.Tabs():
            with gr.TabItem("πŸ… Main Leaderboard", elem_id='main'):
                gr.Markdown(MAIN_LEADERBOARD_DESCRIPTION)
                with gr.Row():
                    with gr.Column():
                        size_filter = gr.CheckboxGroup(
                            choices=MODEL_SIZE,
                            value=MODEL_SIZE,
                            label='Model Size',
                            interactive=True,
                        )
                    with gr.Column():
                        type_filter = gr.CheckboxGroup(
                            choices=MODEL_TYPE,
                            value=MODEL_TYPE,
                            label='Model Type',
                            interactive=True,
                        )
                with gr.Column():
                    table = gr.DataFrame(
                        value=df.sort_values("Average Score", ascending=False),
                        interactive=False,
                        wrap=False,  # disable automatic line wrapping
                        column_widths=calculate_column_widths(df),
                    )

                def update_table(size_ranges, model_types):
                    filtered_df = filter_table(df, size_ranges, model_types)
                    return filtered_df.sort_values(
                        "Average Score", ascending=False
                    )

                size_filter.change(
                    fn=update_table,
                    inputs=[size_filter, type_filter],
                    outputs=table,
                )
                type_filter.change(
                    fn=update_table,
                    inputs=[size_filter, type_filter],
                    outputs=table,
                )

            # with gr.TabItem("πŸ” About", elem_id='about'):
            #     readme_content = urlopen(OPENCOMPASS_README).read().decode()
            #     fixed_content = fix_image_urls(readme_content)
            #     gr.Markdown(fixed_content)

        with gr.Row():
            with gr.Accordion("Citation", open=False):
                gr.Textbox(
                    value=CITATION_BUTTON_TEXT,
                    label=CITATION_BUTTON_LABEL,
                    elem_id='citation-button',
                )

    return demo


if __name__ == '__main__':
    demo = create_interface()
    demo.launch(server_name='0.0.0.0')
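
# Quick headless smoke test (a sketch, assuming network access to DATA_URL and
# that this file is saved as app.py, a hypothetical name):
#
#   from app import load_data, build_main_table, filter_table
#   df = build_main_table(load_data())
#   print(filter_table(df, ['<10B'], ['OpenSource']).head())
#
# Running `python app.py` directly instead serves the Gradio app on
# 0.0.0.0:7860 (Gradio's default port).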