File size: 9,554 Bytes
03b1dbc
 
 
4224b43
03b1dbc
4224b43
03b1dbc
 
 
 
 
 
 
 
 
003043b
3eb0235
003043b
3eb0235
4224b43
 
 
fb5b92b
4224b43
 
fb5b92b
4224b43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
03b1dbc
4224b43
 
 
03b1dbc
 
 
 
 
 
 
 
 
 
4224b43
 
03b1dbc
 
 
 
 
 
 
 
 
e2eea98
5fe7967
03b1dbc
5fe7967
 
 
 
 
03b1dbc
 
 
 
 
 
5fe7967
03b1dbc
 
 
 
 
 
 
 
5fe7967
 
03b1dbc
5fe7967
03b1dbc
 
5fe7967
03b1dbc
5fe7967
03b1dbc
 
 
 
5fe7967
03b1dbc
 
5fe7967
03b1dbc
 
 
 
 
 
 
 
5fe7967
03b1dbc
 
 
 
 
 
 
 
5fe7967
03b1dbc
2262cad
b06c3f3
5fe7967
 
 
 
 
03b1dbc
 
5fe7967
 
 
 
 
03b1dbc
5fe7967
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
03b1dbc
5fe7967
 
03b1dbc
 
 
5fe7967
03b1dbc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fe7967
03b1dbc
 
5fe7967
03b1dbc
5fe7967
 
2262cad
5fe7967
 
 
 
 
003043b
5fe7967
 
 
003043b
5fe7967
 
03b1dbc
 
 
 
 
5fe7967
03b1dbc
 
 
 
 
 
 
 
 
 
 
 
c5f2ca3
 
 
03b1dbc
 
 
 
 
 
5fe7967
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
import gradio as gr
import json
import pandas as pd
from urllib.request import urlopen, URLError
import re
from datetime import datetime

# Constants
# BibTeX entry shown in the "Citation" accordion of the UI.
CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
    title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
    author={OpenCompass Contributors},
    howpublished = {\url{https://github.com/open-compass/opencompass}},
    year={2023}
}"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# Development environment (kept for reference; swap with the line below to use it)
# DATA_URL_BASE = "http://opencompass.oss-cn-shanghai.aliyuncs.com/dev-assets/research-rank/research-data.REALTIME."
# Production environment
# A full data URL is this prefix followed by a YYYYMMDD date and ".json"
# (see find_latest_data_url).
DATA_URL_BASE = "http://opencompass.oss-cn-shanghai.aliyuncs.com/assets/research-rank/research-data.REALTIME."

def find_latest_data_url():
    """Find the most recent data file that can be opened.

    Starting from today and walking back one day at a time (up to 365 days),
    builds ``{DATA_URL_BASE}{YYYYMMDD}.json`` and tries to open it. The first
    URL that opens successfully wins.

    Returns:
        A ``(url, date_str)`` tuple where ``date_str`` is the ``YYYYMMDD``
        stamp embedded in the URL, or ``(None, None)`` when no date in the
        look-back window could be opened.
    """
    from datetime import timedelta

    # Per-probe timeout so one stalled connection cannot hang the whole scan.
    probe_timeout = 10
    today = datetime.now()
    for days_back in range(365):
        date_str = (today - timedelta(days=days_back)).strftime("%Y%m%d")
        url = f"{DATA_URL_BASE}{date_str}.json"
        try:
            # Only existence matters here; the context manager closes the
            # response immediately so probes do not leak sockets.
            with urlopen(url, timeout=probe_timeout):
                pass
        except OSError:
            # URLError/HTTPError are OSError subclasses; catching OSError also
            # covers socket timeouts and resets raised while waiting for the
            # response, which are treated as "not available for this date".
            continue
        return url, date_str
    return None, None

def get_latest_data():
    """Return the newest data URL together with a display-ready date.

    Returns:
        ``(data_url, update_time)`` where ``update_time`` is the file's
        date stamp reformatted from ``YYYYMMDD`` to ``YYYY-MM-DD``.

    Raises:
        Exception: if ``find_latest_data_url`` could not locate any data file.
    """
    data_url, update_time = find_latest_data_url()
    if data_url:
        parsed_date = datetime.strptime(update_time, "%Y%m%d")
        return data_url, parsed_date.strftime("%Y-%m-%d")
    raise Exception("Could not find valid data URL")

def get_leaderboard_title(update_time):
    """Return the Markdown H1 heading for the page, stamped with *update_time*."""
    return "# CompassAcademic Leaderboard (Last Updated: {})".format(update_time)

# Markdown blurb rendered at the top of the main leaderboard tab.
MAIN_LEADERBOARD_DESCRIPTION = """## Main Evaluation Results
The CompassAcademic currently focuses on the comprehensive reasoning abilities of LLMs.
- The datasets selected so far include General Knowledge Reasoning (MMLU-Pro/GPQA-Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Completion (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
- Currently, the evaluation primarily targets chat models, with updates featuring the latest community models at irregular intervals. 
- Prompts and reproduction scripts can be found in [**OpenCompass**: A Toolkit for Evaluation of LLMs](https://github.com/open-compass/opencompass)🏆.
"""

# Choices for the "Model Size" checkbox group; filter_table compares against
# these exact labels.
MODEL_SIZE = ['<10B', '10B-70B', '>70B', 'Unknown']
# Choices for the "Model Type" checkbox group; filter_table compares against
# these exact labels.
MODEL_TYPE = ['API', 'OpenSource']

def load_data(data_url, timeout=30):
    """Download and parse the JSON document at *data_url*.

    Args:
        data_url: URL of a UTF-8 encoded JSON file.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The decoded JSON payload.

    Raises:
        OSError: (including ``urllib.error.URLError``) if the URL cannot be
            opened or the connection times out.
        ValueError: if the body is not valid UTF-8 JSON.
    """
    # The context manager guarantees the response (and its socket) is closed
    # even when reading or decoding fails.
    with urlopen(data_url, timeout=timeout) as response:
        raw = response.read()
    return json.loads(raw.decode('utf-8'))

def build_main_table(data):
    """Turn the raw leaderboard payload into the table shown in the UI.

    Reads the rows from ``data['globalData']['OverallTable']``, adds an
    ``OpenSource`` Yes/No flag looked up from ``data['models']`` and a
    competition-style ``Rank`` (highest ``Average`` is rank 1, ties share the
    lowest rank), then keeps and renames only the display columns.

    Args:
        data: Parsed leaderboard JSON.

    Returns:
        A DataFrame with the display column names, in display order.
    """
    table = pd.DataFrame(data['globalData']['OverallTable'])
    release_info = data['models']

    def open_source_flag(model_name):
        # 'Yes' only when the model's release field is exactly 'OpenSource'.
        release = release_info[model_name]['release']
        return 'Yes' if release == 'OpenSource' else 'No'

    table['OpenSource'] = table['model'].apply(open_source_flag)

    ranks = table['Average'].rank(ascending=False, method='min')
    table['Rank'] = ranks.astype(int)

    # Source field -> display header; dict order is the column order.
    display_names = {
        'Rank': 'Rank',
        'model': 'Model',
        'org': 'Organization',
        'num': 'Parameters',
        'OpenSource': 'OpenSource',
        'Average': 'Average Score',
        'BBH': 'BBH',
        'Math-500': 'Math-500',
        'AIME': 'AIME',
        'MMLU-Pro': 'MMLU-Pro',
        'LiveCodeBench': 'LiveCodeBench',
        'HumanEval': 'HumanEval',
        'GQPA-Diamond': 'GQPA-Diamond',
        'IFEval': 'IFEval',
    }
    selected = table[list(display_names)]
    return selected.rename(columns=display_names)

def filter_table(df, size_ranges, model_types):
    """Return the rows of *df* matching the selected size and type filters.

    Args:
        df: Leaderboard table with at least ``Parameters`` (e.g. ``'7B'`` or
            ``'N/A'``) and ``OpenSource`` (``'Yes'``/``'No'``) columns.
        size_ranges: Selected size labels out of ``'<10B'``, ``'10B-70B'``,
            ``'>70B'`` and ``'Unknown'``. A row is kept when it matches any
            selected label. An empty/None selection disables size filtering.
            Note that ``'>70B'`` includes exactly 70 while ``'10B-70B'``
            excludes it.
        model_types: Selected type labels out of ``'API'`` (``OpenSource ==
            'No'``) and ``'OpenSource'`` (``OpenSource == 'Yes'``). An
            empty/None selection disables type filtering.

    Returns:
        A new DataFrame with the same columns as *df*; *df* is not modified.
    """
    def get_size_in_B(param):
        # Anything that is not "<number>B" is reported as NaN ("Unknown").
        if param == 'N/A':
            return float('nan')
        try:
            return float(param.replace('B', ''))
        except (AttributeError, TypeError, ValueError):
            # Non-string values (AttributeError/TypeError) or text that is
            # not a number (ValueError).
            return float('nan')

    keep = pd.Series(True, index=df.index)

    if size_ranges:
        # Sizes live in a standalone float Series rather than a temporary
        # column, so no real column can be clobbered and comparisons stay
        # numeric even when every size is unknown.
        sizes = df['Parameters'].apply(get_size_in_B).astype(float)
        known = sizes.notna()
        masks_by_label = {
            '<10B': known & (sizes < 10),
            '10B-70B': (sizes >= 10) & (sizes < 70),
            '>70B': sizes >= 70,
            'Unknown': ~known,
        }
        size_mask = pd.Series(False, index=df.index)
        for size_range in size_ranges:
            # Unrecognised labels are ignored.
            if size_range in masks_by_label:
                size_mask |= masks_by_label[size_range]
        keep &= size_mask

    if model_types:
        flags_by_type = {'API': 'No', 'OpenSource': 'Yes'}
        type_mask = pd.Series(False, index=df.index)
        for model_type in model_types:
            if model_type in flags_by_type:
                type_mask |= df['OpenSource'] == flags_by_type[model_type]
        keep &= type_mask

    # Copy so callers get an independent frame, never a view of *df*.
    return df[keep].copy()

def calculate_column_widths(df):
    """Estimate a pixel width for every column of *df*.

    Each width is derived from the longer of the header (10 px per character)
    and the longest stringified cell (8 px per character), plus 20 px of
    padding, and is then clamped to the range 160..400.

    Args:
        df: Table to size; may have zero rows.

    Returns:
        A list of plain Python ``int`` widths, one per column in column order.
    """
    column_widths = []
    for column in df.columns:
        header_length = len(str(column))
        cell_lengths = df[column].astype(str).map(len)
        # An empty column has no maximum (pandas yields NaN); treat it as 0
        # explicitly and convert to a built-in int so the result never
        # contains NumPy scalars.
        max_content_length = int(cell_lengths.max()) if len(cell_lengths) else 0
        width = max(header_length * 10, max_content_length * 8) + 20
        column_widths.append(max(160, min(400, width)))
    return column_widths

class DataState:
    """Holder for the most recently built, unfiltered leaderboard table."""

    def __init__(self):
        # None until a load succeeds; create_interface stores the DataFrame
        # returned by build_main_table here.
        self.current_df = None

# Single module-level instance read and written by the callbacks defined in
# create_interface.
data_state = DataState()

def create_interface():
    """Build the Gradio app for the CompassAcademic leaderboard.

    Fetches the latest data once while building (falling back to an empty
    table and an error title if that fails), then lays out the title, the
    size/type filters, the results table, a refresh button and the citation
    box, and wires up their event handlers.

    Returns:
        The assembled ``gr.Blocks`` application (not yet queued or launched).
    """
    empty_df = pd.DataFrame(columns=[
        'Rank', 'Model', 'Organization', 'Parameters', 'OpenSource',
        'Average Score', 'BBH', 'Math-500', 'AIME', 'MMLU-Pro',
        'LiveCodeBench', 'HumanEval', 'GQPA-Diamond', 'IFEval'
    ])

    def sort_by_score(df):
        # Best average first, as shown in the table.
        return df.sort_values("Average Score", ascending=False)

    def fetch_table(size_ranges, model_types):
        # Download the newest data, cache the unfiltered table in data_state,
        # and return (title markdown, filtered + sorted table).
        data_url, update_time = get_latest_data()
        new_df = build_main_table(load_data(data_url))
        data_state.current_df = new_df
        filtered_df = filter_table(new_df, size_ranges, model_types)
        return get_leaderboard_title(update_time), sort_by_score(filtered_df)

    def load_initial_data():
        # Runs once at build time with every filter option enabled.
        try:
            return fetch_table(MODEL_SIZE, MODEL_TYPE)
        except Exception as e:
            print(f"Error loading initial data: {e}")
            return "# CompassAcademic Leaderboard (Error loading data)", empty_df

    def refresh_data(size_ranges, model_types):
        # Re-download and re-apply the filters currently ticked in the UI so
        # the table stays consistent with the checkboxes after a refresh.
        try:
            return fetch_table(size_ranges, model_types)
        except Exception as e:
            print(f"Error refreshing data: {e}")
            # Leave the title and table as they are instead of returning
            # None, which would blank out both components.
            return gr.update(), gr.update()

    def update_table(size_ranges, model_types):
        # Re-filter the cached table; no network access.
        if data_state.current_df is None:
            return empty_df
        filtered_df = filter_table(data_state.current_df, size_ranges, model_types)
        return sort_by_score(filtered_df)

    initial_title, initial_data = load_initial_data()

    with gr.Blocks() as demo:
        title_comp = gr.Markdown(initial_title)

        with gr.Tabs():
            with gr.TabItem("🏅 Main Leaderboard", elem_id='main'):
                gr.Markdown(MAIN_LEADERBOARD_DESCRIPTION)

                with gr.Row():
                    with gr.Column():
                        size_filter = gr.CheckboxGroup(
                            choices=MODEL_SIZE,
                            value=MODEL_SIZE,
                            label='Model Size',
                            interactive=True,
                        )
                    with gr.Column():
                        type_filter = gr.CheckboxGroup(
                            choices=MODEL_TYPE,
                            value=MODEL_TYPE,
                            label='Model Type',
                            interactive=True,
                        )

                with gr.Column():
                    table = gr.DataFrame(
                        value=initial_data,
                        interactive=False,
                        wrap=False,
                        column_widths=calculate_column_widths(initial_data),
                    )

                refresh_button = gr.Button("Refresh Data")

                refresh_button.click(
                    fn=refresh_data,
                    inputs=[size_filter, type_filter],
                    outputs=[title_comp, table],
                )

                # Both filters trigger the same handler and always pass the
                # state of both checkbox groups.
                size_filter.change(
                    fn=update_table,
                    inputs=[size_filter, type_filter],
                    outputs=table,
                )

                type_filter.change(
                    fn=update_table,
                    inputs=[size_filter, type_filter],
                    outputs=table,
                )

        with gr.Row():
            with gr.Accordion("Citation", open=False):
                gr.Textbox(
                    value=CITATION_BUTTON_TEXT,
                    label=CITATION_BUTTON_LABEL,
                    elem_id='citation-button',
                    lines=6,  # show more rows by default
                    max_lines=8,  # cap the number of visible rows
                    show_copy_button=True  # copy button for convenience
                )

    return demo

if __name__ == '__main__':
    # Building the interface also performs the initial data fetch
    # (create_interface loads the data before laying out the UI).
    demo = create_interface()
    demo.queue()
    # NOTE(review): '0.0.0.0' presumably makes the server reachable on all
    # network interfaces — confirm this matches the deployment.
    demo.launch(server_name='0.0.0.0')