Update app.py
Browse files
app.py
CHANGED
@@ -1,9 +1,12 @@
|
|
1 |
import gradio as gr
|
2 |
import json
|
3 |
import pandas as pd
|
|
|
|
|
4 |
from urllib.request import urlopen, URLError
|
5 |
import re
|
6 |
from datetime import datetime
|
|
|
7 |
|
8 |
# Constants
|
9 |
CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
|
@@ -11,16 +14,23 @@ CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
|
|
11 |
author={OpenCompass Contributors},
|
12 |
howpublished = {\url{https://github.com/open-compass/opencompass}},
|
13 |
year={2023}
|
|
|
14 |
}"""
|
15 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
16 |
-
|
17 |
-
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
19 |
DATA_URL_BASE = "http://opencompass.oss-cn-shanghai.aliyuncs.com/assets/research-rank/research-data.REALTIME."
|
20 |
|
21 |
def find_latest_data_url():
|
22 |
"""Find the latest available data URL by trying different dates."""
|
23 |
today = datetime.now()
|
|
|
24 |
for i in range(365):
|
25 |
date = today.replace(day=today.day - i)
|
26 |
date_str = date.strftime("%Y%m%d")
|
@@ -30,6 +40,7 @@ def find_latest_data_url():
|
|
30 |
return url, date_str
|
31 |
except URLError:
|
32 |
continue
|
|
|
33 |
return None, None
|
34 |
|
35 |
def get_latest_data():
|
@@ -40,6 +51,7 @@ def get_latest_data():
|
|
40 |
formatted_update_time = datetime.strptime(update_time, "%Y%m%d").strftime("%Y-%m-%d")
|
41 |
return data_url, formatted_update_time
|
42 |
|
|
|
43 |
def get_leaderboard_title(update_time):
|
44 |
return f"# CompassAcademic Leaderboard (Last Updated: {update_time})"
|
45 |
|
@@ -50,36 +62,72 @@ The CompassAcademic currently focuses on the comprehensive reasoning abilities o
|
|
50 |
- Prompts and reproduction scripts can be found in [**OpenCompass**: A Toolkit for Evaluation of LLMs](https://github.com/open-compass/opencompass)🏆.
|
51 |
"""
|
52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
MODEL_SIZE = ['<10B', '10B-70B', '>70B', 'Unknown']
|
54 |
MODEL_TYPE = ['API', 'OpenSource']
|
55 |
|
|
|
56 |
def load_data(data_url):
|
57 |
response = urlopen(data_url)
|
58 |
data = json.loads(response.read().decode('utf-8'))
|
59 |
return data
|
60 |
|
|
|
61 |
def build_main_table(data):
|
62 |
df = pd.DataFrame(data['globalData']['OverallTable'])
|
|
|
|
|
63 |
models_data = data['models']
|
64 |
df['OpenSource'] = df['model'].apply(
|
65 |
lambda x: 'Yes' if models_data[x]['release'] == 'OpenSource' else 'No'
|
66 |
)
|
|
|
|
|
67 |
df['Rank'] = df['Average'].rank(ascending=False, method='min').astype(int)
|
68 |
-
|
69 |
columns = {
|
70 |
-
'Rank': 'Rank',
|
71 |
-
'
|
72 |
-
'
|
73 |
-
'
|
74 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
}
|
76 |
df = df[list(columns.keys())].rename(columns=columns)
|
77 |
return df
|
78 |
|
|
|
79 |
def filter_table(df, size_ranges, model_types):
|
80 |
filtered_df = df.copy()
|
81 |
-
|
|
|
82 |
if size_ranges:
|
|
|
83 |
def get_size_in_B(param):
|
84 |
if param == 'N/A':
|
85 |
return None
|
@@ -87,23 +135,30 @@ def filter_table(df, size_ranges, model_types):
|
|
87 |
return float(param.replace('B', ''))
|
88 |
except:
|
89 |
return None
|
90 |
-
|
91 |
-
filtered_df['size_in_B'] = filtered_df['Parameters'].apply(
|
|
|
|
|
|
|
92 |
mask = pd.Series(False, index=filtered_df.index)
|
93 |
-
|
94 |
for size_range in size_ranges:
|
95 |
if size_range == '<10B':
|
96 |
-
mask |= (filtered_df['size_in_B'] < 10) & (
|
|
|
|
|
97 |
elif size_range == '10B-70B':
|
98 |
-
mask |= (filtered_df['size_in_B'] >= 10) & (
|
|
|
|
|
99 |
elif size_range == '>70B':
|
100 |
mask |= filtered_df['size_in_B'] >= 70
|
101 |
elif size_range == 'Unknown':
|
102 |
mask |= filtered_df['size_in_B'].isna()
|
103 |
-
|
104 |
filtered_df = filtered_df[mask]
|
105 |
filtered_df.drop('size_in_B', axis=1, inplace=True)
|
106 |
-
|
|
|
107 |
if model_types:
|
108 |
type_mask = pd.Series(False, index=filtered_df.index)
|
109 |
for model_type in model_types:
|
@@ -112,79 +167,49 @@ def filter_table(df, size_ranges, model_types):
|
|
112 |
elif model_type == 'OpenSource':
|
113 |
type_mask |= filtered_df['OpenSource'] == 'Yes'
|
114 |
filtered_df = filtered_df[type_mask]
|
115 |
-
|
116 |
return filtered_df
|
117 |
|
|
|
118 |
def calculate_column_widths(df):
|
|
|
119 |
column_widths = []
|
|
|
120 |
for column in df.columns:
|
|
|
121 |
header_length = len(str(column))
|
122 |
max_content_length = df[column].astype(str).map(len).max()
|
|
|
|
|
|
|
|
|
|
|
123 |
width = max(header_length * 10, max_content_length * 8) + 20
|
124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
column_widths.append(width)
|
126 |
-
return column_widths
|
127 |
|
128 |
-
|
129 |
-
def __init__(self):
|
130 |
-
self.current_df = None
|
131 |
|
132 |
-
data_state = DataState()
|
133 |
|
134 |
def create_interface():
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
])
|
140 |
|
141 |
-
def load_initial_data():
|
142 |
-
try:
|
143 |
-
data_url, update_time = get_latest_data()
|
144 |
-
data = load_data(data_url)
|
145 |
-
new_df = build_main_table(data)
|
146 |
-
data_state.current_df = new_df
|
147 |
-
filtered_df = filter_table(new_df, MODEL_SIZE, MODEL_TYPE)
|
148 |
-
return get_leaderboard_title(update_time), filtered_df.sort_values("Average Score", ascending=False)
|
149 |
-
except Exception as e:
|
150 |
-
print(f"Error loading initial data: {e}")
|
151 |
-
return "# CompassAcademic Leaderboard (Error loading data)", empty_df
|
152 |
-
|
153 |
-
def refresh_data():
|
154 |
-
try:
|
155 |
-
data_url, update_time = get_latest_data()
|
156 |
-
data = load_data(data_url)
|
157 |
-
new_df = build_main_table(data)
|
158 |
-
data_state.current_df = new_df
|
159 |
-
filtered_df = filter_table(new_df, MODEL_SIZE, MODEL_TYPE)
|
160 |
-
return get_leaderboard_title(update_time), filtered_df.sort_values("Average Score", ascending=False)
|
161 |
-
except Exception as e:
|
162 |
-
print(f"Error refreshing data: {e}")
|
163 |
-
return None, None
|
164 |
-
|
165 |
-
def auto_refresh():
|
166 |
-
"""Single refresh function for automatic updates"""
|
167 |
-
title, data = refresh_data()
|
168 |
-
status = f"Last auto update: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
|
169 |
-
if title and data is not None:
|
170 |
-
return title, data, status
|
171 |
-
return None, None, None
|
172 |
-
|
173 |
-
def update_table(size_ranges, model_types):
|
174 |
-
if data_state.current_df is None:
|
175 |
-
return empty_df
|
176 |
-
filtered_df = filter_table(data_state.current_df, size_ranges, model_types)
|
177 |
-
return filtered_df.sort_values("Average Score", ascending=False)
|
178 |
-
|
179 |
-
initial_title, initial_data = load_initial_data()
|
180 |
-
|
181 |
with gr.Blocks() as demo:
|
182 |
-
title_comp = gr.Markdown(
|
183 |
-
|
184 |
with gr.Tabs() as tabs:
|
185 |
with gr.TabItem("🏅 Main Leaderboard", elem_id='main'):
|
186 |
gr.Markdown(MAIN_LEADERBOARD_DESCRIPTION)
|
187 |
-
|
188 |
with gr.Row():
|
189 |
with gr.Column():
|
190 |
size_filter = gr.CheckboxGroup(
|
@@ -200,47 +225,52 @@ def create_interface():
|
|
200 |
label='Model Type',
|
201 |
interactive=True,
|
202 |
)
|
203 |
-
|
204 |
with gr.Column():
|
205 |
table = gr.DataFrame(
|
206 |
-
value=
|
207 |
interactive=False,
|
208 |
-
wrap=False,
|
209 |
-
column_widths=calculate_column_widths(
|
210 |
)
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
|
|
|
|
232 |
size_filter.change(
|
233 |
fn=update_table,
|
234 |
inputs=[size_filter, type_filter],
|
235 |
outputs=table,
|
236 |
)
|
237 |
-
|
238 |
type_filter.change(
|
239 |
fn=update_table,
|
240 |
inputs=[size_filter, type_filter],
|
241 |
outputs=table,
|
242 |
)
|
243 |
|
|
|
|
|
|
|
244 |
with gr.Row():
|
245 |
with gr.Accordion("Citation", open=False):
|
246 |
citation_button = gr.Textbox(
|
@@ -251,7 +281,7 @@ def create_interface():
|
|
251 |
|
252 |
return demo
|
253 |
|
|
|
254 |
if __name__ == '__main__':
|
255 |
demo = create_interface()
|
256 |
-
demo.
|
257 |
-
demo.launch(server_name='0.0.0.0')
|
|
|
1 |
import gradio as gr
|
2 |
import json
|
3 |
import pandas as pd
|
4 |
+
from collections import defaultdict
|
5 |
+
import copy as cp
|
6 |
from urllib.request import urlopen, URLError
|
7 |
import re
|
8 |
from datetime import datetime
|
9 |
+
import time
|
10 |
|
11 |
# Constants
|
12 |
CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
|
|
|
14 |
author={OpenCompass Contributors},
|
15 |
howpublished = {\url{https://github.com/open-compass/opencompass}},
|
16 |
year={2023}
|
17 |
+
},
|
18 |
}"""
|
19 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
20 |
+
OPENCOMPASS_README = (
|
21 |
+
'https://raw.githubusercontent.com/open-compass/opencompass/main/README.md'
|
22 |
+
)
|
23 |
+
GITHUB_REPO = 'https://github.com/open-compass/opencompass'
|
24 |
+
GITHUB_RAW = 'https://raw.githubusercontent.com/open-compass/opencompass'
|
25 |
+
GITHUB_BLOB = 'https://github.com/open-compass/opencompass/blob'
|
26 |
+
|
27 |
+
# Base URL for the JSON data
|
28 |
DATA_URL_BASE = "http://opencompass.oss-cn-shanghai.aliyuncs.com/assets/research-rank/research-data.REALTIME."
|
29 |
|
30 |
def find_latest_data_url():
|
31 |
"""Find the latest available data URL by trying different dates."""
|
32 |
today = datetime.now()
|
33 |
+
# Try last 365 days
|
34 |
for i in range(365):
|
35 |
date = today.replace(day=today.day - i)
|
36 |
date_str = date.strftime("%Y%m%d")
|
|
|
40 |
return url, date_str
|
41 |
except URLError:
|
42 |
continue
|
43 |
+
# If no valid URL found, return None
|
44 |
return None, None
|
45 |
|
46 |
def get_latest_data():
|
|
|
51 |
formatted_update_time = datetime.strptime(update_time, "%Y%m%d").strftime("%Y-%m-%d")
|
52 |
return data_url, formatted_update_time
|
53 |
|
54 |
+
# Markdown content
|
55 |
def get_leaderboard_title(update_time):
    """Return the Markdown H1 heading shown above the leaderboard.

    Args:
        update_time: Text to display after "Last Updated:". It is
            interpolated as-is, with no parsing or validation.

    Returns:
        The heading string, e.g.
        ``# CompassAcademic Leaderboard (Last Updated: 2024-01-01)``.
    """
    return f"# CompassAcademic Leaderboard (Last Updated: {update_time})"
|
57 |
|
|
|
62 |
- Prompts and reproduction scripts can be found in [**OpenCompass**: A Toolkit for Evaluation of LLMs](https://github.com/open-compass/opencompass)🏆.
|
63 |
"""
|
64 |
|
65 |
+
def fix_image_urls(content):
    """Rewrite repository-relative image paths in markdown to absolute URLs.

    Two rewrites are applied, in order:

    1. The project logo path ``docs/en/_static/image/logo.svg`` is made
       absolute wherever it appears as a standalone relative path (not only
       inside markdown image syntax).
    2. Every markdown image ``![alt](path)`` whose target is not already an
       ``http://`` / ``https://`` URL is pointed at the raw GitHub content
       of the ``main`` branch. The alt text is preserved.

    Args:
        content: Markdown text to process.

    Returns:
        The markdown text with image targets made absolute.
    """
    raw_base = 'https://raw.githubusercontent.com/open-compass/opencompass/main'

    # Handle the specific logo.svg path. The lookbehind skips occurrences that
    # are the tail of a longer path or of an already-absolute URL, so running
    # this on already-fixed content does not double-prefix it.
    content = re.sub(
        r'(?<![\w./-])docs/en/_static/image/logo\.svg',
        f'{raw_base}/docs/en/_static/image/logo.svg',
        content,
    )

    def _absolutize(match):
        # group(1) is the alt text, group(2) the relative target path.
        return f'![{match.group(1)}]({raw_base}/{match.group(2)})'

    # Replace other relative image paths with absolute GitHub URLs. Only real
    # http(s) URLs are left alone; a relative path that merely starts with
    # the letters "http" is still rewritten.
    content = re.sub(
        r'!\[([^\]]*)\]\((?!https?://)([^)]+)\)',
        _absolutize,
        content,
    )

    return content
|
81 |
+
|
82 |
+
|
83 |
MODEL_SIZE = ['<10B', '10B-70B', '>70B', 'Unknown']
|
84 |
MODEL_TYPE = ['API', 'OpenSource']
|
85 |
|
86 |
+
|
87 |
def load_data(data_url, timeout=30):
    """Fetch ``data_url`` and return its body parsed as JSON.

    Args:
        data_url: URL of a UTF-8 encoded JSON document.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot hang the caller indefinitely.

    Returns:
        The decoded JSON value (whatever ``json.loads`` produces).

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
        json.JSONDecodeError: If the body is not valid JSON.
    """
    # The context manager closes the response even if reading fails.
    with urlopen(data_url, timeout=timeout) as response:
        payload = response.read()
    return json.loads(payload.decode('utf-8'))
|
91 |
|
92 |
+
|
93 |
def build_main_table(data):
    """Build the leaderboard DataFrame from the parsed JSON payload.

    Args:
        data: Mapping that provides ``data['globalData']['OverallTable']``
            (the table rows) and ``data['models']`` (per-model metadata
            keyed by model name, each entry having a ``'release'`` field).

    Returns:
        A DataFrame restricted to the display columns listed below, renamed
        to their display names, in the listed order. Row order is unchanged.

    Raises:
        KeyError: If an expected key or column is missing, or a model in the
            table has no entry in ``data['models']``.
    """
    df = pd.DataFrame(data['globalData']['OverallTable'])

    # Add OpenSource column based on models data
    models_data = data['models']
    df['OpenSource'] = df['model'].apply(
        lambda name: 'Yes' if models_data[name]['release'] == 'OpenSource' else 'No'
    )

    # Add Rank column based on Average Score: the highest average is rank 1
    # and ties share the lowest rank of their group (method='min').
    df['Rank'] = df['Average'].rank(ascending=False, method='min').astype(int)

    # Source column -> display name; dict order defines the column order.
    # NOTE(review): 'GQPA-Diamond' is kept exactly as written because it has
    # to match the upstream field name — confirm it is not meant to be 'GPQA'.
    columns = {
        'Rank': 'Rank',
        'model': 'Model',
        'org': 'Organization',
        'num': 'Parameters',
        'OpenSource': 'OpenSource',
        'Average': 'Average Score',
        'BBH': 'BBH',
        'Math-500': 'Math-500',
        'AIME': 'AIME',
        'MMLU-Pro': 'MMLU-Pro',
        'LiveCodeBench': 'LiveCodeBench',
        'HumanEval': 'HumanEval',
        'GQPA-Diamond': 'GQPA-Diamond',
        'IFEval': 'IFEval',
    }
    df = df[list(columns)].rename(columns=columns)
    return df
|
123 |
|
124 |
+
|
125 |
def filter_table(df, size_ranges, model_types):
|
126 |
filtered_df = df.copy()
|
127 |
+
|
128 |
+
# Filter by size
|
129 |
if size_ranges:
|
130 |
+
|
131 |
def get_size_in_B(param):
|
132 |
if param == 'N/A':
|
133 |
return None
|
|
|
135 |
return float(param.replace('B', ''))
|
136 |
except:
|
137 |
return None
|
138 |
+
|
139 |
+
filtered_df['size_in_B'] = filtered_df['Parameters'].apply(
|
140 |
+
get_size_in_B
|
141 |
+
)
|
142 |
+
|
143 |
mask = pd.Series(False, index=filtered_df.index)
|
|
|
144 |
for size_range in size_ranges:
|
145 |
if size_range == '<10B':
|
146 |
+
mask |= (filtered_df['size_in_B'] < 10) & (
|
147 |
+
filtered_df['size_in_B'].notna()
|
148 |
+
)
|
149 |
elif size_range == '10B-70B':
|
150 |
+
mask |= (filtered_df['size_in_B'] >= 10) & (
|
151 |
+
filtered_df['size_in_B'] < 70
|
152 |
+
)
|
153 |
elif size_range == '>70B':
|
154 |
mask |= filtered_df['size_in_B'] >= 70
|
155 |
elif size_range == 'Unknown':
|
156 |
mask |= filtered_df['size_in_B'].isna()
|
157 |
+
|
158 |
filtered_df = filtered_df[mask]
|
159 |
filtered_df.drop('size_in_B', axis=1, inplace=True)
|
160 |
+
|
161 |
+
# Filter by model type
|
162 |
if model_types:
|
163 |
type_mask = pd.Series(False, index=filtered_df.index)
|
164 |
for model_type in model_types:
|
|
|
167 |
elif model_type == 'OpenSource':
|
168 |
type_mask |= filtered_df['OpenSource'] == 'Yes'
|
169 |
filtered_df = filtered_df[type_mask]
|
170 |
+
|
171 |
return filtered_df
|
172 |
|
173 |
+
|
174 |
def calculate_column_widths(df, min_width=160, max_width=400):
    """Estimate a pixel width for every column of ``df``.

    For each column the width is
    ``max(header_chars * 10, longest_cell_chars * 8) + 20``, clamped to
    ``[min_width, max_width]``. Cell lengths are measured on the ``str()``
    form of each value.

    Args:
        df: DataFrame whose columns are measured.
        min_width: Lower bound for each width, in pixels.
        max_width: Upper bound for each width, in pixels; prevents extremely
            wide columns.

    Returns:
        A list of plain Python ``int`` widths, one per column, in column
        order.
    """
    column_widths = []

    for column in df.columns:
        header_length = len(str(column))

        # A column with no rows has no "longest cell"; count it as 0 rather
        # than letting NaN leak into the arithmetic.
        content_lengths = df[column].astype(str).map(len)
        max_content_length = 0 if content_lengths.empty else int(content_lengths.max())

        # Headers get a larger per-character multiplier (10 vs. 8) so the
        # column name always fits; 20 px is added as padding.
        width = max(header_length * 10, max_content_length * 8) + 20

        # Clamp to the allowed range and normalise to a built-in int so the
        # result does not carry numpy scalar types.
        column_widths.append(int(min(max_width, max(min_width, width))))

    return column_widths
|
|
|
|
|
198 |
|
|
|
199 |
|
200 |
def create_interface():
|
201 |
+
data_url, update_time = get_latest_data()
|
202 |
+
data = load_data(data_url)
|
203 |
+
df = build_main_table(data)
|
204 |
+
title = gr.Markdown(get_leaderboard_title(update_time))
|
|
|
205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
206 |
with gr.Blocks() as demo:
|
207 |
+
title_comp = gr.Markdown(get_leaderboard_title(update_time))
|
208 |
+
|
209 |
with gr.Tabs() as tabs:
|
210 |
with gr.TabItem("🏅 Main Leaderboard", elem_id='main'):
|
211 |
gr.Markdown(MAIN_LEADERBOARD_DESCRIPTION)
|
212 |
+
|
213 |
with gr.Row():
|
214 |
with gr.Column():
|
215 |
size_filter = gr.CheckboxGroup(
|
|
|
225 |
label='Model Type',
|
226 |
interactive=True,
|
227 |
)
|
228 |
+
|
229 |
with gr.Column():
|
230 |
table = gr.DataFrame(
|
231 |
+
value=df.sort_values("Average Score", ascending=False),
|
232 |
interactive=False,
|
233 |
+
wrap=False, # 禁用自动换行
|
234 |
+
column_widths=calculate_column_widths(df),
|
235 |
)
|
236 |
+
|
237 |
+
def update_data():
|
238 |
+
"""Periodically check for new data and update the interface"""
|
239 |
+
while True:
|
240 |
+
time.sleep(300) # Check every 5 minutes
|
241 |
+
try:
|
242 |
+
new_data_url, new_update_time = get_latest_data()
|
243 |
+
if new_data_url != data_url:
|
244 |
+
new_data = load_data(new_data_url)
|
245 |
+
new_df = build_main_table(new_data)
|
246 |
+
filtered_df = filter_table(new_df, size_filter.value, type_filter.value)
|
247 |
+
title_comp.value = get_leaderboard_title(new_update_time)
|
248 |
+
table.value = filtered_df.sort_values("Average Score", ascending=False)
|
249 |
+
except Exception as e:
|
250 |
+
print(f"Error updating data: {e}")
|
251 |
+
continue
|
252 |
+
|
253 |
+
def update_table(size_ranges, model_types):
|
254 |
+
filtered_df = filter_table(df, size_ranges, model_types)
|
255 |
+
return filtered_df.sort_values(
|
256 |
+
"Average Score", ascending=False
|
257 |
+
)
|
258 |
+
|
259 |
size_filter.change(
|
260 |
fn=update_table,
|
261 |
inputs=[size_filter, type_filter],
|
262 |
outputs=table,
|
263 |
)
|
264 |
+
|
265 |
type_filter.change(
|
266 |
fn=update_table,
|
267 |
inputs=[size_filter, type_filter],
|
268 |
outputs=table,
|
269 |
)
|
270 |
|
271 |
+
# Set up periodic data update
|
272 |
+
demo.load(update_data)
|
273 |
+
|
274 |
with gr.Row():
|
275 |
with gr.Accordion("Citation", open=False):
|
276 |
citation_button = gr.Textbox(
|
|
|
281 |
|
282 |
return demo
|
283 |
|
284 |
+
|
285 |
if __name__ == '__main__':
|
286 |
demo = create_interface()
|
287 |
+
demo.launch(server_name='0.0.0.0')
|
|