refactor: refactor the data loading part

app.py CHANGED
@@ -82,8 +82,8 @@ from typing import Optional
 @dataclass
 class LeaderboardDataStore:
     raw_data: Optional[list]
-    raw_qa_df: Optional[pd.DataFrame]
-    original_df_long_doc: Optional[pd.DataFrame]
+    raw_df_qa: Optional[pd.DataFrame]
+    raw_df_long_doc: Optional[pd.DataFrame]
     leaderboard_df_qa: Optional[pd.DataFrame]
     leaderboard_df_long_doc: Optional[pd.DataFrame]
     reranking_models: Optional[list]
@@ -91,41 +91,52 @@ class LeaderboardDataStore:
     types_long_doc: Optional[list]
 
 
+def load_leaderboard_data(file_path) -> LeaderboardDataStore:
+    lb_data_store = LeaderboardDataStore(None, None, None, None, None, None, None, None)
+    lb_data_store.raw_data = get_raw_eval_results(file_path)
+    print(f'raw data: {len(lb_data_store.raw_data)}')
+
+    lb_data_store.raw_df_qa = get_leaderboard_df(
+        lb_data_store.raw_data, task='qa', metric=DEFAULT_METRIC_QA)
+    lb_data_store.leaderboard_df_qa = lb_data_store.raw_df_qa.copy()
+    # leaderboard_df_qa = leaderboard_df_qa[has_no_nan_values(df, _benchmark_cols)]
+    print(f'QA data loaded: {lb_data_store.raw_df_qa.shape}')
+    shown_columns_qa, types_qa = get_default_cols(
+        'qa', lb_data_store.leaderboard_df_qa.columns, add_fix_cols=True)
+    lb_data_store.types_qa = types_qa
+    lb_data_store.leaderboard_df_qa = \
+        lb_data_store.leaderboard_df_qa[~lb_data_store.leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
+    lb_data_store.leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
+
+    lb_data_store.raw_df_long_doc = get_leaderboard_df(
+        lb_data_store.raw_data, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
+    print(f'Long-Doc data loaded: {len(lb_data_store.raw_df_long_doc)}')
+    lb_data_store.leaderboard_df_long_doc = lb_data_store.raw_df_long_doc.copy()
+    shown_columns_long_doc, types_long_doc = get_default_cols(
+        'long-doc', lb_data_store.leaderboard_df_long_doc.columns, add_fix_cols=True)
+    lb_data_store.types_long_doc = types_long_doc
+    lb_data_store.leaderboard_df_long_doc = \
+        lb_data_store.leaderboard_df_long_doc[~lb_data_store.leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][
+            shown_columns_long_doc]
+    lb_data_store.leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
+
+    lb_data_store.reranking_models = sorted(
+        list(frozenset([eval_result.reranking_model for eval_result in lb_data_store.raw_data])))
+    return lb_data_store
+
+
 def load_eval_results(file_path: str):
     output = {}
     versions = ("AIR-Bench_24.04",)
     for version in versions:
-        output[version] = LeaderboardDataStore(None, None, None, None, None, None, None, None)
-        output[version].raw_data = get_raw_eval_results(file_path)
-        output[version].raw_qa_df = get_leaderboard_df(
-            output[version].raw_data, task='qa', metric=DEFAULT_METRIC_QA)
-        output[version].original_df_long_doc = get_leaderboard_df(
-            output[version].raw_data, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
-        print(f'raw data: {len(output[version].raw_data)}')
-        print(f'QA data loaded: {output[version].raw_qa_df.shape}')
-        print(f'Long-Doc data loaded: {len(output[version].original_df_long_doc)}')
-
-        output[version].leaderboard_df_qa = output[version].raw_qa_df.copy()
-        # leaderboard_df_qa = leaderboard_df_qa[has_no_nan_values(df, _benchmark_cols)]
-        shown_columns_qa, types_qa = get_default_cols(
-            'qa', output[version].leaderboard_df_qa.columns, add_fix_cols=True)
-        output[version].types_qa = types_qa
-        output[version].leaderboard_df_qa = output[version].leaderboard_df_qa[~output[version].leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
-        output[version].leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
-
-        output[version].leaderboard_df_long_doc = output[version].original_df_long_doc.copy()
-        shown_columns_long_doc, types_long_doc = get_default_cols(
-            'long-doc', output[version].leaderboard_df_long_doc.columns, add_fix_cols=True)
-        output[version].types_long_doc = types_long_doc
-        output[version].leaderboard_df_long_doc = output[version].leaderboard_df_long_doc[~output[version].leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][shown_columns_long_doc]
-        output[version].leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
-
-        output[version].reranking_models = sorted(list(frozenset([eval_result.reranking_model for eval_result in output[version].raw_data])))
+        fn = f"{file_path}/{version}"
+        output[version] = load_leaderboard_data(fn)
     return output
 
 
 data = load_eval_results(EVAL_RESULTS_PATH)
 
+
 def update_metric_qa(
     metric: str,
     domains: list,
@@ -133,9 +144,11 @@ def update_metric_qa(
     reranking_model: list,
     query: str,
     show_anonymous: bool,
-    show_revision_and_timestamp,
+    show_revision_and_timestamp: bool,
+    selected_version: str,
 ):
-    return update_metric(data["AIR-Bench_24.04"].raw_data, 'qa', metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
+    return update_metric(data[selected_version].raw_data, 'qa', metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
+
 
 def update_metric_long_doc(
     metric: str,
@@ -188,7 +201,7 @@ with demo:
     selected_rerankings = get_reranking_dropdown(data["AIR-Bench_24.04"].reranking_models)
     leaderboard_table = get_leaderboard_table(data["AIR-Bench_24.04"].leaderboard_df_qa, data["AIR-Bench_24.04"].types_qa)
     # Dummy leaderboard for handling the case when the user uses backspace key
-    hidden_leaderboard_table_for_search = get_leaderboard_table(data["AIR-Bench_24.04"].raw_qa_df, data["AIR-Bench_24.04"].types_qa, visible=False)
+    hidden_leaderboard_table_for_search = get_leaderboard_table(data["AIR-Bench_24.04"].raw_df_qa, data["AIR-Bench_24.04"].types_qa, visible=False)
 
     set_listeners(
         "qa",
@@ -213,10 +226,13 @@ with demo:
             search_bar,
             show_anonymous,
             show_revision_and_timestamp,
+            selected_version,
         ],
         leaderboard_table,
         queue=True
     )
+
+    """
     with gr.TabItem("Retrieval Only", id=11):
         with gr.Row():
             with gr.Column(scale=1):
@@ -227,7 +243,7 @@ with demo:
             lb_df_retriever = reset_rank(lb_df_retriever)
             lb_table_retriever = get_leaderboard_table(lb_df_retriever, data["AIR-Bench_24.04"].types_qa)
             # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_lb_df_retriever = data["AIR-Bench_24.04"].raw_qa_df[data["AIR-Bench_24.04"].raw_qa_df[COL_NAME_RERANKING_MODEL] == "NoReranker"]
+            hidden_lb_df_retriever = data["AIR-Bench_24.04"].raw_df_qa[data["AIR-Bench_24.04"].raw_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
            hidden_lb_df_retriever = reset_rank(hidden_lb_df_retriever)
            hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_df_retriever, data["AIR-Bench_24.04"].types_qa, visible=False)
 
@@ -254,6 +270,7 @@ with demo:
             search_bar_retriever,
             show_anonymous,
             show_revision_and_timestamp,
+            selected_version,
         ],
         lb_table_retriever,
         queue=True
@@ -268,7 +285,7 @@ with demo:
             with gr.Column(scale=1):
                 search_bar_reranker = gr.Textbox(show_label=False, visible=False)
             lb_table_reranker = get_leaderboard_table(lb_df_reranker, data["AIR-Bench_24.04"].types_qa)
-            hidden_lb_df_reranker = data["AIR-Bench_24.04"].raw_qa_df[data["AIR-Bench_24.04"].raw_qa_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
+            hidden_lb_df_reranker = data["AIR-Bench_24.04"].raw_df_qa[data["AIR-Bench_24.04"].raw_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
             hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
             hidden_lb_table_reranker = get_leaderboard_table(
                 hidden_lb_df_reranker, data["AIR-Bench_24.04"].types_qa, visible=False
@@ -296,6 +313,7 @@ with demo:
             search_bar_reranker,
             show_anonymous,
             show_revision_and_timestamp,
+            selected_version,
         ],
         lb_table_reranker,
         queue=True
@@ -334,7 +352,7 @@ with demo:
 
     # Dummy leaderboard for handling the case when the user uses backspace key
     hidden_lb_table_for_search = get_leaderboard_table(
-        data["AIR-Bench_24.04"].original_df_long_doc, data["AIR-Bench_24.04"].types_long_doc, visible=False
+        data["AIR-Bench_24.04"].raw_df_long_doc, data["AIR-Bench_24.04"].types_long_doc, visible=False
     )
 
     set_listeners(
@@ -374,8 +392,8 @@ with demo:
         data["AIR-Bench_24.04"].leaderboard_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
     ]
     lb_df_retriever_long_doc = reset_rank(lb_df_retriever_long_doc)
-    hidden_lb_db_retriever_long_doc = data["AIR-Bench_24.04"].original_df_long_doc[
-        data["AIR-Bench_24.04"].original_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
+    hidden_lb_db_retriever_long_doc = data["AIR-Bench_24.04"].raw_df_long_doc[
+        data["AIR-Bench_24.04"].raw_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
     ]
     hidden_lb_db_retriever_long_doc = reset_rank(hidden_lb_db_retriever_long_doc)
     lb_table_retriever_long_doc = get_leaderboard_table(
@@ -422,7 +440,7 @@ with demo:
             with gr.Column(scale=1):
                 search_bar_reranker_ldoc = gr.Textbox(show_label=False, visible=False)
             lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, data["AIR-Bench_24.04"].types_long_doc)
-            hidden_lb_df_reranker_ldoc = data["AIR-Bench_24.04"].original_df_long_doc[data["AIR-Bench_24.04"].original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
+            hidden_lb_df_reranker_ldoc = data["AIR-Bench_24.04"].raw_df_long_doc[data["AIR-Bench_24.04"].raw_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
             hidden_lb_df_reranker_ldoc = reset_rank(hidden_lb_df_reranker_ldoc)
             hidden_lb_table_reranker_ldoc = get_leaderboard_table(
                 hidden_lb_df_reranker_ldoc, data["AIR-Bench_24.04"].types_long_doc, visible=False
@@ -521,6 +539,7 @@ with demo:
 
     with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
         gr.Markdown(BENCHMARKS_TEXT, elem_classes="markdown-text")
+    """
 
 if __name__ == "__main__":
     scheduler = BackgroundScheduler()