feat: add versioning for the long-doc
- app.py +25 -20
- src/benchmarks.py +2 -0
- src/loaders.py +11 -11
- src/utils.py +9 -44
app.py
CHANGED
@@ -69,16 +69,16 @@ def update_metric_qa(
     return update_metric(datastore, 'qa', metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
 
 
-
-
-
-
-
-
-
-
-
-
+def update_metric_long_doc(
+        metric: str,
+        domains: list,
+        langs: list,
+        reranking_model: list,
+        query: str,
+        show_anonymous: bool,
+        show_revision_and_timestamp,
+):
+    return update_metric(datastore, "long-doc", metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
 
 
 def update_datastore(version):
@@ -274,18 +274,15 @@ with demo:
                     lb_table_reranker,
                     queue=True
                 )
-            """
             with gr.TabItem("Long Doc", elem_id="long-doc-benchmark-tab-table", id=1):
                 with gr.Row():
                     with gr.Column(min_width=320):
                         # select domain
                         with gr.Row():
-                            selected_domains = get_domain_dropdown(
+                            selected_domains = get_domain_dropdown(LongDocBenchmarks[datastore.slug])
                         # select language
                         with gr.Row():
-                            selected_langs = get_language_dropdown(
-                                LANG_COLS_LONG_DOC, LANG_COLS_LONG_DOC
-                            )
+                            selected_langs = get_language_dropdown(LongDocBenchmarks[datastore.slug])
                     with gr.Column():
                         # select the metric
                         with gr.Row():
@@ -301,22 +298,29 @@ with demo:
                            search_bar = get_search_bar()
                        # select reranking model
                        with gr.Column():
-                           selected_rerankings = get_reranking_dropdown(
+                           selected_rerankings = get_reranking_dropdown(datastore.reranking_models)
 
                lb_table = get_leaderboard_table(
-
+                    datastore.leaderboard_df_long_doc, datastore.types_long_doc
                )
 
                # Dummy leaderboard for handling the case when the user uses backspace key
-
-
+                hidden_lb_table = get_leaderboard_table(
+                    datastore.raw_df_long_doc, datastore.types_long_doc, visible=False
+                )
+
+                selected_version.change(
+                    update_datastore,
+                    [selected_version,],
+                    [selected_domains, selected_langs, selected_rerankings, lb_table, hidden_lb_table]
                )
 
                set_listeners(
                    "long-doc",
                    lb_table,
-
+                    hidden_lb_table,
                    search_bar,
+                    selected_version,
                    selected_domains,
                    selected_langs,
                    selected_rerankings,
@@ -339,6 +343,7 @@ with demo:
                    lb_table,
                    queue=True
                )
+            """
            with gr.TabItem("Retrieval Only", id=21):
                with gr.Row():
                    with gr.Column(scale=1):
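The core of this change is that the Long Doc tab now reads everything from the versioned datastore, and selected_version.change(update_datastore, ...) refreshes the tab's dropdowns and tables when the user picks a different benchmark version. The snippet below is a minimal, self-contained sketch of that wiring pattern only, not the app's actual code: the DATASTORES dict, make_long_doc_df, and the component labels are invented placeholders standing in for the leaderboard's datastore and helper functions.

    # Sketch of a version dropdown whose .change event reloads a datastore and
    # pushes fresh choices/data into the dependent long-doc components.
    import gradio as gr
    import pandas as pd

    # Placeholder version metadata; the real app loads this from its datastore files.
    DATASTORES = {
        "v24.04": {"domains": ["law", "arxiv"], "langs": ["en", "zh"]},
        "v24.05": {"domains": ["law", "arxiv", "news"], "langs": ["en"]},
    }

    def make_long_doc_df(version: str) -> pd.DataFrame:
        # Placeholder for get_leaderboard_df(datastore, task="long-doc", ...).
        return pd.DataFrame({"Model": ["bge-m3"], "Version": [version], "Avg nDCG@10": [0.61]})

    def update_datastore(version: str):
        ds = DATASTORES[version]
        df = make_long_doc_df(version)
        # One return value per output component registered below.
        return (
            gr.update(choices=ds["domains"], value=ds["domains"]),
            gr.update(choices=ds["langs"], value=ds["langs"]),
            df,
        )

    with gr.Blocks() as demo:
        selected_version = gr.Dropdown(list(DATASTORES), value="v24.05", label="Version")
        selected_domains = gr.CheckboxGroup(label="Domains")
        selected_langs = gr.CheckboxGroup(label="Languages")
        lb_table = gr.Dataframe(make_long_doc_df("v24.05"))

        # Mirrors selected_version.change(update_datastore, [selected_version,], [...]) in app.py.
        selected_version.change(
            update_datastore,
            inputs=[selected_version],
            outputs=[selected_domains, selected_langs, lb_table],
        )

    if __name__ == "__main__":
        demo.launch()

Returning gr.update(...) for the dropdowns and a plain DataFrame for the table is how one change handler can refresh several components at once, which is the shape the new selected_version.change call takes in the diff above.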
src/benchmarks.py
CHANGED
@@ -51,6 +51,8 @@ def get_benchmarks_enum(benchmark_version, task_type):
                 benchmark_name = f"{domain}_{lang}_{dataset}"
                 benchmark_name = get_safe_name(benchmark_name)
                 col_name = benchmark_name
+                if "test" not in dataset_list[dataset]["splits"]:
+                    continue
                 for metric in METRIC_LIST:
                     benchmark_dict[benchmark_name] = \
                         Benchmark(benchmark_name, metric, col_name, domain, lang, task)
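The two added lines guard benchmark registration: a dataset whose metadata does not list a "test" split is skipped before any Benchmark entries are created for it. A toy illustration of the effect, with a hypothetical dataset_list (the real metadata comes from the benchmark tables the surrounding loop iterates over):

    # Datasets without a "test" split never reach the registration loop.
    dataset_list = {
        "gov_report":  {"splits": ["dev", "test"]},
        "pubmed_long": {"splits": ["dev"]},   # no test split -> skipped
    }
    METRIC_LIST = ["ndcg_at_10", "recall_at_10"]

    registered = {}
    for dataset, meta in dataset_list.items():
        if "test" not in meta["splits"]:
            continue                          # the added check
        for metric in METRIC_LIST:
            registered[f"{dataset}_{metric}"] = (dataset, metric)

    print(sorted(registered))   # only gov_report_* keys appear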
src/loaders.py
CHANGED
@@ -76,17 +76,17 @@ def load_leaderboard_datastore(file_path, version) -> LeaderboardDataStore:
         lb_data_store.leaderboard_df_qa[~lb_data_store.leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
     lb_data_store.leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
 
-
-
-
-
-
-
-
-
-
-
-
+    lb_data_store.raw_df_long_doc = get_leaderboard_df(
+        lb_data_store, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
+    print(f'Long-Doc data loaded: {len(lb_data_store.raw_df_long_doc)}')
+    lb_data_store.leaderboard_df_long_doc = lb_data_store.raw_df_long_doc.copy()
+    shown_columns_long_doc, types_long_doc = get_default_cols(
+        'long-doc', lb_data_store.slug, add_fix_cols=True)
+    lb_data_store.types_long_doc = types_long_doc
+    lb_data_store.leaderboard_df_long_doc = \
+        lb_data_store.leaderboard_df_long_doc[
+            ~lb_data_store.leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][shown_columns_long_doc]
+    lb_data_store.leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
 
     lb_data_store.reranking_models = sorted(
         list(frozenset([eval_result.reranking_model for eval_result in lb_data_store.raw_data])))
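The long-doc frame is now built in load_leaderboard_datastore the same way as the QA frame: pull the raw dataframe with get_leaderboard_df, keep a copy, drop anonymous submissions, restrict to the version's default columns, and hide the revision/timestamp columns. Below is a small pandas sketch of that filter chain; the column names are placeholders standing in for COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, and COL_NAME_TIMESTAMP.

    import pandas as pd

    # Toy raw long-doc frame (stand-in for raw_df_long_doc).
    raw_df_long_doc = pd.DataFrame({
        "Model": ["bge-m3", "anon-run"],
        "Anonymous": [False, True],
        "Revision": ["abc123", "def456"],
        "Timestamp": ["2024-05-01", "2024-05-02"],
        "law_en_gov_report": [0.62, 0.55],
    })

    shown_columns_long_doc = ["Model", "Revision", "Timestamp", "law_en_gov_report"]

    # Keep only non-anonymous rows and the default columns...
    leaderboard_df_long_doc = raw_df_long_doc[~raw_df_long_doc["Anonymous"]][shown_columns_long_doc]
    # ...then hide revision/timestamp by default, exactly as the QA frame does.
    leaderboard_df_long_doc = leaderboard_df_long_doc.drop(["Revision", "Timestamp"], axis=1)
    print(leaderboard_df_long_doc)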
src/utils.py
CHANGED
@@ -64,12 +64,12 @@ def get_default_cols(task: str, version_slug, add_fix_cols: bool=True) -> tuple:
     types = []
     if task == "qa":
         benchmarks = QABenchmarks[version_slug]
-
-
-        # types_list = TYPES_QA
-        benchmark_list = [c.value.col_name for c in list(benchmarks.value)]
+    elif task == "long-doc":
+        benchmarks = LongDocBenchmarks[version_slug]
     else:
         raise NotImplemented
+    cols_list, types_list = get_default_col_names_and_types(benchmarks)
+    benchmark_list = [c.value.col_name for c in list(benchmarks.value)]
     for col_name, col_type in zip(cols_list, types_list):
         if col_name not in benchmark_list:
             continue
@@ -90,40 +90,6 @@ def get_default_cols(task: str, version_slug, add_fix_cols: bool=True) -> tuple:
     return cols, types
 
 
-# def get_default_cols(task: str, columns: list, add_fix_cols: bool=True) -> list:
-#     cols = []
-#     types = []
-#     if task == "qa":
-#         cols_list = COLS_QA
-#         types_list = TYPES_QA
-#         benchmark_list = [c.value.col_name for c in list(QABenchmarks)]
-#     elif task == "long-doc":
-#         cols_list = COLS_LONG_DOC
-#         types_list = TYPES_LONG_DOC
-#         benchmark_list = [c.value.col_name for c in list(LongDocBenchmarks)]
-#     else:
-#         raise NotImplemented
-#     for col_name, col_type in zip(cols_list, types_list):
-#         if col_name not in benchmark_list:
-#             continue
-#         if len(columns) > 0 and col_name not in columns:
-#             continue
-#         cols.append(col_name)
-#         types.append(col_type)
-#
-#     if add_fix_cols:
-#         _cols = []
-#         _types = []
-#         for col_name, col_type in zip(cols, types):
-#             if col_name in FIXED_COLS:
-#                 continue
-#             _cols.append(col_name)
-#             _types.append(col_type)
-#         cols = FIXED_COLS + _cols
-#         types = FIXED_COLS_TYPES + _types
-#     return cols, types
-
-
 def select_columns(
         df: pd.DataFrame,
         domain_query: list,
@@ -360,14 +326,13 @@ def get_leaderboard_df(datastore, task: str, metric: str) -> pd.DataFrame:
     cols = [COL_NAME_IS_ANONYMOUS, ]
     if task == "qa":
         benchmarks = QABenchmarks[datastore.slug]
-
-
-        benchmark_cols = [t.value.col_name for t in list(benchmarks.value)]
-    # elif task == "long-doc":
-    #     cols += COLS_LONG_DOC
-    #     benchmark_cols = [t.value.col_name for t in LongDocBenchmarks]
+    elif task == "long-doc":
+        benchmarks = LongDocBenchmarks[datastore.slug]
     else:
         raise NotImplemented
+    cols_qa, _ = get_default_col_names_and_types(benchmarks)
+    cols += cols_qa
+    benchmark_cols = [t.value.col_name for t in list(benchmarks.value)]
     all_data_json = []
     for v in raw_data:
         all_data_json += v.to_dict(task=task, metric=metric)
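Both get_default_cols and get_leaderboard_df now resolve their column lists through version-keyed enums (QABenchmarks[version_slug] / LongDocBenchmarks[datastore.slug]) plus the shared get_default_col_names_and_types helper, instead of the old COLS_*/TYPES_* module constants and the commented-out long-doc branches. Below is a toy sketch of that lookup pattern, assuming an enum-of-enums layout; the names, version slugs, and Benchmark fields are invented for illustration and are not the repository's definitions.

    from dataclasses import dataclass
    from enum import Enum

    @dataclass(frozen=True)
    class Benchmark:
        name: str
        metric: str
        col_name: str

    # Inner enums: one member per benchmark column of a given version.
    QABench2405 = Enum("QABench2405", {"wiki_en": Benchmark("wiki_en", "ndcg_at_10", "wiki_en")})
    LongDocBench2405 = Enum("LongDocBench2405", {"law_en_gov": Benchmark("law_en_gov", "ndcg_at_10", "law_en_gov")})

    # Outer enums: one member per benchmark version slug.
    QABenchmarks = Enum("QABenchmarks", {"air_bench_2405": QABench2405})
    LongDocBenchmarks = Enum("LongDocBenchmarks", {"air_bench_2405": LongDocBench2405})

    def get_benchmark_cols(task: str, version_slug: str) -> list:
        if task == "qa":
            benchmarks = QABenchmarks[version_slug]
        elif task == "long-doc":
            benchmarks = LongDocBenchmarks[version_slug]
        else:
            # utils.py raises NotImplemented here; NotImplementedError is the exception type.
            raise NotImplementedError(task)
        return [c.value.col_name for c in list(benchmarks.value)]

    print(get_benchmark_cols("long-doc", "air_bench_2405"))   # ['law_en_gov']

Keying the outer enum by version slug means adding a new benchmark version only requires registering another member, while the QA and long-doc code paths stay identical apart from which outer enum they index.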