Space: AIR-Bench

Commit 1199e4c · nan committed · 2 parents: 1981c31, f765492

Merge branch 'feat-add-versions-to-benchmarks-1015' into pr/28

app.py CHANGED
@@ -1,105 +1,63 @@
 import gradio as gr
+import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download

 from src.about import (
     INTRODUCTION_TEXT,
-    BENCHMARKS_TEXT,
-    TITLE,
-    EVALUATION_QUEUE_TEXT
+    TITLE
 )
 from src.benchmarks import (
-    DOMAIN_COLS_QA,
-    LANG_COLS_QA,
-    DOMAIN_COLS_LONG_DOC,
-    LANG_COLS_LONG_DOC,
-    METRIC_LIST,
-    DEFAULT_METRIC_QA,
-    DEFAULT_METRIC_LONG_DOC
+    QABenchmarks,
+    LongDocBenchmarks
 )
 from src.display.css_html_js import custom_css
-from src.display.utils import (
-    COL_NAME_IS_ANONYMOUS,
-    COL_NAME_REVISION,
-    COL_NAME_TIMESTAMP,
-    COL_NAME_RERANKING_MODEL,
-    COL_NAME_RETRIEVAL_MODEL
-)
 from src.envs import (
     API,
     EVAL_RESULTS_PATH,
-    REPO_ID,
-    RESULTS_REPO,
-    TOKEN,
-    BM25_LINK,
-    BENCHMARK_VERSION_LIST,
-    LATEST_BENCHMARK_VERSION
+    REPO_ID, DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC, METRIC_LIST, LATEST_BENCHMARK_VERSION, COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL, BM25_LINK, BENCHMARK_VERSION_LIST
 )
-from src.read_evals import (
-    get_raw_eval_results,
-    get_leaderboard_df
+from src.loaders import (
+    load_eval_results
 )
 from src.utils import (
     update_metric,
-    upload_file,
-    get_default_cols,
-    submit_results,
+    set_listeners,
     reset_rank,
-    remove_html
+    remove_html, upload_file, submit_results
 )
 from src.display.gradio_formatting import (
     get_version_dropdown,
     get_search_bar,
     get_reranking_dropdown,
+    get_noreranking_dropdown,
     get_metric_dropdown,
     get_domain_dropdown,
     get_language_dropdown,
     get_anonymous_checkbox,
     get_revision_and_ts_checkbox,
-    get_leaderboard_table,
-    get_noreranking_dropdown
+    get_leaderboard_table
 )
-from src.display.gradio_listener import set_listeners

-def restart_space():
-    API.restart_space(repo_id=REPO_ID)
+from src.about import EVALUATION_QUEUE_TEXT, BENCHMARKS_TEXT


-try:
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
-        token=TOKEN
-    )
-except Exception as e:
-    print(f'failed to download')
-    restart_space()
+def restart_space():
+    API.restart_space(repo_id=REPO_ID)

-raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/{LATEST_BENCHMARK_VERSION}")
-
-original_df_qa = get_leaderboard_df(
-    raw_data, task='qa', metric=DEFAULT_METRIC_QA)
-original_df_long_doc = get_leaderboard_df(
-    raw_data, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
-print(f'raw data: {len(raw_data)}')
-print(f'QA data loaded: {original_df_qa.shape}')
-print(f'Long-Doc data loaded: {len(original_df_long_doc)}')
-
-leaderboard_df_qa = original_df_qa.copy()
-# leaderboard_df_qa = leaderboard_df_qa[has_no_nan_values(df, _benchmark_cols)]
-shown_columns_qa, types_qa = get_default_cols(
-    'qa', leaderboard_df_qa.columns, add_fix_cols=True)
-leaderboard_df_qa = leaderboard_df_qa[~leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
-leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
-
-leaderboard_df_long_doc = original_df_long_doc.copy()
-shown_columns_long_doc, types_long_doc = get_default_cols(
-    'long-doc', leaderboard_df_long_doc.columns, add_fix_cols=True)
-leaderboard_df_long_doc = leaderboard_df_long_doc[~leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][shown_columns_long_doc]
-leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
-
-# select reranking model
-reranking_models = sorted(list(frozenset([eval_result.reranking_model for eval_result in raw_data])))
+# try:
+#     snapshot_download(
+#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
+#         token=TOKEN
+#     )
+# except Exception as e:
+#     print(f'failed to download')
+#     restart_space()

+global data
+data = load_eval_results(EVAL_RESULTS_PATH)
+global datastore
+datastore = data[LATEST_BENCHMARK_VERSION]

 def update_metric_qa(
     metric: str,
@@ -108,9 +66,10 @@ def update_metric_qa(
     reranking_model: list,
     query: str,
     show_anonymous: bool,
-    show_revision_and_timestamp,
+    show_revision_and_timestamp: bool,
 ):
-    return update_metric(raw_data, 'qa', metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
+    return update_metric(datastore, 'qa', metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
+

 def update_metric_long_doc(
     metric: str,
@@ -121,7 +80,37 @@ def update_metric_long_doc(
     show_anonymous: bool,
     show_revision_and_timestamp,
 ):
-    return update_metric(raw_data, "long-doc", metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
+    return update_metric(datastore, "long-doc", metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
+
+
+def update_datastore(version):
+    print("triggered update_datastore")
+    global datastore
+    global data
+    datastore = data[version]
+    selected_domains = get_domain_dropdown(QABenchmarks[datastore.slug])
+    selected_langs = get_language_dropdown(QABenchmarks[datastore.slug])
+    selected_rerankings = get_reranking_dropdown(datastore.reranking_models)
+    leaderboard_table = get_leaderboard_table(
+        datastore.leaderboard_df_qa, datastore.types_qa)
+    hidden_leaderboard_table = get_leaderboard_table(
+        datastore.raw_df_qa, datastore.types_qa, visible=False)
+    return selected_domains, selected_langs, selected_rerankings, leaderboard_table, hidden_leaderboard_table
+
+
+def update_datastore_long_doc(version):
+    global datastore
+    global data
+    print("triggered update_datastore_long_doc")
+    datastore = data[version]
+    selected_domains = get_domain_dropdown(LongDocBenchmarks[datastore.slug])
+    selected_langs = get_language_dropdown(LongDocBenchmarks[datastore.slug])
+    selected_rerankings = get_reranking_dropdown(datastore.reranking_models)
+    leaderboard_table = get_leaderboard_table(
+        datastore.leaderboard_df_long_doc, datastore.types_long_doc)
+    hidden_leaderboard_table = get_leaderboard_table(
+        datastore.raw_df_long_doc, datastore.types_long_doc, visible=False)
+    return selected_domains, selected_langs, selected_rerankings, leaderboard_table, hidden_leaderboard_table


 demo = gr.Blocks(css=custom_css)
@@ -140,11 +129,10 @@ with demo:
     with gr.Column(min_width=320):
         # select domain
         with gr.Row():
-            selected_domains = get_domain_dropdown(DOMAIN_COLS_QA, DOMAIN_COLS_QA)
+            selected_domains = get_domain_dropdown(QABenchmarks[datastore.slug])
         # select language
         with gr.Row():
-            selected_langs = get_language_dropdown(LANG_COLS_QA, LANG_COLS_QA)
-
+            selected_langs = get_language_dropdown(QABenchmarks[datastore.slug])
     with gr.Column():
         # select the metric
         selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_QA)
@@ -160,16 +148,26 @@ with demo:
     search_bar = get_search_bar()
     # select reranking models
     with gr.Column():
-        selected_rerankings = get_reranking_dropdown(reranking_models)
-    leaderboard_table = get_leaderboard_table(leaderboard_df_qa, types_qa)
+        selected_rerankings = get_reranking_dropdown(datastore.reranking_models)
+    # shown_table
+    lb_table = get_leaderboard_table(
+        datastore.leaderboard_df_qa, datastore.types_qa)
     # Dummy leaderboard for handling the case when the user uses backspace key
-    hidden_leaderboard_table_for_search = get_leaderboard_table(original_df_qa, types_qa, visible=False)
+    hidden_lb_table = get_leaderboard_table(
+        datastore.raw_df_qa, datastore.types_qa, visible=False)
+
+    selected_version.change(
+        update_datastore,
+        [selected_version,],
+        [selected_domains, selected_langs, selected_rerankings, lb_table, hidden_lb_table]
+    )

     set_listeners(
         "qa",
-        leaderboard_table,
-        hidden_leaderboard_table_for_search,
+        lb_table,
+        hidden_lb_table,
         search_bar,
+        selected_version,
         selected_domains,
         selected_langs,
         selected_rerankings,
@@ -189,28 +187,45 @@ with demo:
         show_anonymous,
         show_revision_and_timestamp,
     ],
-    leaderboard_table,
+    lb_table,
     queue=True
 )
+
 with gr.TabItem("Retrieval Only", id=11):
     with gr.Row():
         with gr.Column(scale=1):
             search_bar_retriever = get_search_bar()
         with gr.Column(scale=1):
             selected_noreranker = get_noreranking_dropdown()
-    lb_df_retriever = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
+
+    lb_df_retriever = datastore.leaderboard_df_qa[datastore.leaderboard_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
     lb_df_retriever = reset_rank(lb_df_retriever)
-    lb_table_retriever = get_leaderboard_table(lb_df_retriever, types_qa)
+    lb_table_retriever = get_leaderboard_table(
+        lb_df_retriever, datastore.types_qa)
+
     # Dummy leaderboard for handling the case when the user uses backspace key
-    hidden_lb_df_retriever = original_df_qa[original_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
+    hidden_lb_df_retriever = datastore.raw_df_qa[datastore.raw_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
    hidden_lb_df_retriever = reset_rank(hidden_lb_df_retriever)
-    hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_df_retriever, types_qa, visible=False)
+    hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_df_retriever, datastore.types_qa, visible=False)
+
+    selected_version.change(
+        update_datastore,
+        [selected_version,],
+        [
+            selected_domains,
+            selected_langs,
+            selected_noreranker,
+            lb_table_retriever,
+            hidden_lb_table_retriever
+        ]
+    )

     set_listeners(
         "qa",
         lb_table_retriever,
         hidden_lb_table_retriever,
         search_bar_retriever,
+        selected_version,
         selected_domains,
         selected_langs,
         selected_noreranker,
@@ -234,7 +249,12 @@ with demo:
     queue=True
 )
 with gr.TabItem("Reranking Only", id=12):
-    lb_df_reranker = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
+    lb_df_reranker = \
+        datastore.leaderboard_df_qa[
+            datastore.leaderboard_df_qa[
+                COL_NAME_RETRIEVAL_MODEL
+            ] == BM25_LINK
+        ]
     lb_df_reranker = reset_rank(lb_df_reranker)
     reranking_models_reranker = lb_df_reranker[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
     with gr.Row():
@@ -242,11 +262,26 @@ with demo:
         selected_rerankings_reranker = get_reranking_dropdown(reranking_models_reranker)
     with gr.Column(scale=1):
         search_bar_reranker = gr.Textbox(show_label=False, visible=False)
-    lb_table_reranker = get_leaderboard_table(lb_df_reranker, types_qa)
-    hidden_lb_df_reranker = original_df_qa[original_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
+    lb_table_reranker = get_leaderboard_table(
+        lb_df_reranker, datastore.types_qa)
+
+    hidden_lb_df_reranker = datastore.raw_df_qa[datastore.raw_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
     hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
     hidden_lb_table_reranker = get_leaderboard_table(
-        hidden_lb_df_reranker, types_qa, visible=False
+        hidden_lb_df_reranker,
+        datastore.types_qa, visible=False
+    )
+
+    selected_version.change(
+        update_datastore,
+        [selected_version,],
+        [
+            selected_domains,
+            selected_langs,
+            selected_rerankings_reranker,
+            lb_table_reranker,
+            hidden_lb_table_reranker
+        ]
     )

     set_listeners(
@@ -254,6 +289,7 @@ with demo:
         lb_table_reranker,
         hidden_lb_table_reranker,
         search_bar_reranker,
+        selected_version,
         selected_domains,
         selected_langs,
         selected_rerankings_reranker,
@@ -280,12 +316,10 @@ with demo:
     with gr.Column(min_width=320):
         # select domain
         with gr.Row():
-            selected_domains = get_domain_dropdown(DOMAIN_COLS_LONG_DOC, DOMAIN_COLS_LONG_DOC)
+            selected_domains = get_domain_dropdown(LongDocBenchmarks[datastore.slug])
         # select language
         with gr.Row():
-            selected_langs = get_language_dropdown(
-                LANG_COLS_LONG_DOC, LANG_COLS_LONG_DOC
-            )
+            selected_langs = get_language_dropdown(LongDocBenchmarks[datastore.slug])
     with gr.Column():
         # select the metric
         with gr.Row():
@@ -301,22 +335,35 @@ with demo:
     search_bar = get_search_bar()
     # select reranking model
     with gr.Column():
-        selected_rerankings = get_reranking_dropdown(reranking_models)
+        selected_rerankings = get_reranking_dropdown(datastore.reranking_models)

-    lb_table = get_leaderboard_table(
-        leaderboard_df_long_doc, types_long_doc
+    lb_table_long_doc = get_leaderboard_table(
+        datastore.leaderboard_df_long_doc, datastore.types_long_doc
     )

     # Dummy leaderboard for handling the case when the user uses backspace key
-    hidden_lb_table_for_search = get_leaderboard_table(
-        original_df_long_doc, types_long_doc, visible=False
+    hidden_lb_table_long_doc = get_leaderboard_table(
+        datastore.raw_df_long_doc, datastore.types_long_doc, visible=False
+    )
+
+    selected_version.change(
+        update_datastore_long_doc,
+        [selected_version,],
+        [
+            selected_domains,
+            selected_langs,
+            selected_rerankings,
+            lb_table_long_doc,
+            hidden_lb_table_long_doc
+        ]
     )

     set_listeners(
         "long-doc",
-        lb_table,
-        hidden_lb_table_for_search,
+        lb_table_long_doc,
+        hidden_lb_table_long_doc,
         search_bar,
+        selected_version,
         selected_domains,
         selected_langs,
         selected_rerankings,
@@ -336,7 +383,7 @@ with demo:
         show_anonymous,
         show_revision_and_timestamp
     ],
-    lb_table,
+    lb_table_long_doc,
     queue=True
 )
 with gr.TabItem("Retrieval Only", id=21):
@@ -345,18 +392,31 @@ with demo:
     search_bar_retriever = get_search_bar()
     with gr.Column(scale=1):
         selected_noreranker = get_noreranking_dropdown()
-    lb_df_retriever_long_doc = leaderboard_df_long_doc[
-        leaderboard_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
+    lb_df_retriever_long_doc = datastore.leaderboard_df_long_doc[
+        datastore.leaderboard_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
     ]
     lb_df_retriever_long_doc = reset_rank(lb_df_retriever_long_doc)
-    hidden_lb_db_retriever_long_doc = original_df_long_doc[
-        original_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
-    ]
-    hidden_lb_db_retriever_long_doc = reset_rank(hidden_lb_db_retriever_long_doc)
     lb_table_retriever_long_doc = get_leaderboard_table(
-        lb_df_retriever_long_doc, types_long_doc)
+        lb_df_retriever_long_doc, datastore.types_long_doc)
+
+    hidden_lb_df_retriever_long_doc = datastore.raw_df_long_doc[
+        datastore.raw_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
+    ]
+    hidden_lb_df_retriever_long_doc = reset_rank(hidden_lb_df_retriever_long_doc)
     hidden_lb_table_retriever_long_doc = get_leaderboard_table(
-        hidden_lb_db_retriever_long_doc, types_long_doc, visible=False
+        hidden_lb_df_retriever_long_doc, datastore.types_long_doc, visible=False
+    )
+
+    selected_version.change(
+        update_datastore_long_doc,
+        [selected_version,],
+        [
+            selected_domains,
+            selected_langs,
+            selected_noreranker,
+            lb_table_retriever_long_doc,
+            hidden_lb_table_retriever_long_doc
+        ]
     )

     set_listeners(
@@ -364,6 +424,7 @@ with demo:
         lb_table_retriever_long_doc,
         hidden_lb_table_retriever_long_doc,
         search_bar_retriever,
+        selected_version,
         selected_domains,
         selected_langs,
         selected_noreranker,
@@ -386,8 +447,11 @@ with demo:
     queue=True
 )
 with gr.TabItem("Reranking Only", id=22):
-    lb_df_reranker_ldoc = leaderboard_df_long_doc[
-        leaderboard_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
+    lb_df_reranker_ldoc = \
+        datastore.leaderboard_df_long_doc[
+            datastore.leaderboard_df_long_doc[
+                COL_NAME_RETRIEVAL_MODEL
+            ] == BM25_LINK
     ]
     lb_df_reranker_ldoc = reset_rank(lb_df_reranker_ldoc)
     reranking_models_reranker_ldoc = lb_df_reranker_ldoc[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
@@ -396,11 +460,23 @@ with demo:
     selected_rerankings_reranker_ldoc = get_reranking_dropdown(reranking_models_reranker_ldoc)
     with gr.Column(scale=1):
         search_bar_reranker_ldoc = gr.Textbox(show_label=False, visible=False)
-    lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, types_long_doc)
-    hidden_lb_df_reranker_ldoc = original_df_long_doc[original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
+    lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, datastore.types_long_doc)
+    hidden_lb_df_reranker_ldoc = datastore.raw_df_long_doc[datastore.raw_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
     hidden_lb_df_reranker_ldoc = reset_rank(hidden_lb_df_reranker_ldoc)
     hidden_lb_table_reranker_ldoc = get_leaderboard_table(
-        hidden_lb_df_reranker_ldoc, types_long_doc, visible=False
+        hidden_lb_df_reranker_ldoc, datastore.types_long_doc, visible=False
+    )
+
+    selected_version.change(
+        update_datastore_long_doc,
+        [selected_version,],
+        [
+            selected_domains,
+            selected_langs,
+            selected_rerankings_reranker_ldoc,
+            lb_table_reranker_ldoc,
+            hidden_lb_table_reranker_ldoc
+        ]
     )

     set_listeners(
@@ -408,6 +484,7 @@ with demo:
         lb_table_reranker_ldoc,
         hidden_lb_table_reranker_ldoc,
         search_bar_reranker_ldoc,
+        selected_version,
         selected_domains,
         selected_langs,
         selected_rerankings_reranker_ldoc,
@@ -503,3 +580,5 @@ if __name__ == "__main__":
     scheduler.start()
     demo.queue(default_concurrency_limit=40)
     demo.launch()
+
+
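Note for reviewers: the version dropdown's .change event is what swaps the active datastore and rebuilds the tables above. A minimal, self-contained sketch of that wiring pattern (toy data, plain Gradio and pandas; only the wiring mirrors app.py, none of the names below are from this repo):

import gradio as gr
import pandas as pd

# Hypothetical per-version stores standing in for the real LeaderboardDataStore objects.
STORES = {
    "AIR-Bench_24.04": pd.DataFrame({"Rank": [1, 2], "Model": ["model-a", "model-b"]}),
    "AIR-Bench_24.05": pd.DataFrame({"Rank": [1], "Model": ["model-c"]}),
}

def switch_version(version: str) -> pd.DataFrame:
    # Return the table for the selected version; Gradio re-renders the Dataframe output.
    return STORES[version]

with gr.Blocks() as demo:
    selected_version = gr.Dropdown(choices=list(STORES), value="AIR-Bench_24.04", label="Version")
    table = gr.Dataframe(value=STORES["AIR-Bench_24.04"])
    # Same pattern as selected_version.change(update_datastore, ...) in app.py.
    selected_version.change(switch_version, [selected_version], [table])

if __name__ == "__main__":
    demo.launch()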
src/benchmarks.py CHANGED
@@ -1,7 +1,10 @@
 from dataclasses import dataclass
 from enum import Enum
+
 from air_benchmark.tasks.tasks import BenchmarkTable

+from src.envs import METRIC_LIST
+

 def get_safe_name(name: str):
     """Get RFC 1123 compatible safe name"""
@@ -12,40 +15,6 @@ def get_safe_name(name: str):
         if (character.isalnum() or character == '_'))


-METRIC_LIST = [
-    "ndcg_at_1",
-    "ndcg_at_3",
-    "ndcg_at_5",
-    "ndcg_at_10",
-    "ndcg_at_100",
-    "ndcg_at_1000",
-    "map_at_1",
-    "map_at_3",
-    "map_at_5",
-    "map_at_10",
-    "map_at_100",
-    "map_at_1000",
-    "recall_at_1",
-    "recall_at_3",
-    "recall_at_5",
-    "recall_at_10",
-    "recall_at_100",
-    "recall_at_1000",
-    "precision_at_1",
-    "precision_at_3",
-    "precision_at_5",
-    "precision_at_10",
-    "precision_at_100",
-    "precision_at_1000",
-    "mrr_at_1",
-    "mrr_at_3",
-    "mrr_at_5",
-    "mrr_at_10",
-    "mrr_at_100",
-    "mrr_at_1000"
-]
-
-
 @dataclass
 class Benchmark:
     name: str  # [domain]_[language]_[metric], task_key in the json file,
@@ -56,37 +25,53 @@ class Benchmark:
     task: str


-qa_benchmark_dict = {}
-long_doc_benchmark_dict = {}
-for task, domain_dict in BenchmarkTable['AIR-Bench_24.04'].items():
-    for domain, lang_dict in domain_dict.items():
-        for lang, dataset_list in lang_dict.items():
-            if task == "qa":
-                benchmark_name = f"{domain}_{lang}"
-                benchmark_name = get_safe_name(benchmark_name)
-                col_name = benchmark_name
-                for metric in dataset_list:
-                    qa_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name, domain, lang, task)
-            elif task == "long-doc":
-                for dataset in dataset_list:
-                    benchmark_name = f"{domain}_{lang}_{dataset}"
-                    benchmark_name = get_safe_name(benchmark_name)
-                    col_name = benchmark_name
-                    for metric in METRIC_LIST:
-                        long_doc_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name, domain,
-                                                                            lang, task)
+# create a function return an enum class containing all the benchmarks
+def get_benchmarks_enum(benchmark_version, task_type):
+    benchmark_dict = {}
+    if task_type == "qa":
+        for task, domain_dict in BenchmarkTable[benchmark_version].items():
+            if task != task_type:
+                continue
+            for domain, lang_dict in domain_dict.items():
+                for lang, dataset_list in lang_dict.items():
+                    benchmark_name = get_safe_name(f"{domain}_{lang}")
+                    col_name = benchmark_name
+                    for metric in dataset_list:
+                        if "test" not in dataset_list[metric]["splits"]:
+                            continue
+                        benchmark_dict[benchmark_name] = \
+                            Benchmark(benchmark_name, metric, col_name, domain, lang, task)
+    elif task_type == "long-doc":
+        for task, domain_dict in BenchmarkTable[benchmark_version].items():
+            if task != task_type:
+                continue
+            for domain, lang_dict in domain_dict.items():
+                for lang, dataset_list in lang_dict.items():
+                    for dataset in dataset_list:
+                        benchmark_name = f"{domain}_{lang}_{dataset}"
+                        benchmark_name = get_safe_name(benchmark_name)
+                        col_name = benchmark_name
+                        if "test" not in dataset_list[dataset]["splits"]:
+                            continue
+                        for metric in METRIC_LIST:
+                            benchmark_dict[benchmark_name] = \
+                                Benchmark(benchmark_name, metric, col_name, domain, lang, task)
+    return benchmark_dict

-BenchmarksQA = Enum('BenchmarksQA', qa_benchmark_dict)
-BenchmarksLongDoc = Enum('BenchmarksLongDoc', long_doc_benchmark_dict)

-BENCHMARK_COLS_QA = [c.col_name for c in qa_benchmark_dict.values()]
-BENCHMARK_COLS_LONG_DOC = [c.col_name for c in long_doc_benchmark_dict.values()]
+versions = ("AIR-Bench_24.04", "AIR-Bench_24.05")
+qa_benchmark_dict = {}
+for version in versions:
+    safe_version_name = get_safe_name(version)[-4:]
+    qa_benchmark_dict[safe_version_name] = Enum(f"QABenchmarks_{safe_version_name}", get_benchmarks_enum(version, "qa"))

-DOMAIN_COLS_QA = list(frozenset([c.domain for c in qa_benchmark_dict.values()]))
-LANG_COLS_QA = list(frozenset([c.lang for c in qa_benchmark_dict.values()]))
+long_doc_benchmark_dict = {}
+for version in versions:
+    safe_version_name = get_safe_name(version)[-4:]
+    long_doc_benchmark_dict[safe_version_name] = Enum(f"LongDocBenchmarks_{safe_version_name}", get_benchmarks_enum(version, "long-doc"))

-DOMAIN_COLS_LONG_DOC = list(frozenset([c.domain for c in long_doc_benchmark_dict.values()]))
-LANG_COLS_LONG_DOC = list(frozenset([c.lang for c in long_doc_benchmark_dict.values()]))
+# _qa_benchmark_dict, = get_benchmarks_enum('AIR-Bench_24.04', "qa")
+# _long_doc_benchmark_dict = get_benchmarks_enum('AIR-Bench_24.04', "long-doc")

-DEFAULT_METRIC_QA = "ndcg_at_10"
-DEFAULT_METRIC_LONG_DOC = "recall_at_10"
+QABenchmarks = Enum('QABenchmarks', qa_benchmark_dict)
+LongDocBenchmarks = Enum('LongDocBenchmarks', long_doc_benchmark_dict)
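The nested enums above are looked up by version slug elsewhere in this PR (app.py, src/utils.py). A short sketch of that lookup, assuming this repo's src package and the air_benchmark dependency are importable; the printed fields come from the Benchmark dataclass defined above:

from src.benchmarks import QABenchmarks, get_safe_name

version = "AIR-Bench_24.04"
slug = get_safe_name(version)[-4:]      # "2404", the slug stored on each LeaderboardDataStore
qa_2404 = QABenchmarks[slug]            # outer enum member for that benchmark version
for member in list(qa_2404.value):      # inner enum: one member per QA domain/language pair
    b = member.value                    # Benchmark(name, metric, col_name, domain, lang, task)
    print(b.col_name, b.domain, b.lang, b.metric)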
src/display/{utils.py → columns.py} RENAMED
@@ -1,6 +1,8 @@
 from dataclasses import dataclass, make_dataclass

-from src.benchmarks import BenchmarksQA, BenchmarksLongDoc
+from src.benchmarks import QABenchmarks, LongDocBenchmarks
+from src.envs import COL_NAME_AVG, COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL_LINK, \
+    COL_NAME_RERANKING_MODEL_LINK, COL_NAME_RANK, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS


 def fields(raw_class):
@@ -19,17 +21,6 @@ class ColumnContent:
     never_hidden: bool = False


-COL_NAME_AVG = "Average ⬆️"
-COL_NAME_RETRIEVAL_MODEL = "Retrieval Method"
-COL_NAME_RERANKING_MODEL = "Reranking Model"
-COL_NAME_RETRIEVAL_MODEL_LINK = "Retrieval Model LINK"
-COL_NAME_RERANKING_MODEL_LINK = "Reranking Model LINK"
-COL_NAME_RANK = "Rank 🏆"
-COL_NAME_REVISION = "Revision"
-COL_NAME_TIMESTAMP = "Submission Date"
-COL_NAME_IS_ANONYMOUS = "Anonymous Submission"
-
-
 def get_default_auto_eval_column_dict():
     auto_eval_column_dict = []
     # Init
@@ -37,10 +28,12 @@ def get_default_auto_eval_column_dict():
         ["rank", ColumnContent, ColumnContent(COL_NAME_RANK, "number", True)]
     )
     auto_eval_column_dict.append(
-        ["retrieval_model", ColumnContent, ColumnContent(COL_NAME_RETRIEVAL_MODEL, "markdown", True, hidden=False, never_hidden=True)]
+        ["retrieval_model", ColumnContent,
+         ColumnContent(COL_NAME_RETRIEVAL_MODEL, "markdown", True, hidden=False, never_hidden=True)]
     )
     auto_eval_column_dict.append(
-        ["reranking_model", ColumnContent, ColumnContent(COL_NAME_RERANKING_MODEL, "markdown", True, hidden=False, never_hidden=True)]
+        ["reranking_model", ColumnContent,
+         ColumnContent(COL_NAME_RERANKING_MODEL, "markdown", True, hidden=False, never_hidden=True)]
     )
     auto_eval_column_dict.append(
         ["revision", ColumnContent, ColumnContent(COL_NAME_REVISION, "markdown", True, never_hidden=True)]
@@ -52,10 +45,12 @@ def get_default_auto_eval_column_dict():
         ["average", ColumnContent, ColumnContent(COL_NAME_AVG, "number", True)]
     )
     auto_eval_column_dict.append(
-        ["retrieval_model_link", ColumnContent, ColumnContent(COL_NAME_RETRIEVAL_MODEL_LINK, "markdown", False, hidden=True, never_hidden=False)]
+        ["retrieval_model_link", ColumnContent,
+         ColumnContent(COL_NAME_RETRIEVAL_MODEL_LINK, "markdown", False, hidden=True, never_hidden=False)]
     )
     auto_eval_column_dict.append(
-        ["reranking_model_link", ColumnContent, ColumnContent(COL_NAME_RERANKING_MODEL_LINK, "markdown", False, hidden=True, never_hidden=False)]
+        ["reranking_model_link", ColumnContent,
+         ColumnContent(COL_NAME_RERANKING_MODEL_LINK, "markdown", False, hidden=True, never_hidden=False)]
     )
     auto_eval_column_dict.append(
         ["is_anonymous", ColumnContent, ColumnContent(COL_NAME_IS_ANONYMOUS, "bool", False, hidden=True)]
@@ -63,10 +58,10 @@ def get_default_auto_eval_column_dict():
     return auto_eval_column_dict


-def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
+def make_autoevalcolumn(cls_name, benchmarks):
     auto_eval_column_dict = get_default_auto_eval_column_dict()
-    ## Leaderboard columns
-    for benchmark in benchmarks:
+    # Leaderboard columns
+    for benchmark in list(benchmarks.value):
         auto_eval_column_dict.append(
             [benchmark.name, ColumnContent, ColumnContent(benchmark.value.col_name, "number", True)]
         )
@@ -75,19 +70,28 @@ def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
     return make_dataclass(cls_name, auto_eval_column_dict, frozen=True)


-AutoEvalColumnQA = make_autoevalcolumn(
-    "AutoEvalColumnQA", BenchmarksQA)
-AutoEvalColumnLongDoc = make_autoevalcolumn(
-    "AutoEvalColumnLongDoc", BenchmarksLongDoc)
+def get_default_col_names_and_types(benchmarks):
+    AutoEvalColumn = make_autoevalcolumn("AutoEvalColumn", benchmarks)
+    col_names = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+    col_types = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
+    return col_names, col_types

+# AutoEvalColumnQA = make_autoevalcolumn("AutoEvalColumnQA", QABenchmarks)
+# COLS_QA = [c.name for c in fields(AutoEvalColumnQA) if not c.hidden]
+# TYPES_QA = [c.type for c in fields(AutoEvalColumnQA) if not c.hidden]

-# Column selection
-COLS_QA = [c.name for c in fields(AutoEvalColumnQA) if not c.hidden]
-COLS_LONG_DOC = [c.name for c in fields(AutoEvalColumnLongDoc) if not c.hidden]
-TYPES_QA = [c.type for c in fields(AutoEvalColumnQA) if not c.hidden]
-TYPES_LONG_DOC = [c.type for c in fields(AutoEvalColumnLongDoc) if not c.hidden]
-COLS_LITE = [c.name for c in fields(AutoEvalColumnQA) if c.displayed_by_default and not c.hidden]
+def get_fixed_col_names_and_types():
+    fixed_cols = get_default_auto_eval_column_dict()[:-3]
+    return [c.name for _, _, c in fixed_cols], [c.type for _, _, c in fixed_cols]

-QA_BENCHMARK_COLS = [t.value.col_name for t in BenchmarksQA]
+# fixed_cols = get_default_auto_eval_column_dict()[:-3]
+# FIXED_COLS = [c.name for _, _, c in fixed_cols]
+# FIXED_COLS_TYPES = [c.type for _, _, c in fixed_cols]

-LONG_DOC_BENCHMARK_COLS = [t.value.col_name for t in BenchmarksLongDoc]
+# AutoEvalColumnLongDoc = make_autoevalcolumn("AutoEvalColumnLongDoc", LongDocBenchmarks)
+# COLS_LONG_DOC = [c.name for c in fields(AutoEvalColumnLongDoc) if not c.hidden]
+# TYPES_LONG_DOC = [c.type for c in fields(AutoEvalColumnLongDoc) if not c.hidden]
+
+# Column selection
src/display/gradio_formatting.py CHANGED
@@ -1,5 +1,6 @@
 import gradio as gr
 from src.envs import BENCHMARK_VERSION_LIST, LATEST_BENCHMARK_VERSION
+from src.benchmarks import QABenchmarks

 def get_version_dropdown():
     return gr.Dropdown(
@@ -52,7 +53,10 @@ def get_metric_dropdown(metric_list, default_metrics):
     )


-def get_domain_dropdown(domain_list, default_domains):
+def get_domain_dropdown(benchmarks, default_domains=None):
+    domain_list = list(frozenset([c.value.domain for c in list(benchmarks.value)]))
+    if default_domains is None:
+        default_domains = domain_list
     return gr.CheckboxGroup(
         choices=domain_list,
         value=default_domains,
@@ -61,10 +65,13 @@ def get_domain_dropdown(domain_list, default_domains):
     )


-def get_language_dropdown(language_list, default_languages):
+def get_language_dropdown(benchmarks, default_languages=None):
+    language_list = list(frozenset([c.value.lang for c in list(benchmarks.value)]))
+    if default_languages is None:
+        default_languages = language_list
     return gr.Dropdown(
         choices=language_list,
-        value=language_list,
+        value=default_languages,
         label="Select the languages",
         multiselect=True,
         interactive=True
src/display/gradio_listener.py DELETED
@@ -1,53 +0,0 @@
-from src.utils import update_table, update_table_long_doc
-
-
-def set_listeners(
-    task,
-    displayed_leaderboard,
-    hidden_leaderboard,
-    search_bar,
-    selected_domains,
-    selected_langs,
-    selected_rerankings,
-    show_anonymous,
-    show_revision_and_timestamp,
-
-):
-    if task == "qa":
-        update_table_func = update_table
-    elif task == "long-doc":
-        update_table_func = update_table_long_doc
-    else:
-        raise NotImplementedError
-    # Set search_bar listener
-    search_bar.submit(
-        update_table_func,
-        [
-            hidden_leaderboard,  # hidden_leaderboard_table_for_search,
-            selected_domains,
-            selected_langs,
-            selected_rerankings,
-            search_bar,
-            show_anonymous,
-        ],
-        displayed_leaderboard
-    )
-
-    # Set column-wise listener
-    for selector in [
-        selected_domains, selected_langs, show_anonymous, show_revision_and_timestamp, selected_rerankings
-    ]:
-        selector.change(
-            update_table_func,
-            [
-                hidden_leaderboard,
-                selected_domains,
-                selected_langs,
-                selected_rerankings,
-                search_bar,
-                show_anonymous,
-                show_revision_and_timestamp
-            ],
-            displayed_leaderboard,
-            queue=True,
-        )
src/envs.py CHANGED
@@ -30,4 +30,47 @@ BENCHMARK_VERSION_LIST = [
     # "AIR-Bench_24.05",
 ]

-LATEST_BENCHMARK_VERSION = BENCHMARK_VERSION_LIST[-1]
+LATEST_BENCHMARK_VERSION = BENCHMARK_VERSION_LIST[0]
+DEFAULT_METRIC_QA = "ndcg_at_10"
+DEFAULT_METRIC_LONG_DOC = "recall_at_10"
+METRIC_LIST = [
+    "ndcg_at_1",
+    "ndcg_at_3",
+    "ndcg_at_5",
+    "ndcg_at_10",
+    "ndcg_at_100",
+    "ndcg_at_1000",
+    "map_at_1",
+    "map_at_3",
+    "map_at_5",
+    "map_at_10",
+    "map_at_100",
+    "map_at_1000",
+    "recall_at_1",
+    "recall_at_3",
+    "recall_at_5",
+    "recall_at_10",
+    "recall_at_100",
+    "recall_at_1000",
+    "precision_at_1",
+    "precision_at_3",
+    "precision_at_5",
+    "precision_at_10",
+    "precision_at_100",
+    "precision_at_1000",
+    "mrr_at_1",
+    "mrr_at_3",
+    "mrr_at_5",
+    "mrr_at_10",
+    "mrr_at_100",
+    "mrr_at_1000"
+]
+COL_NAME_AVG = "Average ⬆️"
+COL_NAME_RETRIEVAL_MODEL = "Retrieval Method"
+COL_NAME_RERANKING_MODEL = "Reranking Model"
+COL_NAME_RETRIEVAL_MODEL_LINK = "Retrieval Model LINK"
+COL_NAME_RERANKING_MODEL_LINK = "Reranking Model LINK"
+COL_NAME_RANK = "Rank 🏆"
+COL_NAME_REVISION = "Revision"
+COL_NAME_TIMESTAMP = "Submission Date"
+COL_NAME_IS_ANONYMOUS = "Anonymous Submission"
src/loaders.py ADDED
@@ -0,0 +1,102 @@
+import os.path
+from typing import List
+
+import pandas as pd
+
+from src.envs import DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC, COL_NAME_REVISION, COL_NAME_TIMESTAMP, \
+    COL_NAME_IS_ANONYMOUS, BENCHMARK_VERSION_LIST
+
+from src.models import FullEvalResult, LeaderboardDataStore
+from src.utils import get_default_cols, get_leaderboard_df
+
+pd.options.mode.copy_on_write = True
+
+
+def load_raw_eval_results(results_path: str) -> List[FullEvalResult]:
+    """
+    Load the evaluation results from a json file
+    """
+    model_result_filepaths = []
+    for root, dirs, files in os.walk(results_path):
+        if len(files) == 0:
+            continue
+
+        # select the latest results
+        for file in files:
+            if not (file.startswith("results") and file.endswith(".json")):
+                print(f'skip {file}')
+                continue
+            model_result_filepaths.append(os.path.join(root, file))
+
+    eval_results = {}
+    for model_result_filepath in model_result_filepaths:
+        # create evaluation results
+        try:
+            eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
+        except UnicodeDecodeError as e:
+            print(f"loading file failed. {model_result_filepath}")
+            continue
+        print(f'file loaded: {model_result_filepath}')
+        timestamp = eval_result.timestamp
+        eval_results[timestamp] = eval_result
+
+    results = []
+    for k, v in eval_results.items():
+        try:
+            v.to_dict()
+            results.append(v)
+        except KeyError:
+            print(f"loading failed: {k}")
+            continue
+    return results
+
+def get_safe_name(name: str):
+    """Get RFC 1123 compatible safe name"""
+    name = name.replace('-', '_')
+    return ''.join(
+        character.lower()
+        for character in name
+        if (character.isalnum() or character == '_'))
+
+def load_leaderboard_datastore(file_path, version) -> LeaderboardDataStore:
+    slug = get_safe_name(version)[-4:]
+    lb_data_store = LeaderboardDataStore(version, slug, None, None, None, None, None, None, None, None)
+    lb_data_store.raw_data = load_raw_eval_results(file_path)
+    print(f'raw data: {len(lb_data_store.raw_data)}')
+
+    lb_data_store.raw_df_qa = get_leaderboard_df(
+        lb_data_store, task='qa', metric=DEFAULT_METRIC_QA)
+    print(f'QA data loaded: {lb_data_store.raw_df_qa.shape}')
+    lb_data_store.leaderboard_df_qa = lb_data_store.raw_df_qa.copy()
+    shown_columns_qa, types_qa = get_default_cols('qa', lb_data_store.slug, add_fix_cols=True)
+    # shown_columns_qa, types_qa = get_default_cols(
+    #     'qa', lb_data_store.leaderboard_df_qa.columns, add_fix_cols=True)
+    lb_data_store.types_qa = types_qa
+    lb_data_store.leaderboard_df_qa = \
+        lb_data_store.leaderboard_df_qa[~lb_data_store.leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
+    lb_data_store.leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
+
+    lb_data_store.raw_df_long_doc = get_leaderboard_df(
+        lb_data_store, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
+    print(f'Long-Doc data loaded: {len(lb_data_store.raw_df_long_doc)}')
+    lb_data_store.leaderboard_df_long_doc = lb_data_store.raw_df_long_doc.copy()
+    shown_columns_long_doc, types_long_doc = get_default_cols(
+        'long-doc', lb_data_store.slug, add_fix_cols=True)
+    lb_data_store.types_long_doc = types_long_doc
+    lb_data_store.leaderboard_df_long_doc = \
+        lb_data_store.leaderboard_df_long_doc[
+            ~lb_data_store.leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][shown_columns_long_doc]
+    lb_data_store.leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
+
+    lb_data_store.reranking_models = sorted(
+        list(frozenset([eval_result.reranking_model for eval_result in lb_data_store.raw_data])))
+    return lb_data_store
+
+
+def load_eval_results(file_path: str):
+    output = {}
+    # versions = BENCHMARK_VERSION_LIST
+    for version in BENCHMARK_VERSION_LIST:
+        fn = f"{file_path}/{version}"
+        output[version] = load_leaderboard_datastore(fn, version)
+    return output
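A sketch of the intended entry point, assuming the evaluation-results dataset has already been downloaded to EVAL_RESULTS_PATH with one sub-folder per benchmark version (the printed attributes come from the LeaderboardDataStore dataclass in src/models.py):

from src.envs import EVAL_RESULTS_PATH, LATEST_BENCHMARK_VERSION
from src.loaders import load_eval_results

data = load_eval_results(EVAL_RESULTS_PATH)      # {version: LeaderboardDataStore}
datastore = data[LATEST_BENCHMARK_VERSION]
print(datastore.version, datastore.slug)         # e.g. "AIR-Bench_24.04", "2404"
print(datastore.leaderboard_df_qa.shape)         # filtered QA table shown in the UI
print(datastore.reranking_models)                # choices for the reranker dropdown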
src/{read_evals.py → models.py} RENAMED
@@ -1,38 +1,15 @@
 import json
-import os.path
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import List
+from typing import List, Optional

 import pandas as pd

 from src.benchmarks import get_safe_name
-from src.display.utils import (
-    COL_NAME_RERANKING_MODEL,
-    COL_NAME_RETRIEVAL_MODEL,
-    COL_NAME_RERANKING_MODEL_LINK,
-    COL_NAME_RETRIEVAL_MODEL_LINK,
-    COL_NAME_REVISION,
-    COL_NAME_TIMESTAMP,
-    COL_NAME_IS_ANONYMOUS,
-    COLS_QA,
-    QA_BENCHMARK_COLS,
-    COLS_LONG_DOC,
-    LONG_DOC_BENCHMARK_COLS,
-    COL_NAME_AVG,
-    COL_NAME_RANK
-)
-
+from src.envs import COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL_LINK, \
+    COL_NAME_RERANKING_MODEL_LINK, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
 from src.display.formatting import make_clickable_model

-pd.options.mode.copy_on_write = True
-
-def calculate_mean(row):
-    if pd.isna(row).any():
-        return -1
-    else:
-        return row.mean()
-

 @dataclass
 class EvalResult:
@@ -149,80 +126,15 @@ class FullEvalResult:
         return [v for v in results.values()]


-def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
-    """
-    Load the evaluation results from a json file
-    """
-    model_result_filepaths = []
-    for root, dirs, files in os.walk(results_path):
-        if len(files) == 0:
-            continue
-
-        # select the latest results
-        for file in files:
-            if not (file.startswith("results") and file.endswith(".json")):
-                print(f'skip {file}')
-                continue
-            model_result_filepaths.append(os.path.join(root, file))
-
-    eval_results = {}
-    for model_result_filepath in model_result_filepaths:
-        # create evaluation results
-        try:
-            eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
-        except UnicodeDecodeError as e:
-            print(f"loading file failed. {model_result_filepath}")
-            continue
-        print(f'file loaded: {model_result_filepath}')
-        timestamp = eval_result.timestamp
-        eval_results[timestamp] = eval_result
-
-    results = []
-    for k, v in eval_results.items():
-        try:
-            v.to_dict()
-            results.append(v)
-        except KeyError:
-            print(f"loading failed: {k}")
-            continue
-    return results
-
-
-def get_leaderboard_df(raw_data: List[FullEvalResult], task: str, metric: str) -> pd.DataFrame:
-    """
-    Creates a dataframe from all the individual experiment results
-    """
-    cols = [COL_NAME_IS_ANONYMOUS, ]
-    if task == "qa":
-        cols += COLS_QA
-        benchmark_cols = QA_BENCHMARK_COLS
-    elif task == "long-doc":
-        cols += COLS_LONG_DOC
-        benchmark_cols = LONG_DOC_BENCHMARK_COLS
-    else:
-        raise NotImplemented
-    all_data_json = []
-    for v in raw_data:
-        all_data_json += v.to_dict(task=task, metric=metric)
-    df = pd.DataFrame.from_records(all_data_json)
-    # print(f'dataframe created: {df.shape}')
-
-    _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
-
-    # calculate the average score for selected benchmarks
-    df[COL_NAME_AVG] = df[list(_benchmark_cols)].apply(calculate_mean, axis=1).round(decimals=2)
-    df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
-    df.reset_index(inplace=True, drop=True)
-
-    _cols = frozenset(cols).intersection(frozenset(df.columns.to_list()))
-    df = df[_cols].round(decimals=2)
-
-    # filter out if any of the benchmarks have not been produced
-    df[COL_NAME_RANK] = df[COL_NAME_AVG].rank(ascending=False, method="min")
-
-    # shorten the revision
-    df[COL_NAME_REVISION] = df[COL_NAME_REVISION].str[:6]
-
-    # # replace "0" with "-" for average score
-    # df[COL_NAME_AVG] = df[COL_NAME_AVG].replace(0, "-")
-    return df
+@dataclass
+class LeaderboardDataStore:
+    version: str
+    slug: str
+    raw_data: Optional[list]
+    raw_df_qa: Optional[pd.DataFrame]
+    raw_df_long_doc: Optional[pd.DataFrame]
+    leaderboard_df_qa: Optional[pd.DataFrame]
+    leaderboard_df_long_doc: Optional[pd.DataFrame]
+    reranking_models: Optional[list]
+    types_qa: Optional[list]
+    types_long_doc: Optional[list]
src/utils.py CHANGED
@@ -2,20 +2,24 @@ import json
2
  import hashlib
3
  from datetime import datetime, timezone
4
  from pathlib import Path
5
- from typing import List
6
 
7
  import pandas as pd
8
 
9
- from src.benchmarks import BENCHMARK_COLS_QA, BENCHMARK_COLS_LONG_DOC, BenchmarksQA, BenchmarksLongDoc
10
  from src.display.formatting import styled_message, styled_error
11
- from src.display.utils import COLS_QA, TYPES_QA, COLS_LONG_DOC, TYPES_LONG_DOC, COL_NAME_RANK, COL_NAME_AVG, \
12
- COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL, COL_NAME_IS_ANONYMOUS, COL_NAME_TIMESTAMP, COL_NAME_REVISION, get_default_auto_eval_column_dict
13
- from src.envs import API, SEARCH_RESULTS_REPO, LATEST_BENCHMARK_VERSION
14
- from src.read_evals import FullEvalResult, get_leaderboard_df, calculate_mean
15
 
16
  import re
17
 
18
 
 
 
 
 
 
 
19
  def remove_html(input_str):
20
  # Regular expression for finding HTML tags
21
  clean = re.sub(r'<.*?>', '', input_str)
@@ -55,67 +59,61 @@ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
55
  return df[(df[COL_NAME_RETRIEVAL_MODEL].str.contains(query, case=False))]
56
 
57
 
58
- def get_default_cols(task: str, columns: list=[], add_fix_cols: bool=True) -> list:
59
  cols = []
60
  types = []
61
  if task == "qa":
62
- cols_list = COLS_QA
63
- types_list = TYPES_QA
64
- benchmark_list = BENCHMARK_COLS_QA
65
  elif task == "long-doc":
66
- cols_list = COLS_LONG_DOC
67
- types_list = TYPES_LONG_DOC
68
- benchmark_list = BENCHMARK_COLS_LONG_DOC
69
  else:
70
  raise NotImplemented
 
 
71
  for col_name, col_type in zip(cols_list, types_list):
72
  if col_name not in benchmark_list:
73
  continue
74
- if len(columns) > 0 and col_name not in columns:
75
- continue
76
  cols.append(col_name)
77
  types.append(col_type)
78
 
79
  if add_fix_cols:
80
  _cols = []
81
  _types = []
 
82
  for col_name, col_type in zip(cols, types):
83
- if col_name in FIXED_COLS:
84
  continue
85
  _cols.append(col_name)
86
  _types.append(col_type)
87
- cols = FIXED_COLS + _cols
88
- types = FIXED_COLS_TYPES + _types
89
  return cols, types
90
 
91
 
92
- fixed_cols = get_default_auto_eval_column_dict()[:-3]
93
-
94
- FIXED_COLS = [c.name for _, _, c in fixed_cols]
95
- FIXED_COLS_TYPES = [c.type for _, _, c in fixed_cols]
96
-
97
-
98
  def select_columns(
99
  df: pd.DataFrame,
100
  domain_query: list,
101
  language_query: list,
102
  task: str = "qa",
103
- reset_ranking: bool = True
 
104
  ) -> pd.DataFrame:
105
- cols, _ = get_default_cols(task=task, columns=df.columns, add_fix_cols=False)
106
  selected_cols = []
107
  for c in cols:
108
  if task == "qa":
109
- eval_col = BenchmarksQA[c].value
110
  elif task == "long-doc":
111
- eval_col = BenchmarksLongDoc[c].value
112
  if eval_col.domain not in domain_query:
113
  continue
114
  if eval_col.lang not in language_query:
115
  continue
116
  selected_cols.append(c)
117
  # We use COLS to maintain sorting
118
- filtered_df = df[FIXED_COLS + selected_cols]
 
 
119
  if reset_ranking:
120
  filtered_df[COL_NAME_AVG] = filtered_df[selected_cols].apply(calculate_mean, axis=1).round(decimals=2)
121
  filtered_df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
@@ -124,9 +122,17 @@ def select_columns(
124
 
125
  return filtered_df
126
 
 
 
 
 
 
 
 
127
 
128
  def _update_table(
129
  task: str,
 
130
  hidden_df: pd.DataFrame,
131
  domains: list,
132
  langs: list,
@@ -136,32 +142,20 @@ def _update_table(
136
  reset_ranking: bool = True,
137
  show_revision_and_timestamp: bool = False
138
  ):
 
139
  filtered_df = hidden_df.copy()
140
  if not show_anonymous:
141
  filtered_df = filtered_df[~filtered_df[COL_NAME_IS_ANONYMOUS]]
142
  filtered_df = filter_models(filtered_df, reranking_query)
143
  filtered_df = filter_queries(query, filtered_df)
144
- filtered_df = select_columns(filtered_df, domains, langs, task, reset_ranking)
145
  if not show_revision_and_timestamp:
146
  filtered_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
147
  return filtered_df
148
 
149
 
150
- def update_table(
151
- hidden_df: pd.DataFrame,
152
- domains: list,
153
- langs: list,
154
- reranking_query: list,
155
- query: str,
156
- show_anonymous: bool,
157
- show_revision_and_timestamp: bool = False,
158
- reset_ranking: bool = True
159
- ):
160
- return _update_table(
161
- "qa", hidden_df, domains, langs, reranking_query, query, show_anonymous, reset_ranking, show_revision_and_timestamp)
162
-
163
-
164
  def update_table_long_doc(
 
165
  hidden_df: pd.DataFrame,
166
  domains: list,
167
  langs: list,
@@ -173,11 +167,13 @@ def update_table_long_doc(
173
 
174
  ):
175
  return _update_table(
176
- "long-doc", hidden_df, domains, langs, reranking_query, query, show_anonymous, reset_ranking, show_revision_and_timestamp)
 
 
177
 
178
 
179
  def update_metric(
180
- raw_data: List[FullEvalResult],
181
  task: str,
182
  metric: str,
183
  domains: list,
@@ -187,9 +183,12 @@ def update_metric(
187
  show_anonymous: bool = False,
188
  show_revision_and_timestamp: bool = False,
189
  ) -> pd.DataFrame:
 
190
  if task == 'qa':
191
- leaderboard_df = get_leaderboard_df(raw_data, task=task, metric=metric)
 
192
  return update_table(
 
193
  leaderboard_df,
194
  domains,
195
  langs,
@@ -199,8 +198,10 @@ def update_metric(
199
  show_revision_and_timestamp
200
  )
201
  elif task == "long-doc":
202
- leaderboard_df = get_leaderboard_df(raw_data, task=task, metric=metric)
 
203
  return update_table_long_doc(
 
204
  leaderboard_df,
205
  domains,
206
  langs,
@@ -218,7 +219,6 @@ def upload_file(filepath: str):
218
  return filepath
219
 
220
 
221
-
222
  def get_iso_format_timestamp():
223
  # Get the current timestamp with UTC as the timezone
224
  current_timestamp = datetime.now(timezone.utc)
@@ -316,3 +316,95 @@ def submit_results(
316
  def reset_rank(df):
317
  df[COL_NAME_RANK] = df[COL_NAME_AVG].rank(ascending=False, method="min")
318
  return df
 
2
  import hashlib
3
  from datetime import datetime, timezone
4
  from pathlib import Path
 
5
 
6
  import pandas as pd
7
 
8
+ from src.benchmarks import QABenchmarks, LongDocBenchmarks
9
  from src.display.formatting import styled_message, styled_error
10
+ from src.display.columns import get_default_col_names_and_types, get_fixed_col_names_and_types
11
+ from src.envs import API, SEARCH_RESULTS_REPO, LATEST_BENCHMARK_VERSION, COL_NAME_AVG, COL_NAME_RETRIEVAL_MODEL, \
12
+ COL_NAME_RERANKING_MODEL, COL_NAME_RANK, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
 
13
 
14
  import re
15
 
16
 
17
+ def calculate_mean(row):
18
+ if pd.isna(row).any():
19
+ return -1
20
+ else:
21
+ return row.mean()
22
+
23
  def remove_html(input_str):
24
  # Regular expression for finding HTML tags
25
  clean = re.sub(r'<.*?>', '', input_str)
 
59
  return df[(df[COL_NAME_RETRIEVAL_MODEL].str.contains(query, case=False))]
60
 
61
 
62
+ def get_default_cols(task: str, version_slug, add_fix_cols: bool=True) -> tuple:
63
  cols = []
64
  types = []
65
  if task == "qa":
66
+ benchmarks = QABenchmarks[version_slug]
67
  elif task == "long-doc":
68
+ benchmarks = LongDocBenchmarks[version_slug]
69
  else:
70
  raise NotImplemented
71
+ cols_list, types_list = get_default_col_names_and_types(benchmarks)
72
+ benchmark_list = [c.value.col_name for c in list(benchmarks.value)]
73
  for col_name, col_type in zip(cols_list, types_list):
74
  if col_name not in benchmark_list:
75
  continue
76
  cols.append(col_name)
77
  types.append(col_type)
78
 
79
  if add_fix_cols:
80
  _cols = []
81
  _types = []
82
+ fixed_cols, fixed_cols_types = get_fixed_col_names_and_types()
83
  for col_name, col_type in zip(cols, types):
84
+ if col_name in fixed_cols:
85
  continue
86
  _cols.append(col_name)
87
  _types.append(col_type)
88
+ cols = fixed_cols + _cols
89
+ types = fixed_cols_types + _types
90
  return cols, types
91
 
92
 
93
  def select_columns(
94
  df: pd.DataFrame,
95
  domain_query: list,
96
  language_query: list,
97
  task: str = "qa",
98
+ reset_ranking: bool = True,
99
+ version_slug: str = None
100
  ) -> pd.DataFrame:
101
+ cols, _ = get_default_cols(task=task, version_slug=version_slug, add_fix_cols=False)
102
  selected_cols = []
103
  for c in cols:
104
  if task == "qa":
105
+ eval_col = QABenchmarks[version_slug].value[c].value
106
  elif task == "long-doc":
107
+ eval_col = LongDocBenchmarks[version_slug].value[c].value
108
  if eval_col.domain not in domain_query:
109
  continue
110
  if eval_col.lang not in language_query:
111
  continue
112
  selected_cols.append(c)
113
  # We use COLS to maintain sorting
114
+ fixed_cols, _ = get_fixed_col_names_and_types()
115
+ filtered_df = df[fixed_cols + selected_cols]
116
+ filtered_df.replace({"": pd.NA}, inplace=True)
117
  if reset_ranking:
118
  filtered_df[COL_NAME_AVG] = filtered_df[selected_cols].apply(calculate_mean, axis=1).round(decimals=2)
119
  filtered_df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
 
122
 
123
  return filtered_df
124
 
125
+ def get_safe_name(name: str):
126
+ """Get RFC 1123 compatible safe name"""
127
+ name = name.replace('-', '_')
128
+ return ''.join(
129
+ character.lower()
130
+ for character in name
131
+ if (character.isalnum() or character == '_'))
132
 
133
  def _update_table(
134
  task: str,
135
+ version: str,
136
  hidden_df: pd.DataFrame,
137
  domains: list,
138
  langs: list,
 
142
  reset_ranking: bool = True,
143
  show_revision_and_timestamp: bool = False
144
  ):
145
+ version_slug = get_safe_name(version)[-4:]
146
  filtered_df = hidden_df.copy()
147
  if not show_anonymous:
148
  filtered_df = filtered_df[~filtered_df[COL_NAME_IS_ANONYMOUS]]
149
  filtered_df = filter_models(filtered_df, reranking_query)
150
  filtered_df = filter_queries(query, filtered_df)
151
+ filtered_df = select_columns(filtered_df, domains, langs, task, reset_ranking, version_slug)
152
  if not show_revision_and_timestamp:
153
  filtered_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
154
  return filtered_df
155
 
156
 
157
  def update_table_long_doc(
158
+ version: str,
159
  hidden_df: pd.DataFrame,
160
  domains: list,
161
  langs: list,
 
167
 
168
  ):
169
  return _update_table(
170
+ "long-doc",
171
+ version,
172
+ hidden_df, domains, langs, reranking_query, query, show_anonymous, reset_ranking, show_revision_and_timestamp)
173
 
174
 
175
  def update_metric(
176
+ datastore,
177
  task: str,
178
  metric: str,
179
  domains: list,
 
183
  show_anonymous: bool = False,
184
  show_revision_and_timestamp: bool = False,
185
  ) -> pd.DataFrame:
186
+ # raw_data = datastore.raw_data
187
  if task == 'qa':
188
+ leaderboard_df = get_leaderboard_df(datastore, task=task, metric=metric)
189
+ version = datastore.version
190
  return update_table(
191
+ version,
192
  leaderboard_df,
193
  domains,
194
  langs,
 
198
  show_revision_and_timestamp
199
  )
200
  elif task == "long-doc":
201
+ leaderboard_df = get_leaderboard_df(datastore, task=task, metric=metric)
202
+ version = datastore.version
203
  return update_table_long_doc(
204
+ version,
205
  leaderboard_df,
206
  domains,
207
  langs,
 
219
  return filepath
220
 
221
 
 
222
  def get_iso_format_timestamp():
223
  # Get the current timestamp with UTC as the timezone
224
  current_timestamp = datetime.now(timezone.utc)
 
316
  def reset_rank(df):
317
  df[COL_NAME_RANK] = df[COL_NAME_AVG].rank(ascending=False, method="min")
318
  return df
319
+
320
+
321
+ def get_leaderboard_df(datastore, task: str, metric: str) -> pd.DataFrame:
322
+ """
323
+ Creates a dataframe from all the individual experiment results
324
+ """
325
+ raw_data = datastore.raw_data
326
+ cols = [COL_NAME_IS_ANONYMOUS, ]
327
+ if task == "qa":
328
+ benchmarks = QABenchmarks[datastore.slug]
329
+ elif task == "long-doc":
330
+ benchmarks = LongDocBenchmarks[datastore.slug]
331
+ else:
332
+ raise NotImplemented
333
+ cols_qa, _ = get_default_col_names_and_types(benchmarks)
334
+ cols += cols_qa
335
+ benchmark_cols = [t.value.col_name for t in list(benchmarks.value)]
336
+ all_data_json = []
337
+ for v in raw_data:
338
+ all_data_json += v.to_dict(task=task, metric=metric)
339
+ df = pd.DataFrame.from_records(all_data_json)
340
+
341
+ _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
342
+
343
+ # calculate the average score for selected benchmarks
344
+ df[COL_NAME_AVG] = df[list(_benchmark_cols)].apply(calculate_mean, axis=1).round(decimals=2)
345
+ df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
346
+ df.reset_index(inplace=True, drop=True)
347
+
348
+ _cols = frozenset(cols).intersection(frozenset(df.columns.to_list()))
349
+ df = df[_cols].round(decimals=2)
350
+
351
+ # filter out if any of the benchmarks have not been produced
352
+ df[COL_NAME_RANK] = df[COL_NAME_AVG].rank(ascending=False, method="min")
353
+
354
+ # shorten the revision
355
+ df[COL_NAME_REVISION] = df[COL_NAME_REVISION].str[:6]
356
+
357
+ # # replace "0" with "-" for average score
358
+ # df[COL_NAME_AVG] = df[COL_NAME_AVG].replace(0, "-")
359
+ return df
360
+
361
+
362
+ def set_listeners(
363
+ task,
364
+ target_df,
365
+ source_df,
366
+ search_bar,
367
+ version,
368
+ selected_domains,
369
+ selected_langs,
370
+ selected_rerankings,
371
+ show_anonymous,
372
+ show_revision_and_timestamp,
373
+ ):
374
+ if task == "qa":
375
+ update_table_func = update_table
376
+ elif task == "long-doc":
377
+ update_table_func = update_table_long_doc
378
+ else:
379
+ raise NotImplementedError
380
+ selector_list = [
381
+ selected_domains,
382
+ selected_langs,
383
+ selected_rerankings,
384
+ search_bar,
385
+ show_anonymous
386
+ ]
387
+ search_bar_args = [source_df, version,] + selector_list
388
+ selector_args = [version, source_df] + selector_list + [show_revision_and_timestamp,]
389
+ # Set search_bar listener
390
+ search_bar.submit(update_table_func, search_bar_args, target_df)
391
+
392
+ # Set column-wise listener
393
+ for selector in selector_list:
394
+ selector.change(update_table_func, selector_args, target_df, queue=True,)
395
+
396
+ def update_table(
397
+ version: str,
398
+ hidden_df: pd.DataFrame,
399
+ domains: list,
400
+ langs: list,
401
+ reranking_query: list,
402
+ query: str,
403
+ show_anonymous: bool,
404
+ show_revision_and_timestamp: bool = False,
405
+ reset_ranking: bool = True,
406
+ ):
407
+ return _update_table(
408
+ "qa",
409
+ version,
410
+ hidden_df, domains, langs, reranking_query, query, show_anonymous, reset_ranking, show_revision_and_timestamp)
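For context on the version plumbing above: _update_table now derives a four-character version slug from the selected version string via get_safe_name and uses that slug to index the versioned benchmark enums. Below is a minimal standalone sketch of that derivation, re-stating the helper from this diff; the example version string is the one used by the toy data in the tests that follow, and the snippet itself is illustrative, not part of the commit.

def get_safe_name(name: str) -> str:
    """Get an RFC 1123 compatible safe name (same logic as the helper added above)."""
    name = name.replace('-', '_')
    return ''.join(c.lower() for c in name if c.isalnum() or c == '_')

version = "AIR-Bench_24.04"                  # version string used by the toy data in the tests below
version_slug = get_safe_name(version)[-4:]   # same derivation as in _update_table
assert version_slug == "2404"                # the key looked up as QABenchmarks["2404"]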
tests/src/display/test_utils.py CHANGED
@@ -1,5 +1,5 @@
1
  import pytest
2
- from src.display.utils import fields, AutoEvalColumnQA, COLS_QA, COLS_LONG_DOC, COLS_LITE, TYPES_QA, TYPES_LONG_DOC, QA_BENCHMARK_COLS, LONG_DOC_BENCHMARK_COLS, get_default_auto_eval_column_dict
3
 
4
 
5
  def test_fields():
@@ -10,11 +10,8 @@ def test_fields():
10
  def test_macro_variables():
11
  print(f'COLS_QA: {COLS_QA}')
12
  print(f'COLS_LONG_DOC: {COLS_LONG_DOC}')
13
- print(f'COLS_LITE: {COLS_LITE}')
14
  print(f'TYPES_QA: {TYPES_QA}')
15
  print(f'TYPES_LONG_DOC: {TYPES_LONG_DOC}')
16
- print(f'QA_BENCHMARK_COLS: {QA_BENCHMARK_COLS}')
17
- print(f'LONG_DOC_BENCHMARK_COLS: {LONG_DOC_BENCHMARK_COLS}')
18
 
19
 
20
  def test_get_default_auto_eval_column_dict():
 
1
  import pytest
2
+ from src.display.utils import fields, AutoEvalColumnQA, COLS_QA, COLS_LONG_DOC, TYPES_QA, TYPES_LONG_DOC, get_default_auto_eval_column_dict
3
 
4
 
5
  def test_fields():
 
10
  def test_macro_variables():
11
  print(f'COLS_QA: {COLS_QA}')
12
  print(f'COLS_LONG_DOC: {COLS_LONG_DOC}')
 
13
  print(f'TYPES_QA: {TYPES_QA}')
14
  print(f'TYPES_LONG_DOC: {TYPES_LONG_DOC}')
15
 
16
 
17
  def test_get_default_auto_eval_column_dict():
tests/src/test_benchmarks.py CHANGED
@@ -1,9 +1,16 @@
1
- from src.benchmarks import BenchmarksQA, BenchmarksLongDoc
2
 
3
 
4
  def test_qabenchmarks():
5
- print(list(BenchmarksQA))
6
 
7
 
8
  def test_longdocbenchmarks():
9
- print(list(BenchmarksLongDoc))
 
1
+ from src.benchmarks import QABenchmarks, LongDocBenchmarks
2
 
3
 
4
  def test_qabenchmarks():
5
+ for benchmark_list in list(QABenchmarks):
6
+ print(benchmark_list.name)
7
+ for b in list(benchmark_list.value):
8
+ print(b)
9
+ qa_benchmarks = QABenchmarks["2404"]
10
+ l = list(frozenset([c.value.domain for c in list(qa_benchmarks.value)]))
11
+ print(l)
12
+
13
 
14
 
15
  def test_longdocbenchmarks():
16
+ print(list(LongDocBenchmarks))
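The updated test indexes QABenchmarks by a version slug ("2404") and then iterates a nested enum whose member values expose domain, lang and col_name. src/benchmarks.py itself is not shown in this excerpt, so the following is only a minimal sketch of a structure consistent with those access patterns; the Benchmark fields and the functional-Enum construction are assumptions, not the project's actual definitions.

from dataclasses import dataclass
from enum import Enum

@dataclass
class Benchmark:
    name: str       # assumed field set, inferred from .col_name / .domain / .lang usage in the diff
    col_name: str   # leaderboard column name
    domain: str
    lang: str

# Hypothetical per-version benchmark enum with a single entry.
BenchmarksQA2404 = Enum("BenchmarksQA2404", {"wiki_en": Benchmark("wiki_en", "wiki_en", "wiki", "en")})

# Versioned wrapper keyed by slug, matching QABenchmarks["2404"] in the test above.
QABenchmarks = Enum("QABenchmarks", {"2404": BenchmarksQA2404})

domains = {c.value.domain for c in QABenchmarks["2404"].value}   # {"wiki"}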
tests/src/test_read_evals.py CHANGED
@@ -1,6 +1,8 @@
1
  from pathlib import Path
2
 
3
- from src.read_evals import FullEvalResult, get_raw_eval_results, get_leaderboard_df
4
 
5
  cur_fp = Path(__file__)
6
 
@@ -29,7 +31,7 @@ def test_to_dict():
29
 
30
  def test_get_raw_eval_results():
31
  results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
32
- results = get_raw_eval_results(results_path)
33
  # only load the latest results
34
  assert len(results) == 4
35
  assert results[0].eval_name == "bge-base-en-v1.5_NoReranker"
@@ -40,7 +42,7 @@ def test_get_raw_eval_results():
40
 
41
  def test_get_leaderboard_df():
42
  results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
43
- raw_data = get_raw_eval_results(results_path)
44
  df = get_leaderboard_df(raw_data, 'qa', 'ndcg_at_10')
45
  assert df.shape[0] == 4
46
  # the results contain only one embedding model
@@ -55,7 +57,7 @@ def test_get_leaderboard_df():
55
 
56
  def test_get_leaderboard_df_long_doc():
57
  results_path = cur_fp.parents[2] / "toydata" / "test_results"
58
- raw_data = get_raw_eval_results(results_path)
59
  df = get_leaderboard_df(raw_data, 'long-doc', 'ndcg_at_1')
60
  assert df.shape[0] == 2
61
  # the results contain only one embedding model
 
1
  from pathlib import Path
2
 
3
+ from src.read_evals import load_raw_eval_results
4
+ from src.utils import get_leaderboard_df
5
+ from src.models import FullEvalResult
6
 
7
  cur_fp = Path(__file__)
8
 
 
31
 
32
  def test_get_raw_eval_results():
33
  results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
34
+ results = load_raw_eval_results(results_path)
35
  # only load the latest results
36
  assert len(results) == 4
37
  assert results[0].eval_name == "bge-base-en-v1.5_NoReranker"
 
42
 
43
  def test_get_leaderboard_df():
44
  results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
45
+ raw_data = load_raw_eval_results(results_path)
46
  df = get_leaderboard_df(raw_data, 'qa', 'ndcg_at_10')
47
  assert df.shape[0] == 4
48
  # the results contain only one embedding model
 
57
 
58
  def test_get_leaderboard_df_long_doc():
59
  results_path = cur_fp.parents[2] / "toydata" / "test_results"
60
+ raw_data = load_raw_eval_results(results_path)
61
  df = get_leaderboard_df(raw_data, 'long-doc', 'ndcg_at_1')
62
  assert df.shape[0] == 2
63
  # the results contain only one embedding model
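The refactor splits result loading (load_raw_eval_results in src.read_evals) from table building (get_leaderboard_df, now in src.utils), and the new get_leaderboard_df and update_metric shown earlier read raw_data, slug and version off a datastore object. That class is defined outside this excerpt, so the stand-in below only models the attributes the diff actually accesses; the class name and example values are hypothetical.

from dataclasses import dataclass, field

@dataclass
class LeaderboardDataStore:                        # hypothetical stand-in, not the project's real class
    version: str                                   # e.g. "AIR-Bench_24.04"
    slug: str                                      # e.g. "2404"; used to index QABenchmarks / LongDocBenchmarks
    raw_data: list = field(default_factory=list)   # FullEvalResult objects from load_raw_eval_results

# get_leaderboard_df(datastore, task='qa', metric='ndcg_at_10') would read these attributes.
datastore = LeaderboardDataStore(version="AIR-Bench_24.04", slug="2404")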
tests/test_utils.py CHANGED
@@ -1,8 +1,10 @@
1
  import pandas as pd
2
  import pytest
3
 
4
- from src.utils import filter_models, search_table, filter_queries, select_columns, update_table_long_doc, get_iso_format_timestamp, get_default_cols, update_table
5
- from src.display.utils import COL_NAME_IS_ANONYMOUS, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL, COL_NAME_RANK, COL_NAME_AVG
6
 
7
 
8
  @pytest.fixture
 
1
  import pandas as pd
2
  import pytest
3
 
4
+ from src.utils import filter_models, search_table, filter_queries, select_columns, update_table_long_doc, get_iso_format_timestamp, get_default_cols
5
+ from app import update_table
6
+ from src.envs import COL_NAME_AVG, COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RANK, COL_NAME_REVISION, \
7
+ COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
8
 
9
 
10
  @pytest.fixture