Space: AIR-Bench

nan committed
Commit 2508d96
Parent: ebf3ceb

refactor: refactor the data loading part

Files changed (1): app.py (+56 -37)
app.py CHANGED
@@ -82,8 +82,8 @@ from typing import Optional
 @dataclass
 class LeaderboardDataStore:
     raw_data: Optional[list]
-    raw_qa_df: Optional[pd.DataFrame]
-    original_df_long_doc: Optional[pd.DataFrame]
+    raw_df_qa: Optional[pd.DataFrame]
+    raw_df_long_doc: Optional[pd.DataFrame]
     leaderboard_df_qa: Optional[pd.DataFrame]
     leaderboard_df_long_doc: Optional[pd.DataFrame]
     reranking_models: Optional[list]
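
The rename makes the two raw-frame fields follow the same raw_df_* scheme as the leaderboard_df_* pair. For reference, the full store after this hunk looks roughly like the sketch below; the types_qa field is not visible in the diff but is implied by the eight positional Nones passed to the constructor further down, and the default values are an editor's addition rather than part of the commit:

    from dataclasses import dataclass
    from typing import Optional

    import pandas as pd


    @dataclass
    class LeaderboardDataStore:
        raw_data: Optional[list] = None                    # parsed eval results
        raw_df_qa: Optional[pd.DataFrame] = None           # unfiltered QA frame
        raw_df_long_doc: Optional[pd.DataFrame] = None     # unfiltered Long-Doc frame
        leaderboard_df_qa: Optional[pd.DataFrame] = None   # filtered view shown in the UI
        leaderboard_df_long_doc: Optional[pd.DataFrame] = None
        reranking_models: Optional[list] = None
        types_qa: Optional[list] = None                    # assumed field, not shown in the hunk
        types_long_doc: Optional[list] = None

    # With defaults in place, LeaderboardDataStore() could replace the
    # eight-argument LeaderboardDataStore(None, None, None, None, None, None, None, None).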
@@ -91,41 +91,52 @@ class LeaderboardDataStore:
     types_long_doc: Optional[list]
 
 
+def load_leaderboard_data(file_path) -> LeaderboardDataStore:
+    lb_data_store = LeaderboardDataStore(None, None, None, None, None, None, None, None)
+    lb_data_store.raw_data = get_raw_eval_results(file_path)
+    print(f'raw data: {len(lb_data_store.raw_data)}')
+
+    lb_data_store.raw_df_qa = get_leaderboard_df(
+        lb_data_store.raw_data, task='qa', metric=DEFAULT_METRIC_QA)
+    lb_data_store.leaderboard_df_qa = lb_data_store.raw_df_qa.copy()
+    # leaderboard_df_qa = leaderboard_df_qa[has_no_nan_values(df, _benchmark_cols)]
+    print(f'QA data loaded: {lb_data_store.raw_df_qa.shape}')
+    shown_columns_qa, types_qa = get_default_cols(
+        'qa', lb_data_store.leaderboard_df_qa.columns, add_fix_cols=True)
+    lb_data_store.types_qa = types_qa
+    lb_data_store.leaderboard_df_qa = \
+        lb_data_store.leaderboard_df_qa[~lb_data_store.leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
+    lb_data_store.leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
+
+    lb_data_store.raw_df_long_doc = get_leaderboard_df(
+        lb_data_store.raw_data, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
+    print(f'Long-Doc data loaded: {len(lb_data_store.raw_df_long_doc)}')
+    lb_data_store.leaderboard_df_long_doc = lb_data_store.raw_df_long_doc.copy()
+    shown_columns_long_doc, types_long_doc = get_default_cols(
+        'long-doc', lb_data_store.leaderboard_df_long_doc.columns, add_fix_cols=True)
+    lb_data_store.types_long_doc = types_long_doc
+    lb_data_store.leaderboard_df_long_doc = \
+        lb_data_store.leaderboard_df_long_doc[~lb_data_store.leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][
+            shown_columns_long_doc]
+    lb_data_store.leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
+
+    lb_data_store.reranking_models = sorted(
+        list(frozenset([eval_result.reranking_model for eval_result in lb_data_store.raw_data])))
+    return lb_data_store
+
+
 def load_eval_results(file_path: str):
     output = {}
     versions = ("AIR-Bench_24.04",)
     for version in versions:
-        output[version] = LeaderboardDataStore(None, None, None, None, None, None, None, None)
-        output[version].raw_data = get_raw_eval_results(f"{file_path}/{version}")
-        output[version].raw_qa_df = get_leaderboard_df(
-            output[version].raw_data, task='qa', metric=DEFAULT_METRIC_QA)
-        output[version].original_df_long_doc = get_leaderboard_df(
-            output[version].raw_data, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
-        print(f'raw data: {len(output[version].raw_data)}')
-        print(f'QA data loaded: {output[version].raw_qa_df.shape}')
-        print(f'Long-Doc data loaded: {len(output[version].original_df_long_doc)}')
-
-        output[version].leaderboard_df_qa = output[version].raw_qa_df.copy()
-        # leaderboard_df_qa = leaderboard_df_qa[has_no_nan_values(df, _benchmark_cols)]
-        shown_columns_qa, types_qa = get_default_cols(
-            'qa', output[version].leaderboard_df_qa.columns, add_fix_cols=True)
-        output[version].types_qa = types_qa
-        output[version].leaderboard_df_qa = output[version].leaderboard_df_qa[~output[version].leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
-        output[version].leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
-
-        output[version].leaderboard_df_long_doc = output[version].original_df_long_doc.copy()
-        shown_columns_long_doc, types_long_doc = get_default_cols(
-            'long-doc', output[version].leaderboard_df_long_doc.columns, add_fix_cols=True)
-        output[version].types_long_doc = types_long_doc
-        output[version].leaderboard_df_long_doc = output[version].leaderboard_df_long_doc[~output[version].leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][shown_columns_long_doc]
-        output[version].leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
-
-        output[version].reranking_models = sorted(list(frozenset([eval_result.reranking_model for eval_result in output[version].raw_data])))
+        fn = f"{file_path}/{version}"
+        output[version] = load_leaderboard_data(fn)
     return output
 
 
 data = load_eval_results(EVAL_RESULTS_PATH)
 
+
 def update_metric_qa(
     metric: str,
     domains: list,
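
With the per-version work extracted into load_leaderboard_data, load_eval_results shrinks to a loop that maps version names to stores, and a single version can now be loaded in isolation. A minimal usage sketch (editor's illustration, not part of the commit; it assumes app.py's module-level names such as EVAL_RESULTS_PATH are in scope):

    # Build the store for one benchmark version and inspect it.
    store = load_leaderboard_data(f"{EVAL_RESULTS_PATH}/AIR-Bench_24.04")
    print(store.reranking_models)           # sorted, de-duplicated reranker names
    print(store.leaderboard_df_qa.columns)  # revision/timestamp columns already dropped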
@@ -133,9 +144,11 @@ def update_metric_qa(
     reranking_model: list,
     query: str,
     show_anonymous: bool,
-    show_revision_and_timestamp,
+    show_revision_and_timestamp: bool,
+    selected_version: str,
 ):
-    return update_metric(data["AIR-Bench_24.04"].raw_data, 'qa', metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
+    return update_metric(data[selected_version].raw_data, 'qa', metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
+
 
 def update_metric_long_doc(
     metric: str,
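
The callback now receives the version key as selected_version instead of hard-coding "AIR-Bench_24.04", so one function can serve any entry in data. The component that supplies the value is outside this diff; a plausible wiring sketch, with the dropdown itself hypothetical and inferred from the selected_version entries added to the listener input lists below:

    import gradio as gr

    # Hypothetical version selector; its value would be passed through the
    # listener inputs to update_metric_qa as the selected_version argument.
    selected_version = gr.Dropdown(
        choices=list(data.keys()),  # e.g. ["AIR-Bench_24.04"]
        value="AIR-Bench_24.04",
        label="Benchmark version",
    )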
@@ -188,7 +201,7 @@ with demo:
             selected_rerankings = get_reranking_dropdown(data["AIR-Bench_24.04"].reranking_models)
             leaderboard_table = get_leaderboard_table(data["AIR-Bench_24.04"].leaderboard_df_qa, data["AIR-Bench_24.04"].types_qa)
             # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = get_leaderboard_table(data["AIR-Bench_24.04"].raw_qa_df, data["AIR-Bench_24.04"].types_qa, visible=False)
+            hidden_leaderboard_table_for_search = get_leaderboard_table(data["AIR-Bench_24.04"].raw_df_qa, data["AIR-Bench_24.04"].types_qa, visible=False)
 
             set_listeners(
                 "qa",
@@ -213,10 +226,13 @@ with demo:
                     search_bar,
                     show_anonymous,
                     show_revision_and_timestamp,
+                    selected_version,
                 ],
                 leaderboard_table,
                 queue=True
             )
+
+            """
             with gr.TabItem("Retrieval Only", id=11):
                 with gr.Row():
                     with gr.Column(scale=1):
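
Note the added """ after the QA listener: together with the matching """ inserted just before the if __name__ block at the end of this diff, it turns everything in between (the Retrieval Only, Reranking Only, Long-Doc, and About tabs) into a single unused string literal, i.e. the tabs are commented out rather than deleted. The pattern in miniature:

    x = 1
    """
    x = 2  # never runs: everything between the markers is one string expression
    """
    print(x)  # prints 1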
@@ -227,7 +243,7 @@ with demo:
                 lb_df_retriever = reset_rank(lb_df_retriever)
                 lb_table_retriever = get_leaderboard_table(lb_df_retriever, data["AIR-Bench_24.04"].types_qa)
                 # Dummy leaderboard for handling the case when the user uses backspace key
-                hidden_lb_df_retriever = data["AIR-Bench_24.04"].raw_qa_df[data["AIR-Bench_24.04"].raw_qa_df[COL_NAME_RERANKING_MODEL] == "NoReranker"]
+                hidden_lb_df_retriever = data["AIR-Bench_24.04"].raw_df_qa[data["AIR-Bench_24.04"].raw_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
                 hidden_lb_df_retriever = reset_rank(hidden_lb_df_retriever)
                 hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_df_retriever, data["AIR-Bench_24.04"].types_qa, visible=False)
 
@@ -254,6 +270,7 @@ with demo:
                     search_bar_retriever,
                     show_anonymous,
                     show_revision_and_timestamp,
+                    selected_version,
                 ],
                 lb_table_retriever,
                 queue=True
@@ -268,7 +285,7 @@ with demo:
                     with gr.Column(scale=1):
                         search_bar_reranker = gr.Textbox(show_label=False, visible=False)
                 lb_table_reranker = get_leaderboard_table(lb_df_reranker, data["AIR-Bench_24.04"].types_qa)
-                hidden_lb_df_reranker = data["AIR-Bench_24.04"].raw_qa_df[data["AIR-Bench_24.04"].raw_qa_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
+                hidden_lb_df_reranker = data["AIR-Bench_24.04"].raw_df_qa[data["AIR-Bench_24.04"].raw_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
                 hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
                 hidden_lb_table_reranker = get_leaderboard_table(
                     hidden_lb_df_reranker, data["AIR-Bench_24.04"].types_qa, visible=False
@@ -296,6 +313,7 @@ with demo:
                     search_bar_reranker,
                     show_anonymous,
                     show_revision_and_timestamp,
+                    selected_version,
                 ],
                 lb_table_reranker,
                 queue=True
@@ -334,7 +352,7 @@ with demo:
 
             # Dummy leaderboard for handling the case when the user uses backspace key
             hidden_lb_table_for_search = get_leaderboard_table(
-                data["AIR-Bench_24.04"].original_df_long_doc, data["AIR-Bench_24.04"].types_long_doc, visible=False
+                data["AIR-Bench_24.04"].raw_df_long_doc, data["AIR-Bench_24.04"].types_long_doc, visible=False
             )
 
             set_listeners(
@@ -374,8 +392,8 @@ with demo:
                     data["AIR-Bench_24.04"].leaderboard_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
                 ]
                 lb_df_retriever_long_doc = reset_rank(lb_df_retriever_long_doc)
-                hidden_lb_db_retriever_long_doc = data["AIR-Bench_24.04"].original_df_long_doc[
-                    data["AIR-Bench_24.04"].original_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
+                hidden_lb_db_retriever_long_doc = data["AIR-Bench_24.04"].raw_df_long_doc[
+                    data["AIR-Bench_24.04"].raw_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
                 ]
                 hidden_lb_db_retriever_long_doc = reset_rank(hidden_lb_db_retriever_long_doc)
                 lb_table_retriever_long_doc = get_leaderboard_table(
@@ -422,7 +440,7 @@ with demo:
                     with gr.Column(scale=1):
                         search_bar_reranker_ldoc = gr.Textbox(show_label=False, visible=False)
                 lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, data["AIR-Bench_24.04"].types_long_doc)
-                hidden_lb_df_reranker_ldoc = data["AIR-Bench_24.04"].original_df_long_doc[data["AIR-Bench_24.04"].original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
+                hidden_lb_df_reranker_ldoc = data["AIR-Bench_24.04"].raw_df_long_doc[data["AIR-Bench_24.04"].raw_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
                 hidden_lb_df_reranker_ldoc = reset_rank(hidden_lb_df_reranker_ldoc)
                 hidden_lb_table_reranker_ldoc = get_leaderboard_table(
                     hidden_lb_df_reranker_ldoc, data["AIR-Bench_24.04"].types_long_doc, visible=False
@@ -521,6 +539,7 @@ with demo:
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
             gr.Markdown(BENCHMARKS_TEXT, elem_classes="markdown-text")
+    """
 
 if __name__ == "__main__":
     scheduler = BackgroundScheduler()
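
After this commit, a quick smoke test of the refactored loader could look like the sketch below (editor's illustration, not part of the commit; it assumes app.py's module context, where data and the COL_NAME_* constants are defined):

    store = data["AIR-Bench_24.04"]
    # load_leaderboard_data drops the revision/timestamp columns from both views...
    assert COL_NAME_REVISION not in store.leaderboard_df_qa.columns
    assert COL_NAME_TIMESTAMP not in store.leaderboard_df_long_doc.columns
    # ...and builds a sorted, de-duplicated reranker list.
    assert store.reranking_models == sorted(set(store.reranking_models))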