Spaces:
AIR-Bench
/
Running on CPU Upgrade

nan committed on
Commit
8a1daf9
1 Parent(s): 3b83af7

chore: clean up the requests-related code

Browse files
Files changed (3) hide show
  1. app.py +20 -75
  2. src/about.py +1 -1
  3. tests/src/leaderboard/test_read_evals.py +3 -2
app.py CHANGED
@@ -4,23 +4,14 @@ from huggingface_hub import snapshot_download
4
 
5
  from src.about import (
6
  INTRODUCTION_TEXT,
7
- LLM_BENCHMARKS_TEXT,
8
  TITLE,
9
  EVALUATION_QUEUE_TEXT
10
  )
11
  from src.display.css_html_js import custom_css
12
- from src.display.utils import (
13
- QA_BENCHMARK_COLS,
14
- LONG_DOC_BENCHMARK_COLS,
15
- COLS_QA,
16
- COLS_LONG_DOC,
17
- EVAL_COLS,
18
- TYPES,
19
- AutoEvalColumnQA,
20
- fields
21
- )
22
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
23
- from src.populate import get_leaderboard_df, get_evaluation_queue_df
24
  from utils import update_table, update_metric, update_table_long_doc, upload_file
25
  from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, metric_list
26
 
@@ -28,14 +19,6 @@ from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, L
28
  def restart_space():
29
  API.restart_space(repo_id=REPO_ID)
30
 
31
- # try:
32
- # print(EVAL_REQUESTS_PATH)
33
- # snapshot_download(
34
- # repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
35
- # token=TOKEN
36
- # )
37
- # except Exception:
38
- # restart_space()
39
  # try:
40
  # print(EVAL_RESULTS_PATH)
41
  # snapshot_download(
@@ -45,17 +28,18 @@ def restart_space():
45
  # except Exception:
46
  # restart_space()
47
 
48
- from src.leaderboard.read_evals import get_raw_eval_results
49
- raw_data_qa = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
50
- original_df_qa = get_leaderboard_df(raw_data_qa, COLS_QA, QA_BENCHMARK_COLS, task='qa', metric='ndcg_at_3')
51
- original_df_long_doc = get_leaderboard_df(raw_data_qa, COLS_LONG_DOC, LONG_DOC_BENCHMARK_COLS, task='long_doc', metric='ndcg_at_3')
52
- print(f'raw data: {len(raw_data_qa)}')
 
 
53
  print(f'QA data loaded: {original_df_qa.shape}')
54
  print(f'Long-Doc data loaded: {len(original_df_long_doc)}')
55
 
56
- leaderboard_df = original_df_qa.copy()
57
  leaderboard_df_long_doc = original_df_long_doc.copy()
58
- print(leaderboard_df_long_doc.head())
59
 
60
 
61
  def update_metric_qa(
@@ -65,7 +49,7 @@ def update_metric_qa(
65
  reranking_model: list,
66
  query: str,
67
  ):
68
- return update_metric(raw_data_qa, 'qa', metric, domains, langs, reranking_model, query)
69
 
70
  def update_metric_long_doc(
71
  metric: str,
@@ -74,14 +58,7 @@ def update_metric_long_doc(
74
  reranking_model: list,
75
  query: str,
76
  ):
77
- return update_metric(raw_data_qa, 'long_doc', metric, domains, langs, reranking_model, query)
78
-
79
-
80
- (
81
- finished_eval_queue_df,
82
- running_eval_queue_df,
83
- pending_eval_queue_df,
84
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
85
 
86
 
87
  demo = gr.Blocks(css=custom_css)
@@ -128,7 +105,7 @@ with demo:
128
  interactive=True
129
  )
130
  # select reranking model
131
- reranking_models = list(frozenset([eval_result.reranking_model for eval_result in raw_data_qa]))
132
  with gr.Row():
133
  selected_rerankings = gr.CheckboxGroup(
134
  choices=reranking_models,
@@ -139,7 +116,7 @@ with demo:
139
  )
140
 
141
  leaderboard_table = gr.components.Dataframe(
142
- value=leaderboard_df,
143
  # headers=shown_columns,
144
  # datatype=TYPES,
145
  elem_id="leaderboard-table",
@@ -149,7 +126,7 @@ with demo:
149
 
150
  # Dummy leaderboard for handling the case when the user uses backspace key
151
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
152
- value=leaderboard_df,
153
  # headers=COLS,
154
  # datatype=TYPES,
155
  visible=False,
@@ -236,7 +213,7 @@ with demo:
236
  interactive=True
237
  )
238
  # select reranking model
239
- reranking_models = list(frozenset([eval_result.reranking_model for eval_result in raw_data_qa]))
240
  with gr.Row():
241
  selected_rerankings = gr.CheckboxGroup(
242
  choices=reranking_models,
@@ -311,48 +288,16 @@ with demo:
311
  with gr.Column():
312
  with gr.Row():
313
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
314
- with gr.Row():
315
- with gr.Accordion(f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
316
- with gr.Row():
317
- finished_eval_table = gr.components.Dataframe(
318
- value=finished_eval_queue_df,
319
- row_count=5,
320
- )
321
- with gr.Row():
322
- with gr.Accordion(
323
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
324
- open=False,
325
- ):
326
- with gr.Row():
327
- running_eval_table = gr.components.Dataframe(
328
- value=running_eval_queue_df,
329
- row_count=5,
330
- )
331
- with gr.Row():
332
- with gr.Accordion(
333
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
334
- open=False,
335
- ):
336
- with gr.Row():
337
- pending_eval_table = gr.components.Dataframe(
338
- value=pending_eval_queue_df,
339
- row_count=5,
340
- )
341
  with gr.Row():
342
  gr.Markdown("## ✉️Submit your model here!", elem_classes="markdown-text")
343
- # with gr.Row():
344
- # with gr.Column():
345
- # model_name_textbox = gr.Textbox(label="Model name")
346
- # with gr.Column():
347
- # model_url = gr.Textbox(label="Model URL")
348
  with gr.Row():
349
  file_output = gr.File()
350
  with gr.Row():
351
  upload_button = gr.UploadButton("Click to submit evaluation", file_count="multiple")
352
  upload_button.upload(upload_file, upload_button, file_output)
353
 
354
- # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
355
- # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
356
 
357
  scheduler = BackgroundScheduler()
358
  scheduler.add_job(restart_space, "interval", seconds=1800)
 
4
 
5
  from src.about import (
6
  INTRODUCTION_TEXT,
7
+ BENCHMARKS_TEXT,
8
  TITLE,
9
  EVALUATION_QUEUE_TEXT
10
  )
11
  from src.display.css_html_js import custom_css
12
+ from src.leaderboard.read_evals import get_raw_eval_results, get_leaderboard_df
13
+
14
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
 
 
 
 
 
 
 
 
 
15
  from utils import update_table, update_metric, update_table_long_doc, upload_file
16
  from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, metric_list
17
 
 
19
  def restart_space():
20
  API.restart_space(repo_id=REPO_ID)
21
 
 
 
 
 
 
 
 
 
22
  # try:
23
  # print(EVAL_RESULTS_PATH)
24
  # snapshot_download(
 
28
  # except Exception:
29
  # restart_space()
30
 
31
+ raw_data = get_raw_eval_results(EVAL_RESULTS_PATH)
32
+
33
+ original_df_qa = get_leaderboard_df(
34
+ raw_data, task='qa', metric='ndcg_at_3')
35
+ original_df_long_doc = get_leaderboard_df(
36
+ raw_data, task='long_doc', metric='ndcg_at_3')
37
+ print(f'raw data: {len(raw_data)}')
38
  print(f'QA data loaded: {original_df_qa.shape}')
39
  print(f'Long-Doc data loaded: {len(original_df_long_doc)}')
40
 
41
+ leaderboard_df_qa = original_df_qa.copy()
42
  leaderboard_df_long_doc = original_df_long_doc.copy()
 
43
 
44
 
45
  def update_metric_qa(
 
49
  reranking_model: list,
50
  query: str,
51
  ):
52
+ return update_metric(raw_data, 'qa', metric, domains, langs, reranking_model, query)
53
 
54
  def update_metric_long_doc(
55
  metric: str,
 
58
  reranking_model: list,
59
  query: str,
60
  ):
61
+ return update_metric(raw_data, 'long_doc', metric, domains, langs, reranking_model, query)
 
 
 
 
 
 
 
62
 
63
 
64
  demo = gr.Blocks(css=custom_css)
 
105
  interactive=True
106
  )
107
  # select reranking model
108
+ reranking_models = list(frozenset([eval_result.reranking_model for eval_result in raw_data]))
109
  with gr.Row():
110
  selected_rerankings = gr.CheckboxGroup(
111
  choices=reranking_models,
 
116
  )
117
 
118
  leaderboard_table = gr.components.Dataframe(
119
+ value=leaderboard_df_qa,
120
  # headers=shown_columns,
121
  # datatype=TYPES,
122
  elem_id="leaderboard-table",
 
126
 
127
  # Dummy leaderboard for handling the case when the user uses backspace key
128
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
129
+ value=leaderboard_df_qa,
130
  # headers=COLS,
131
  # datatype=TYPES,
132
  visible=False,
 
213
  interactive=True
214
  )
215
  # select reranking model
216
+ reranking_models = list(frozenset([eval_result.reranking_model for eval_result in raw_data]))
217
  with gr.Row():
218
  selected_rerankings = gr.CheckboxGroup(
219
  choices=reranking_models,
 
288
  with gr.Column():
289
  with gr.Row():
290
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
  with gr.Row():
292
  gr.Markdown("## ✉️Submit your model here!", elem_classes="markdown-text")
 
 
 
 
 
293
  with gr.Row():
294
  file_output = gr.File()
295
  with gr.Row():
296
  upload_button = gr.UploadButton("Click to submit evaluation", file_count="multiple")
297
  upload_button.upload(upload_file, upload_button, file_output)
298
 
299
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
300
+ gr.Markdown(BENCHMARKS_TEXT, elem_classes="markdown-text")
301
 
302
  scheduler = BackgroundScheduler()
303
  scheduler.add_job(restart_space, "interval", seconds=1800)
src/about.py CHANGED
@@ -46,7 +46,7 @@ AIR-Bench: Automated Heterogeneous Information Retrieval Benchmark
46
  """
47
 
48
  # Which evaluations are you running? how can people reproduce what you have?
49
- LLM_BENCHMARKS_TEXT = f"""
50
  ## How it works
51
 
52
  ## Reproducibility
 
46
  """
47
 
48
  # Which evaluations are you running? how can people reproduce what you have?
49
+ BENCHMARKS_TEXT = f"""
50
  ## How it works
51
 
52
  ## Reproducibility
tests/src/leaderboard/test_read_evals.py CHANGED
@@ -37,6 +37,7 @@ def test_get_raw_eval_results():
37
  assert results[1].eval_name == "bge-m3_bge-reranker-v2-m3"
38
  assert len(results[1].results) == 6
39
 
 
40
  def test_get_leaderboard_df():
41
  results_path = cur_fp.parents[2] / "toydata" / "test_results"
42
  raw_data = get_raw_eval_results(results_path)
@@ -49,7 +50,7 @@ def test_get_leaderboard_df():
49
  assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
50
  assert df["Reranking Model"][1] == "NoReranker"
51
  assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
52
- assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh',]].isnull().values.any()
53
 
54
 
55
  def test_get_leaderboard_df_long_doc():
@@ -64,4 +65,4 @@ def test_get_leaderboard_df_long_doc():
64
  assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
65
  assert df["Reranking Model"][1] == "NoReranker"
66
  assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
67
- assert not df[['Average ⬆️', 'law_en_lex_files_500k_600k',]].isnull().values.any()
 
37
  assert results[1].eval_name == "bge-m3_bge-reranker-v2-m3"
38
  assert len(results[1].results) == 6
39
 
40
+
41
  def test_get_leaderboard_df():
42
  results_path = cur_fp.parents[2] / "toydata" / "test_results"
43
  raw_data = get_raw_eval_results(results_path)
 
50
  assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
51
  assert df["Reranking Model"][1] == "NoReranker"
52
  assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
53
+ assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh', ]].isnull().values.any()
54
 
55
 
56
  def test_get_leaderboard_df_long_doc():
 
65
  assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
66
  assert df["Reranking Model"][1] == "NoReranker"
67
  assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
68
+ assert not df[['Average ⬆️', 'law_en_lex_files_500k_600k', ]].isnull().values.any()