Terry Zhuo committed on
Commit 14a3287 · 1 Parent(s): 5fa61d0

update w/ hard only

Files changed (1)
  1. app.py +168 -166
app.py CHANGED
@@ -150,26 +150,26 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
150
  raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
151
 
152
  def get_latest_data_leaderboard(
153
- leaderboard_initial_df = None,
154
  hard_leaderboard_initial_df = None,
155
- elo_task_df = None,
156
- elo_bench_df = None,
157
  hard_elo_task_df = None,
158
  hard_elo_bench_df = None,
159
- complete_solve_df = None,
160
- instruct_solve_df = None,
161
  hard_complete_solve_df = None,
162
  hard_instruct_solve_df = None
163
  ):
164
  global NEW_DATA_ON_LEADERBOARD
165
- global LEADERBOARD_DF
166
  global HARD_LEADERBOARD_DF
167
- global ELO_TASK_DF
168
- global ELO_BENCH_DF
169
  global HARD_ELO_TASK_DF
170
  global HARD_ELO_BENCH_DF
171
- global COMPLETE_SOLVE_DF
172
- global INSTRUCT_SOLVE_DF
173
  global HARD_COMPLETE_SOLVE_DF
174
  global HARD_INSTRUCT_SOLVE_DF
175
 
@@ -183,10 +183,10 @@ def get_latest_data_leaderboard(
183
  download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
184
  verification_mode="no_checks"
185
  )
186
- LEADERBOARD_DF = get_leaderboard_df(
187
- leaderboard_dataset=leaderboard_dataset,
188
- cols=COLS,
189
- )
190
  hard_leaderboard_dataset = datasets.load_dataset(
191
  HARD_RESULT_REPO,
192
  "default",
@@ -201,24 +201,24 @@ def get_latest_data_leaderboard(
201
  )
202
  HARD_LEADERBOARD_DF = hard_leaderboard_df
203
 
204
- elo_task_df = datasets.load_dataset(
205
- ELO_REPO,
206
- "default",
207
- split="task_no_tie",
208
- cache_dir=HF_HOME,
209
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
210
- verification_mode="no_checks"
211
- ).to_pandas()
212
- elo_bench_df = datasets.load_dataset(
213
- ELO_REPO,
214
- "default",
215
- split="benchmark_tie",
216
- cache_dir=HF_HOME,
217
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
218
- verification_mode="no_checks"
219
- ).to_pandas()
220
- ELO_TASK_DF = elo_task_df
221
- ELO_BENCH_DF = elo_bench_df
222
 
223
  hard_elo_task_df = datasets.load_dataset(
224
  HARD_ELO_REPO,
@@ -239,24 +239,24 @@ def get_latest_data_leaderboard(
239
  HARD_ELO_TASK_DF = hard_elo_task_df
240
  HARD_ELO_BENCH_DF = hard_elo_bench_df
241
 
242
- complete_solve_df = datasets.load_dataset(
243
- SOLVE_REPO,
244
- "default",
245
- split="complete",
246
- cache_dir=HF_HOME,
247
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
248
- verification_mode="no_checks"
249
- ).to_pandas()
250
- instruct_solve_df = datasets.load_dataset(
251
- SOLVE_REPO,
252
- "default",
253
- split="instruct",
254
- cache_dir=HF_HOME,
255
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
256
- verification_mode="no_checks"
257
- ).to_pandas()
258
- COMPLETE_SOLVE_DF = complete_solve_df
259
- INSTRUCT_SOLVE_DF = instruct_solve_df
260
 
261
  hard_complete_solve_df = datasets.load_dataset(
262
  HARD_SOLVE_REPO,
@@ -280,41 +280,41 @@ def get_latest_data_leaderboard(
280
  NEW_DATA_ON_LEADERBOARD = False
281
 
282
  else:
283
- LEADERBOARD_DF = leaderboard_initial_df
284
  HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
285
- ELO_TASK_DF = elo_task_df
286
- ELO_BENCH_DF = elo_bench_df
287
  HARD_ELO_TASK_DF = hard_elo_task_df
288
  HARD_ELO_BENCH_DF = hard_elo_bench_df
289
- COMPLETE_SOLVE_DF = complete_solve_df
290
- INSTRUCT_SOLVE_DF = instruct_solve_df
291
  HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
292
  HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
293
 
294
- return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
 
295
 
296
 
297
  def init_space():
298
  """Initializes the application space, loading only necessary data."""
299
 
300
  # Always redownload the leaderboard DataFrame
301
- global LEADERBOARD_DF
302
  global HARD_LEADERBOARD_DF
303
- global ELO_TASK_DF
304
- global ELO_BENCH_DF
305
  global HARD_ELO_TASK_DF
306
  global HARD_ELO_BENCH_DF
307
- global COMPLETE_SOLVE_DF
308
- global INSTRUCT_SOLVE_DF
309
  global HARD_COMPLETE_SOLVE_DF
310
  global HARD_INSTRUCT_SOLVE_DF
311
 
312
- LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
 
313
 
314
- # Evaluation queue DataFrame retrieval is independent of initialization detail level
315
- # eval_queue_dfs = get_latest_data_queue()
316
-
317
- return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
318
 
319
  # Initialize VoteManager
320
  # vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
@@ -329,11 +329,11 @@ def init_space():
329
 
330
  # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
331
  # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
332
- LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, \
333
- ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, \
334
- COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, \
335
- HARD_INSTRUCT_SOLVE_DF = init_space()
336
-
337
 
338
  # Data processing for plots now only on demand in the respective Gradio tab
339
  # def load_and_create_plots():
@@ -378,107 +378,108 @@ def init_others(dataframe):
378
  main_block = gr.Blocks(css=custom_css)
379
  with main_block as demo:
380
  with gr.Row(elem_id="header-row"):
381
- gr.HTML(TITLE + "<p>Total models: " + str(len(LEADERBOARD_DF))+ "</p>")
382
 
383
  # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
384
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
385
- with gr.Tab("💎 Hard Set") as hard_tabs:
386
- with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="hard_bench"):
387
- hard_leaderboard = init_leaderboard(HARD_LEADERBOARD_DF)
388
- gr.Markdown(
389
- """
390
- **Notes:**
391
- - _Hard Set_ vs _Full Set_:
392
- - <u>Hard Set</u>: A subset of ~150 BigCodeBench tasks which is more user-facing and challenging.
393
- - <u>Full Set</u>: The full set of 1140 BigCodeBench tasks.
394
- - _Complete_ vs _Instruct_:
395
- - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This split tests if the models are good at coding.
396
- - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
397
- - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
398
- - `Average` is the average of `Complete` and `Instruct` when both are available.
399
- - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the Complete + Instruct splits. The rating starts from 1000 and is bootstrapped 500 times. We only consider the models having both `Complete` and `Instruct` scores.
400
- - `#Act Params (B)` is the number of activated model parameters during inference.
401
- - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
402
- - For more details check the 📝 About section.
403
- """,
404
- elem_classes="markdown-text",
405
- )
406
-
407
- with gr.TabItem("📊 Elo Rating", id="hard_elo"):
408
- with gr.Column():
409
- with gr.Group():
410
- gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
411
- hard_task_elo_map = gr.Plot()
412
- hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
413
- demo.load(plot_elo_mle, [hard_elo_task_gr],
414
- hard_task_elo_map)
415
- with gr.Group():
416
- gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
417
- hard_bench_elo_map = gr.Plot()
418
- hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
419
- demo.load(plot_elo_mle, [hard_elo_bench_gr],
420
- hard_bench_elo_map)
 
421
 
422
- with gr.TabItem("🧩 Solve Rate", id="hard_solve"):
423
- with gr.Column():
424
- hard_complete_map = gr.Plot()
425
- hard_complete_solve_gr = init_others(HARD_COMPLETE_SOLVE_DF)
426
- demo.load(plot_solve_rate, [hard_complete_solve_gr,
427
- gr.Textbox("Complete", visible=False),
428
- gr.Number(10, visible=False),
429
- gr.Number(16, visible=False),
430
- ], hard_complete_map)
431
- hard_instruct_map = gr.Plot()
432
- hard_instruct_solve_gr = init_others(HARD_INSTRUCT_SOLVE_DF)
433
- demo.load(plot_solve_rate, [hard_instruct_solve_gr,
434
- gr.Textbox("Instruct", visible=False),
435
- gr.Number(10, visible=False),
436
- gr.Number(16, visible=False),
437
- ], hard_instruct_map)
438
- with gr.Tab("🎯 Full Set") as full_tabs:
439
- with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="full_bench"):
440
- leaderboard = init_leaderboard(LEADERBOARD_DF)
441
- gr.Markdown(
442
- """
443
- **Notes:**
444
- - _Complete_ vs _Instruct_:
445
- - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
446
- - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
447
- - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
448
- - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
449
- - `size` is the amount of activated model weight during inference.
450
- - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
451
- - For more details check the 📝 About section.
452
- """,
453
- elem_classes="markdown-text",
454
- )
455
 
456
- with gr.TabItem("📊 Elo Rating", id="full_elo"):
457
- with gr.Column():
458
- with gr.Group():
459
 
460
- gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
461
- task_elo_map = gr.Plot()
462
- elo_task_gr = init_others(ELO_TASK_DF)
463
- demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
464
- with gr.Group():
465
- gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
466
- bench_elo_map = gr.Plot()
467
- elo_bench_gr = init_others(ELO_BENCH_DF)
468
- demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
469
 
470
- with gr.TabItem("🧩 Solve Rate", id="full_solve"):
471
- with gr.Column():
472
- complete_map = gr.Plot()
473
- complete_solve_gr = init_others(COMPLETE_SOLVE_DF)
474
- demo.load(plot_solve_rate, [complete_solve_gr,
475
- gr.Textbox("Complete", visible=False),
476
- ], complete_map)
477
- instruct_map = gr.Plot()
478
- instruct_solve_gr = init_others(INSTRUCT_SOLVE_DF)
479
- demo.load(plot_solve_rate, [instruct_solve_gr,
480
- gr.Textbox("Instruct", visible=False),
481
- ], instruct_map)
482
  with gr.TabItem("📝 About", id=3):
483
  gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
484
  with gr.TabItem("🔎 Data Viewer", id="viewer"):
@@ -521,7 +522,8 @@ with main_block as demo:
521
  show_copy_button=True,
522
  )
523
 
524
- main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
 
525
  # leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
526
  # pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
527
 
 
150
  raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
151
 
152
  def get_latest_data_leaderboard(
153
+ # leaderboard_initial_df = None,
154
  hard_leaderboard_initial_df = None,
155
+ # elo_task_df = None,
156
+ # elo_bench_df = None,
157
  hard_elo_task_df = None,
158
  hard_elo_bench_df = None,
159
+ # complete_solve_df = None,
160
+ # instruct_solve_df = None,
161
  hard_complete_solve_df = None,
162
  hard_instruct_solve_df = None
163
  ):
164
  global NEW_DATA_ON_LEADERBOARD
165
+ # global LEADERBOARD_DF
166
  global HARD_LEADERBOARD_DF
167
+ # global ELO_TASK_DF
168
+ # global ELO_BENCH_DF
169
  global HARD_ELO_TASK_DF
170
  global HARD_ELO_BENCH_DF
171
+ # global COMPLETE_SOLVE_DF
172
+ # global INSTRUCT_SOLVE_DF
173
  global HARD_COMPLETE_SOLVE_DF
174
  global HARD_INSTRUCT_SOLVE_DF
175
 
 
183
  download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
184
  verification_mode="no_checks"
185
  )
186
+ # LEADERBOARD_DF = get_leaderboard_df(
187
+ # leaderboard_dataset=leaderboard_dataset,
188
+ # cols=COLS,
189
+ # )
190
  hard_leaderboard_dataset = datasets.load_dataset(
191
  HARD_RESULT_REPO,
192
  "default",
 
201
  )
202
  HARD_LEADERBOARD_DF = hard_leaderboard_df
203
 
204
+ # elo_task_df = datasets.load_dataset(
205
+ # ELO_REPO,
206
+ # "default",
207
+ # split="task_no_tie",
208
+ # cache_dir=HF_HOME,
209
+ # download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
210
+ # verification_mode="no_checks"
211
+ # ).to_pandas()
212
+ # elo_bench_df = datasets.load_dataset(
213
+ # ELO_REPO,
214
+ # "default",
215
+ # split="benchmark_tie",
216
+ # cache_dir=HF_HOME,
217
+ # download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
218
+ # verification_mode="no_checks"
219
+ # ).to_pandas()
220
+ # ELO_TASK_DF = elo_task_df
221
+ # ELO_BENCH_DF = elo_bench_df
222
 
223
  hard_elo_task_df = datasets.load_dataset(
224
  HARD_ELO_REPO,
 
239
  HARD_ELO_TASK_DF = hard_elo_task_df
240
  HARD_ELO_BENCH_DF = hard_elo_bench_df
241
 
242
+ # complete_solve_df = datasets.load_dataset(
243
+ # SOLVE_REPO,
244
+ # "default",
245
+ # split="complete",
246
+ # cache_dir=HF_HOME,
247
+ # download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
248
+ # verification_mode="no_checks"
249
+ # ).to_pandas()
250
+ # instruct_solve_df = datasets.load_dataset(
251
+ # SOLVE_REPO,
252
+ # "default",
253
+ # split="instruct",
254
+ # cache_dir=HF_HOME,
255
+ # download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
256
+ # verification_mode="no_checks"
257
+ # ).to_pandas()
258
+ # COMPLETE_SOLVE_DF = complete_solve_df
259
+ # INSTRUCT_SOLVE_DF = instruct_solve_df
260
 
261
  hard_complete_solve_df = datasets.load_dataset(
262
  HARD_SOLVE_REPO,
 
280
  NEW_DATA_ON_LEADERBOARD = False
281
 
282
  else:
283
+ # LEADERBOARD_DF = leaderboard_initial_df
284
  HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
285
+ # ELO_TASK_DF = elo_task_df
286
+ # ELO_BENCH_DF = elo_bench_df
287
  HARD_ELO_TASK_DF = hard_elo_task_df
288
  HARD_ELO_BENCH_DF = hard_elo_bench_df
289
+ # COMPLETE_SOLVE_DF = complete_solve_df
290
+ # INSTRUCT_SOLVE_DF = instruct_solve_df
291
  HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
292
  HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
293
 
294
+ # return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
295
+ return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
296
 
297
 
298
  def init_space():
299
  """Initializes the application space, loading only necessary data."""
300
 
301
  # Always redownload the leaderboard DataFrame
302
+ # global LEADERBOARD_DF
303
  global HARD_LEADERBOARD_DF
304
+ # global ELO_TASK_DF
305
+ # global ELO_BENCH_DF
306
  global HARD_ELO_TASK_DF
307
  global HARD_ELO_BENCH_DF
308
+ # global COMPLETE_SOLVE_DF
309
+ # global INSTRUCT_SOLVE_DF
310
  global HARD_COMPLETE_SOLVE_DF
311
  global HARD_INSTRUCT_SOLVE_DF
312
 
313
+ # LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
314
+ HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
315
 
316
+ # return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
317
+ return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
 
 
318
 
319
  # Initialize VoteManager
320
  # vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
 
329
 
330
  # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
331
  # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
332
+ # LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, \
333
+ # ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, \
334
+ # COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, \
335
+ # HARD_INSTRUCT_SOLVE_DF = init_space()
336
+ HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = init_space()
337
 
338
  # Data processing for plots now only on demand in the respective Gradio tab
339
  # def load_and_create_plots():
 
378
  main_block = gr.Blocks(css=custom_css)
379
  with main_block as demo:
380
  with gr.Row(elem_id="header-row"):
381
+ gr.HTML(TITLE + "<p>Total models: " + str(len(HARD_LEADERBOARD_DF))+ "</p>")
382
 
383
  # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
384
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
385
+ # with gr.Tab("💎 Hard Set") as hard_tabs:
386
+ with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="hard_bench"):
387
+ hard_leaderboard = init_leaderboard(HARD_LEADERBOARD_DF)
388
+ gr.Markdown(
389
+ """
390
+ **Notes:**
391
+ - For efficiency reasons, we only display the Hard Set leaderboard.
392
+ - _Hard Set_ vs _Full Set_:
393
+ - <u>Hard Set</u>: A subset of ~150 BigCodeBench tasks which is more user-facing and challenging.
394
+ - <u>Full Set</u>: The full set of 1140 BigCodeBench tasks.
395
+ - _Complete_ vs _Instruct_:
396
+ - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This split tests if the models are good at coding.
397
+ - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
398
+ - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
399
+ - `Average` is the average of `Complete` and `Instruct` when both are available.
400
+ - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the Complete + Instruct splits. The rating starts from 1000 and is bootstrapped 500 times. We only consider the models having both `Complete` and `Instruct` scores.
401
+ - `#Act Params (B)` is the number of activated model parameters during inference.
402
+ - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
403
+ - For more details check the 📝 About section.
404
+ """,
405
+ elem_classes="markdown-text",
406
+ )
407
+
408
+ with gr.TabItem("📊 Elo Rating", id="hard_elo"):
409
+ with gr.Column():
410
+ with gr.Group():
411
+ gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
412
+ hard_task_elo_map = gr.Plot()
413
+ hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
414
+ demo.load(plot_elo_mle, [hard_elo_task_gr],
415
+ hard_task_elo_map)
416
+ with gr.Group():
417
+ gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
418
+ hard_bench_elo_map = gr.Plot()
419
+ hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
420
+ demo.load(plot_elo_mle, [hard_elo_bench_gr],
421
+ hard_bench_elo_map)
422
 
423
+ with gr.TabItem("🧩 Solve Rate", id="hard_solve"):
424
+ with gr.Column():
425
+ hard_complete_map = gr.Plot()
426
+ hard_complete_solve_gr = init_others(HARD_COMPLETE_SOLVE_DF)
427
+ demo.load(plot_solve_rate, [hard_complete_solve_gr,
428
+ gr.Textbox("Complete", visible=False),
429
+ gr.Number(10, visible=False),
430
+ gr.Number(16, visible=False),
431
+ ], hard_complete_map)
432
+ hard_instruct_map = gr.Plot()
433
+ hard_instruct_solve_gr = init_others(HARD_INSTRUCT_SOLVE_DF)
434
+ demo.load(plot_solve_rate, [hard_instruct_solve_gr,
435
+ gr.Textbox("Instruct", visible=False),
436
+ gr.Number(10, visible=False),
437
+ gr.Number(16, visible=False),
438
+ ], hard_instruct_map)
439
+ # with gr.Tab("🎯 Full Set") as full_tabs:
440
+ # with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="full_bench"):
441
+ # leaderboard = init_leaderboard(LEADERBOARD_DF)
442
+ # gr.Markdown(
443
+ # """
444
+ # **Notes:**
445
+ # - _Complete_ vs _Instruct_:
446
+ # - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
447
+ # - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
448
+ # - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
449
+ # - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
450
+ # - `size` is the amount of activated model weight during inference.
451
+ # - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
452
+ # - For more details check the 📝 About section.
453
+ # """,
454
+ # elem_classes="markdown-text",
455
+ # )
456
 
457
+ # with gr.TabItem("📊 Elo Rating", id="full_elo"):
458
+ # with gr.Column():
459
+ # with gr.Group():
460
 
461
+ # gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
462
+ # task_elo_map = gr.Plot()
463
+ # elo_task_gr = init_others(ELO_TASK_DF)
464
+ # demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
465
+ # with gr.Group():
466
+ # gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
467
+ # bench_elo_map = gr.Plot()
468
+ # elo_bench_gr = init_others(ELO_BENCH_DF)
469
+ # demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
470
 
471
+ # with gr.TabItem("🧩 Solve Rate", id="full_solve"):
472
+ # with gr.Column():
473
+ # complete_map = gr.Plot()
474
+ # complete_solve_gr = init_others(COMPLETE_SOLVE_DF)
475
+ # demo.load(plot_solve_rate, [complete_solve_gr,
476
+ # gr.Textbox("Complete", visible=False),
477
+ # ], complete_map)
478
+ # instruct_map = gr.Plot()
479
+ # instruct_solve_gr = init_others(INSTRUCT_SOLVE_DF)
480
+ # demo.load(plot_solve_rate, [instruct_solve_gr,
481
+ # gr.Textbox("Instruct", visible=False),
482
+ # ], instruct_map)
483
  with gr.TabItem("📝 About", id=3):
484
  gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
485
  with gr.TabItem("🔎 Data Viewer", id="viewer"):
 
522
  show_copy_button=True,
523
  )
524
 
525
+ # main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
526
+ main_block.load(fn=get_latest_data_leaderboard, inputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
527
  # leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
528
  # pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
529
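
For orientation, the data-refresh path that survives this commit boils down to repeated cached `datasets.load_dataset(...).to_pandas()` calls against the hard-set repos. The sketch below is a condensed, hypothetical reconstruction, not the Space's exact code: the split names for the hard Elo and solve repos are assumed to mirror the commented-out full-set loads, the `HF_HOME` default is an assumption, and the hard leaderboard table itself (built from `HARD_RESULT_REPO`) is omitted because the diff does not show its construction.

import os

import datasets
import pandas as pd

# Cache directory; the app reads this from the environment (assumed default shown here).
HF_HOME = os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface"))


def load_split(repo_id: str, split: str) -> pd.DataFrame:
    """Load one dataset split from the Hub, reusing any locally cached copy."""
    return datasets.load_dataset(
        repo_id,
        "default",
        split=split,
        cache_dir=HF_HOME,
        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # reuse cached data
        verification_mode="no_checks",
    ).to_pandas()


def refresh_hard_plot_data(hard_elo_repo: str, hard_solve_repo: str):
    """Return the four hard-set plot DataFrames that follow the hard leaderboard
    in the new return tuple (the leaderboard itself is built from HARD_RESULT_REPO
    in code the diff does not show)."""
    hard_elo_task = load_split(hard_elo_repo, "task_no_tie")      # split name assumed
    hard_elo_bench = load_split(hard_elo_repo, "benchmark_tie")   # split name assumed
    hard_complete = load_split(hard_solve_repo, "complete")       # split name assumed
    hard_instruct = load_split(hard_solve_repo, "instruct")       # split name assumed
    return hard_elo_task, hard_elo_bench, hard_complete, hard_instruct

With only the hard DataFrames left, `get_latest_data_leaderboard` and `init_space` now return a five-element tuple (hard leaderboard, two hard Elo tables, two hard solve-rate tables), and `main_block.load` wires the same five Gradio components as both inputs and outputs so they refresh in place when `NEW_DATA_ON_LEADERBOARD` flips.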