xeon27 committed
Commit 1d1f5e9
1 Parent(s): 0796d85

Add separate tab for agentic benchmark

Files changed (3)
  1. app.py +9 -5
  2. src/display/utils.py +2 -2
  3. src/populate.py +2 -3
app.py CHANGED
@@ -14,8 +14,8 @@ from src.about import (
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
+    ST_BENCHMARK_COLS,
+    AGENTIC_BENCHMARK_COLS,
     EVAL_COLS,
     EVAL_TYPES,
     AutoEvalColumn,
@@ -49,7 +49,8 @@ except Exception:
     restart_space()


-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+ST_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ST_BENCHMARK_COLS)
+AGENTIC_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, AGENTIC_BENCHMARK_COLS)

 (
     finished_eval_queue_df,
@@ -96,8 +97,11 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
+        with gr.TabItem("Single-turn Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(ST_LEADERBOARD_DF)
+
+        with gr.TabItem("Agentic Benchmark", elem_id="llm-benchmark-tab-table", id=1):
+            leaderboard = init_leaderboard(AGENTIC_LEADERBOARD_DF)

         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
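
Assembled from the hunks above, app.py now builds one DataFrame per benchmark family and renders each in its own tab. The sketch below stitches the new lines together for readability; init_leaderboard and the enclosing demo block come from the existing leaderboard template and are assumed unchanged by this commit.

# Sketch of app.py after this commit (new lines from the hunks above, stitched together).
ST_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ST_BENCHMARK_COLS)
AGENTIC_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, AGENTIC_BENCHMARK_COLS)

with gr.Tabs(elem_classes="tab-buttons") as tabs:
    with gr.TabItem("Single-turn Benchmark", elem_id="llm-benchmark-tab-table", id=0):
        leaderboard = init_leaderboard(ST_LEADERBOARD_DF)

    with gr.TabItem("Agentic Benchmark", elem_id="llm-benchmark-tab-table", id=1):
        leaderboard = init_leaderboard(AGENTIC_LEADERBOARD_DF)

    with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
        gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

Both benchmark tabs assign to the same leaderboard name; each call still creates its own component inside its tab, the variable simply ends up pointing at the last one.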
src/display/utils.py CHANGED
@@ -106,5 +106,5 @@ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
+ST_BENCHMARK_COLS = [t.value.col_name for t in Tasks if t.value.type == "single-turn"]
+AGENTIC_BENCHMARK_COLS = [t.value.col_name for t in Tasks if t.value.type == "agentic"]
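
The two comprehensions above assume that every entry in the Tasks enum carries a type field distinguishing single-turn from agentic benchmarks; that field is not part of the stock leaderboard template and is not shown in this diff. A minimal sketch of the kind of Task definition that would make t.value.type resolve, with purely illustrative benchmark entries:

# Illustrative only -- the real Task/Tasks definitions live elsewhere in the repo and are not part of this diff.
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
    type: str  # "single-turn" or "agentic"; consumed by the ST_/AGENTIC_ filters above

class Tasks(Enum):
    task0 = Task("mmlu", "accuracy", "MMLU", "single-turn")  # hypothetical single-turn entry
    task1 = Task("gaia", "accuracy", "GAIA", "agentic")      # GAIA appears in populate.py; the metric name is assumed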
src/populate.py CHANGED
@@ -34,7 +34,7 @@ def get_inspect_log_url(model_name: str, benchmark_name: str) -> str:
     return f"https://storage.googleapis.com/inspect-evals/{model_name}/index.html?log_file=logs/logs/{log_file_name}"


-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(results_path: str, requests_path: str, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
@@ -42,12 +42,11 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     df = pd.DataFrame.from_records(all_data_json)

     # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)
+    df = df[benchmark_cols].round(decimals=2)

     # # filter out if any of the benchmarks have not been produced
     # df = df[has_no_nan_values(df, benchmark_cols)]
     df = df.fillna(EMPTY_SYMBOL)
-    print(df["GAIA"].head())

     # make values clickable and link to log files
     for col in benchmark_cols:
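
The hunk ends at the loop that, per the comment, turns each benchmark score into a clickable link to its Inspect log via get_inspect_log_url. The loop body is outside this diff; a purely illustrative sketch of that kind of transformation, assuming a "Model" column holds the model name and scores are plain values or EMPTY_SYMBOL, could look like this:

# Illustrative only -- the real loop body is not shown in this diff; the "Model" column name is an assumption.
for col in benchmark_cols:
    df[col] = df.apply(
        lambda row: row[col]  # leave missing-run placeholders untouched
        if row[col] == EMPTY_SYMBOL
        else f"[{row[col]}]({get_inspect_log_url(row['Model'], col)})",
        axis=1,
    )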