Alvinn-aai committed
Commit e00a798 · 1 Parent(s): 5f7ca36

populate leaderboard df

Files changed (3):
  1. app.py +2 -11
  2. src/display/utils.py +3 -1
  3. src/populate.py +47 -9
app.py CHANGED
@@ -47,6 +47,8 @@ def restart_space():
 
 
 lbdb = F1Data(cp_ds_name=CODE_PROBLEMS_REPO, sub_ds_name=SUBMISSIONS_REPO, res_ds_name=RESULTS_REPO, split=SPLIT)
+leaderboard_df = get_leaderboard_df(RESULTS_REPO)
+
 
 logger.info("Initialized LBDB")
 
@@ -94,17 +96,6 @@ with demo:
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 FormulaOne Leaderboard", elem_id="formulaone-leaderboar-tab-table", id=0):
-            # TODO: activate
-            # leaderboard_df = get_leaderboard_df
-            # dummy df
-            leaderboard_df = pd.DataFrame(
-                {
-                    AutoEvalColumn.system.name: ["Model A", "Model B", "Model C"],  # AutoEvalColumn.model.name
-                    AutoEvalColumn.system_type.name: ["LLM", "LLM+Agent", "N/A"],  # AutoEvalColumn.model_type.name
-                    AutoEvalColumn.organization.name: ["Org A", "Org B", "Org C"],  # AutoEvalColumn.organization.name
-                    AutoEvalColumn.success_rate.name: [0.01, 0.0, 0.005],
-                }
-            )
             leaderboard = init_leaderboard(leaderboard_df)
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
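
Note that the new top-level call assumes app.py already imports get_leaderboard_df; this hunk adds no import line. If it is not already present, the wiring would be a one-liner of this shape (module path per the src/populate.py diff below; RESULTS_REPO is the same repo id already passed to F1Data):

# Assumed import; this commit defines get_leaderboard_df in src/populate.py.
from src.populate import get_leaderboard_df

# Module-level call from the hunk above: one DataFrame row per submitted system.
leaderboard_df = get_leaderboard_df(RESULTS_REPO)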
src/display/utils.py CHANGED
@@ -57,13 +57,15 @@ class ColumnContent:
 # # We use make dataclass to dynamically fill the scores from Tasks
 # AutoEvalColumn = make_classvar_dataclass("AutoEvalColumn", auto_eval_column_fields)
 
+
 @dataclass(frozen=True)
 class AutoEvalColumn:
     system = ColumnContent("System Name", "markdown", True, never_hidden=True)
     system_type = ColumnContent("System Type", "str", True)
     organization = ColumnContent("Organization", "str", True, never_hidden=True)
     success_rate = ColumnContent("Success Rate (%)", "number", True)
-
+    problems_solved = ColumnContent("Problems Solved", "number", True)
+    submitted_on = ColumnContent("Submitted On", "datetime", True)
 
 
 ## For the queue columns in the submission tab
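
The positional arguments above map onto the ColumnContent container named in the hunk header; its definition is not part of this diff. A minimal sketch consistent with these call sites, with field names assumed from the stock Hugging Face leaderboard template:

from dataclasses import dataclass

# Hypothetical reconstruction of ColumnContent; only the call sites above are
# confirmed by the diff. Positional order: name, type, displayed_by_default.
@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False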
src/populate.py CHANGED
@@ -2,24 +2,62 @@ import json
 import os
 
 import pandas as pd
+from datasets import load_dataset, get_dataset_config_names
+from tqdm.auto import tqdm
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.envs import TOKEN
 from src.leaderboard.read_evals import get_raw_eval_results
+from src.logger import get_logger
 
+logger = get_logger(__name__)
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+
+def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
 
-    df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.success_rate.name], ascending=False)
-    df = df[cols].round(decimals=2)
+    configs = get_dataset_config_names(results_dataset_name, token=TOKEN)
+
+    rows = []
+    for submission_id in tqdm(configs, total=len(configs), desc="Processing Submission Results"):
+        submission_ds = load_dataset(results_dataset_name, submission_id, split="train", token=TOKEN)
+        submission_df = pd.DataFrame(submission_ds)
+
+        if submission_df.empty or "did_pass" not in submission_df.columns or submission_df.did_pass.isna().any():
+            logger.warning(f"Skipping {submission_id} due to invalid did_pass values")
+            continue
+
+        success_rate = 100 * submission_df["did_pass"].mean()
+        num_solved = submission_df["did_pass"].sum()
+        first_row = submission_df.iloc[0]
+
+        rows.append(
+            {
+                "System Name": first_row["system_name"],
+                "System Type": first_row["system_type"],
+                "Organization": first_row["organization"],
+                "Success Rate (%)": success_rate,
+                "Problems Solved": num_solved,
+                "Submitted On": pd.to_datetime(first_row.get("submission_ts", "1970-01-01T00:00:00")),
+            }
+        )
+
+    full_df = pd.DataFrame(rows)
+
+    # TODO: forbid multiple submissions under the same name?
+    # Keep only the latest entry per unique (System Name, System Type, Organization) triplet
+    final_df = (
+        full_df.sort_values("Submitted On", ascending=False)
+        .drop_duplicates(subset=["System Name", "System Type", "Organization"], keep="first")
+        .sort_values(by=[AutoEvalColumn.success_rate.name], ascending=False)
+        .reset_index(drop=True)
+    )
+
+    cols_to_round = ["Success Rate (%)"]
+    final_df[cols_to_round] = final_df[cols_to_round].round(decimals=2)
 
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
-    return df
+    return final_df
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
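
As a sanity check on the aggregation in the new get_leaderboard_df: each dataset config holds one row per problem with a boolean did_pass, so the success rate is the mean scaled to a percentage and the solved count is the sum. A toy check that needs no Hub access:

import pandas as pd

# Toy stand-in for one submission's "train" split: three problems, two passed.
toy = pd.DataFrame({"did_pass": [True, True, False]})

success_rate = 100 * toy["did_pass"].mean()  # 66.66..., rounded to 66.67 downstream
num_solved = toy["did_pass"].sum()           # 2

assert round(success_rate, 2) == 66.67
assert num_solved == 2

The dedup step then sorts descending on "Submitted On" and drops duplicates with keep="first", so a system submitted more than once keeps only its latest result before the final sort by success rate.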