Commit · e00a798
Parent: 5f7ca36

populate leaderboard df

Files changed:
- app.py +2 -11
- src/display/utils.py +3 -1
- src/populate.py +47 -9
app.py
CHANGED

@@ -47,6 +47,8 @@ def restart_space():
 
 
 lbdb = F1Data(cp_ds_name=CODE_PROBLEMS_REPO, sub_ds_name=SUBMISSIONS_REPO, res_ds_name=RESULTS_REPO, split=SPLIT)
+leaderboard_df = get_leaderboard_df(RESULTS_REPO)
+
 
 logger.info("Initialized LBDB")
 
@@ -94,17 +96,6 @@ with demo:
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 FormulaOne Leaderboard", elem_id="formulaone-leaderboar-tab-table", id=0):
-            # TODO: activate
-            # leaderboard_df = get_leaderboard_df
-            # dummy df
-            leaderboard_df = pd.DataFrame(
-                {
-                    AutoEvalColumn.system.name: ["Model A", "Model B", "Model C"],  # AutoEvalColumn.model.name
-                    AutoEvalColumn.system_type.name: ["LLM", "LLM+Agent", "N/A"],  # AutoEvalColumn.model_type.name
-                    AutoEvalColumn.organization.name: ["Org A", "Org B", "Org C"],  # AutoEvalColumn.organization.name
-                    AutoEvalColumn.success_rate.name: [0.01, 0.0, 0.005],
-                }
-            )
            leaderboard = init_leaderboard(leaderboard_df)
 
        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
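The hunk above only adds the call site. It assumes app.py can already resolve get_leaderboard_df and RESULTS_REPO; a minimal sketch of the assumed top-of-file wiring follows (these import lines are not part of this commit, and the location of RESULTS_REPO in src.envs is an assumption based on src/populate.py importing TOKEN from that module):

# app.py (assumed wiring, not shown in this diff)
from src.envs import RESULTS_REPO
from src.populate import get_leaderboard_df

leaderboard_df = get_leaderboard_df(RESULTS_REPO)  # built once at startup, reused by the leaderboard tab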
src/display/utils.py
CHANGED

@@ -57,13 +57,15 @@ class ColumnContent:
 # # We use make dataclass to dynamically fill the scores from Tasks
 # AutoEvalColumn = make_classvar_dataclass("AutoEvalColumn", auto_eval_column_fields)
 
+
 @dataclass(frozen=True)
 class AutoEvalColumn:
     system = ColumnContent("System Name", "markdown", True, never_hidden=True)
     system_type = ColumnContent("System Type", "str", True)
     organization = ColumnContent("Organization", "str", True, never_hidden=True)
     success_rate = ColumnContent("Success Rate (%)", "number", True)
-
+    problems_solved = ColumnContent("Problems Solved", "number", True)
+    submitted_on = ColumnContent("Submitted On", "datetime", True)
 
 
 ## For the queue columns in the submission tab
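For context on the positional arguments used above: the ColumnContent dataclass is defined earlier in src/display/utils.py and is not touched by this commit, so the field layout below is an assumption inferred from the call sites in this diff (name, type, displayed-by-default flag, plus the never_hidden keyword):

from dataclasses import dataclass

# Assumed shape of ColumnContent; only the call sites above are confirmed by the diff.
@dataclass
class ColumnContent:
    name: str                   # header text shown in the leaderboard, e.g. "Problems Solved"
    type: str                   # column datatype string, e.g. "number" or "datetime"
    displayed_by_default: bool  # whether the column is visible without user selection
    hidden: bool = False
    never_hidden: bool = False  # used above for System Name and Organization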
src/populate.py
CHANGED

@@ -2,24 +2,62 @@ import json
 import os
 
 import pandas as pd
+from datasets import load_dataset, get_dataset_config_names
+from tqdm.auto import tqdm
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.envs import TOKEN
 from src.leaderboard.read_evals import get_raw_eval_results
+from src.logger import get_logger
 
+logger = get_logger(__name__)
 
-
+
+def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
 
-
-
-
+    configs = get_dataset_config_names(results_dataset_name, token=TOKEN)
+
+    rows = []
+    for submission_id in tqdm(configs, total=len(configs), desc="Processing Submission Results"):
+        submission_ds = load_dataset(results_dataset_name, submission_id, split="train", token=TOKEN)
+        submission_df = pd.DataFrame(submission_ds)
+
+        if submission_df.empty or "did_pass" not in submission_df.columns or submission_df.did_pass.isna().any():
+            logger.warning(f"Skipping {submission_id} due to invalid did_pass values")
+            continue
+
+        success_rate = 100 * submission_df["did_pass"].mean()
+        num_solved = submission_df["did_pass"].sum()
+        first_row = submission_df.iloc[0]
+
+        rows.append(
+            {
+                "System Name": first_row["system_name"],
+                "System Type": first_row["system_type"],
+                "Organization": first_row["organization"],
+                "Success Rate (%)": success_rate,
+                "Problems Solved": num_solved,
+                "Submitted On": pd.to_datetime(first_row.get("submission_ts", "1970-01-01T00:00:00")),
+            }
+        )
+
+    full_df = pd.DataFrame(rows)
+
+    # TODO: forbid multiple submissions under the same name?
+    # Keep only the latest entry per unique (System Name, System Type, Organization) triplet
+    final_df = (
+        full_df.sort_values("Submitted On", ascending=False)
+        .drop_duplicates(subset=["System Name", "System Type", "Organization"], keep="first")
+        .sort_values(by=[AutoEvalColumn.success_rate.name], ascending=False)
+        .reset_index(drop=True)
+    )
+
+    cols_to_round = ["Success Rate (%)"]
+    final_df[cols_to_round] = final_df[cols_to_round].round(decimals=2)
 
-
-    df = df[has_no_nan_values(df, benchmark_cols)]
-    return df
+    return final_df
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
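To make the expected results schema concrete, here is a small self-contained sketch of the per-submission aggregation that the new get_leaderboard_df performs. The column names (system_name, system_type, organization, submission_ts, did_pass) are taken from the diff above; the dataset values and the system/organization names are purely illustrative:

import pandas as pd

# One submission's "train" split as get_leaderboard_df reads it: one row per evaluated problem.
example_split = pd.DataFrame(
    {
        "system_name": ["Example System"] * 4,
        "system_type": ["LLM+Agent"] * 4,
        "organization": ["Example Org"] * 4,
        "submission_ts": ["2025-01-01T00:00:00"] * 4,
        "did_pass": [True, False, True, False],
    }
)

# The per-submission aggregation done inside the loop:
success_rate = 100 * example_split["did_pass"].mean()  # 50.0 -> "Success Rate (%)"
num_solved = example_split["did_pass"].sum()           # 2    -> "Problems Solved"
submitted_on = pd.to_datetime(example_split.iloc[0].get("submission_ts", "1970-01-01T00:00:00"))

In the Space itself, the resulting dataframe is built once at startup via leaderboard_df = get_leaderboard_df(RESULTS_REPO) in app.py and handed to init_leaderboard.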