Commit · 7d20cd0
Parent(s): a44350f
revive leaderboard, fill with dummy data

Files changed:
- app.py +63 -46
- src/about.py +9 -8
- src/display/utils.py +51 -25
app.py
CHANGED
@@ -1,9 +1,10 @@
 from functools import partial
 
 import gradio as gr
-
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
+
 # from huggingface_hub import snapshot_download
 
 from src.about import (
@@ -17,27 +18,31 @@ from src.about import (
 from src.datamodel.data import F1Data
 
 from src.display.css_html_js import custom_css
-
-
-#
-
-
-
-
-
-
-
-
+
+from src.display.utils import (
+    # BENCHMARK_COLS,
+    COLS,
+    EVAL_COLS,
+    EVAL_TYPES,
+    AutoEvalColumn,
+    ModelType,
+    fields,
+    WeightType,
+    Precision,
+)
 from src.envs import API, REPO_ID, TOKEN, CODE_PROBLEMS_REPO, SUBMISSIONS_REPO, RESULTS_REPO
 from src.logger import get_logger
-
+
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_solutions
 
 logger = get_logger(__name__)
 
+
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
+
 lbdb = F1Data(cp_ds_name=CODE_PROBLEMS_REPO, sub_ds_name=SUBMISSIONS_REPO, res_ds_name=RESULTS_REPO)
 
 logger.info("Initialized LBDB")
@@ -48,36 +53,35 @@ logger.info("Initialized LBDB")
 # pending_eval_queue_df,
 # ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-#
-#
-#
-#
-#
-#
-#
-#
-#
-
-
-
-
-# )
+
+def init_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[AutoEvalColumn.system.name, AutoEvalColumn.system_type.name],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.system_type.name, type="checkboxgroup", label="Model types"),
+            # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            # ColumnFilter(
+            #     AutoEvalColumn.params.name,
+            #     type="slider",
+            #     min=0.01,
+            #     max=150,
+            #     label="Select the number of parameters (B)",
+            # ),
+            # ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
+        ],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )
 
 
 demo = gr.Blocks(css=custom_css)
@@ -86,8 +90,19 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-
-
+        with gr.TabItem("🏅 FormulaOne Leaderboard", elem_id="formulaone-leaderboar-tab-table", id=0):
+            # TODO: activate
+            # leaderboard_df = get_leaderboard_df
+            # dummy df
+            leaderboard_df = pd.DataFrame(
+                {
+                    AutoEvalColumn.system.name: ["Model A", "Model B", "Model C"],  # AutoEvalColumn.model.name
+                    AutoEvalColumn.system_type.name: ["LLM", "LLM+Agent", "N/A"],  # AutoEvalColumn.model_type.name
+                    AutoEvalColumn.organization.name: ["Org A", "Org B", "Org C"],  # AutoEvalColumn.organization.name
+                    AutoEvalColumn.success_rate.name: [0.01, 0.0, 0.005],
+                }
+            )
+            leaderboard = init_leaderboard(leaderboard_df)
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
             logger.info("Tab about")
@@ -149,7 +164,7 @@ with demo:
             #     interactive=True,
             # )
 
-
+            # with gr.Column():
             submission_file = gr.File(label="JSONL solutions file", file_types=[".jsonl"])
             # precision = gr.Dropdown(
             #     choices=[i.value.name for i in Precision if i != Precision.Unknown],
@@ -170,8 +185,10 @@ with demo:
             logger.info("Submut button")
             submit_button = gr.Button("Submit")
             submission_result = gr.Markdown()
+
            def add_solution_cbk(submitter, submission_path):
                return add_new_solutions(lbdb, submitter, submission_path)
+
            submit_button.click(
                add_solution_cbk,
                [
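
For reference, the pattern the commit adds to app.py can also be run on its own. The following is a minimal, hypothetical stand-alone sketch, not code from this commit: it inlines simplified ColumnContent and AutoEvalColumn stand-ins instead of importing src.display.utils, and assumes gradio, pandas and gradio_leaderboard are installed.

# Hypothetical stand-alone sketch of the leaderboard tab added above; not part of the commit.
from dataclasses import dataclass

import gradio as gr
import pandas as pd
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns


@dataclass(frozen=True)
class ColumnContent:
    # Simplified stand-in for src.display.utils.ColumnContent.
    name: str
    type: str
    displayed_by_default: bool = True
    hidden: bool = False
    never_hidden: bool = False


@dataclass(frozen=True)
class AutoEvalColumn:
    # Same column definitions as the commit's dummy leaderboard.
    system = ColumnContent("System Name", "markdown", True, never_hidden=True)
    system_type = ColumnContent("System Type", "str", True)
    organization = ColumnContent("Organization", "str", True, never_hidden=True)
    success_rate = ColumnContent("Success Rate (%)", "number", True)


def fields(raw_class):
    # Collect the ColumnContent class attributes (same helper as src/display/utils.py).
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]


# Dummy data matching the commit's placeholder DataFrame.
dummy_df = pd.DataFrame(
    {
        AutoEvalColumn.system.name: ["Model A", "Model B", "Model C"],
        AutoEvalColumn.system_type.name: ["LLM", "LLM+Agent", "N/A"],
        AutoEvalColumn.organization.name: ["Org A", "Org B", "Org C"],
        AutoEvalColumn.success_rate.name: [0.01, 0.0, 0.005],
    }
)

with gr.Blocks() as demo:
    Leaderboard(
        value=dummy_df,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.system.name, AutoEvalColumn.system_type.name],
        filter_columns=[ColumnFilter(AutoEvalColumn.system_type.name, type="checkboxgroup", label="Model types")],
        interactive=False,
    )

if __name__ == "__main__":
    demo.launch()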
src/about.py
CHANGED
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
 from enum import Enum
 
+
 @dataclass
 class Task:
     benchmark: str
@@ -8,17 +9,17 @@ class Task:
     col_name: str
 
 
-#
-# # ---------------------------------------------------
-# class Tasks(Enum):
-#     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-#     task0 = Task("anli_r1", "acc", "ANLI")
-#     task1 = Task("logiqa", "acc_norm", "LogiQA")
-
-NUM_FEWSHOT = 0 # Change with your few shot
+# Select your tasks here
 # ---------------------------------------------------
+class Tasks(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("FormulaOne", "success_rate", "Success Rate (%)")
+    # task1 = Task("logiqa", "acc_norm", "LogiQA")
 
 
+NUM_FEWSHOT = 0 # Change with your few shot
+# ---------------------------------------------------
+
 
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">AAI FormulaOne Leaderboard</h1>"""
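
For reference, a small hypothetical illustration, not code from this commit, of how the revived Tasks enum is read downstream; it assumes the Space's src package is importable, and the field order follows the comment in the class body (task key, metric key, display column name).

# Illustrative only; not part of the commit.
from src.about import Tasks

for task in Tasks:
    # Each member wraps Task(benchmark, metric key, display column name).
    print(task.name, task.value.benchmark, task.value.col_name)
    # -> task0 FormulaOne Success Rate (%)

# The still commented-out BENCHMARK_COLS in src/display/utils.py is derived the same way:
benchmark_cols = [t.value.col_name for t in Tasks]  # ["Success Rate (%)"]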
src/display/utils.py
CHANGED
@@ -1,10 +1,12 @@
-from dataclasses import dataclass, make_dataclass
+from dataclasses import dataclass, field, make_dataclass
+from typing import ClassVar
 from enum import Enum
 
 import pandas as pd
 
 from src.about import Tasks
 
+
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
@@ -20,28 +22,49 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
 
+
 ## Leaderboard columns
-
-# Init
-
-
-#Scores
-
-for task in Tasks:
-
-# Model information
-
-
-
-
-
-
-
-
-
-#
-
+# auto_eval_column_fields = []
+# # Init
+# auto_eval_column_fields.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+# auto_eval_column_fields.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+# # Scores
+# auto_eval_column_fields.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+# for task in Tasks:
+#     auto_eval_column_fields.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# # Model information
+# auto_eval_column_fields.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+# auto_eval_column_fields.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+# auto_eval_column_fields.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+# auto_eval_column_fields.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+# auto_eval_column_fields.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+# auto_eval_column_fields.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+# auto_eval_column_fields.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+# auto_eval_column_fields.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+# auto_eval_column_fields.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+#
+#
+#
+# def make_classvar_dataclass(name: str, spec: list):
+#     ns = {"__annotations__": {}}
+#     for field_name, field_type, default in spec:
+#         # Mark as ClassVar so dataclass doesn't treat it as an instance field
+#         ns["__annotations__"][field_name] = ClassVar[field_type]
+#         ns[field_name] = default
+#     # No instance fields; just class-level descriptors
+#     return make_dataclass(name, [], frozen=True, namespace=ns)
+#
+# # We use make dataclass to dynamically fill the scores from Tasks
+# AutoEvalColumn = make_classvar_dataclass("AutoEvalColumn", auto_eval_column_fields)
+
+@dataclass(frozen=True)
+class AutoEvalColumn:
+    system = ColumnContent("System Name", "markdown", True, never_hidden=True)
+    system_type = ColumnContent("System Type", "str", True)
+    organization = ColumnContent("Organization", "str", True, never_hidden=True)
+    success_rate = ColumnContent("Success Rate (%)", "number", True)
+
+
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
@@ -53,12 +76,13 @@ class EvalQueueColumn: # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
+
 ## All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
     display_name: str = ""
-    symbol: str = ""
+    symbol: str = ""  # emoji
 
 
 class ModelType(Enum):
@@ -83,11 +107,13 @@ class ModelType(Enum):
             return ModelType.IFT
         return ModelType.Unknown
 
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
 
+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
@@ -100,11 +126,11 @@ class Precision(Enum):
             return Precision.bfloat16
         return Precision.Unknown
 
+
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
+# BENCHMARK_COLS = [t.value.col_name for t in Tasks]
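
For reference, a hypothetical illustration, not code from this commit, of what the fields() helper and COLS evaluate to for the new AutoEvalColumn. The printed values assume ColumnContent's positional fields are (name, type, displayed_by_default, hidden, never_hidden), which is how init_leaderboard in app.py reads them.

# Illustrative only; not part of the commit.
from src.display.utils import AutoEvalColumn, COLS, fields

for col in fields(AutoEvalColumn):
    print(col.name, col.type, col.displayed_by_default, col.never_hidden)
# System Name markdown True True
# System Type str True False
# Organization str True True
# Success Rate (%) number True False

print(COLS)
# ['System Name', 'System Type', 'Organization', 'Success Rate (%)']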
|