Alvinn-aai committed
Commit 7d20cd0 · Parent: a44350f

revive leaderboard, fill with dummy data

Files changed (3):
  1. app.py +63 -46
  2. src/about.py +9 -8
  3. src/display/utils.py +51 -25
app.py CHANGED
@@ -1,9 +1,10 @@
 from functools import partial
 
 import gradio as gr
-# from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
+
 # from huggingface_hub import snapshot_download
 
 from src.about import (
@@ -17,27 +18,31 @@ from src.about import (
 from src.datamodel.data import F1Data
 
 from src.display.css_html_js import custom_css
-# from src.display.utils import (
-#     BENCHMARK_COLS,
-#     COLS,
-#     EVAL_COLS,
-#     EVAL_TYPES,
-#     AutoEvalColumn,
-#     ModelType,
-#     fields,
-#     WeightType,
-#     Precision
-# )
+
+from src.display.utils import (
+    # BENCHMARK_COLS,
+    COLS,
+    EVAL_COLS,
+    EVAL_TYPES,
+    AutoEvalColumn,
+    ModelType,
+    fields,
+    WeightType,
+    Precision,
+)
 from src.envs import API, REPO_ID, TOKEN, CODE_PROBLEMS_REPO, SUBMISSIONS_REPO, RESULTS_REPO
 from src.logger import get_logger
-# from src.populate import get_evaluation_queue_df, get_leaderboard_df
+
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_solutions
 
 logger = get_logger(__name__)
 
+
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
+
 lbdb = F1Data(cp_ds_name=CODE_PROBLEMS_REPO, sub_ds_name=SUBMISSIONS_REPO, res_ds_name=RESULTS_REPO)
 
 logger.info("Initialized LBDB")
@@ -48,36 +53,35 @@ logger.info("Initialized LBDB")
 #     pending_eval_queue_df,
 # ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
-# def init_leaderboard(dataframe):
-#     if dataframe is None or dataframe.empty:
-#         raise ValueError("Leaderboard DataFrame is empty or None.")
-#     return Leaderboard(
-#         value=dataframe,
-#         datatype=[c.type for c in fields(AutoEvalColumn)],
-#         select_columns=SelectColumns(
-#             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-#             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-#             label="Select Columns to Display:",
-#         ),
-#         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-#         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-#         filter_columns=[
-#             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-#             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-#             ColumnFilter(
-#                 AutoEvalColumn.params.name,
-#                 type="slider",
-#                 min=0.01,
-#                 max=150,
-#                 label="Select the number of parameters (B)",
-#             ),
-#             ColumnFilter(
-#                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-#             ),
-#         ],
-#         bool_checkboxgroup_label="Hide models",
-#         interactive=False,
-#     )
+
+def init_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[AutoEvalColumn.system.name, AutoEvalColumn.system_type.name],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.system_type.name, type="checkboxgroup", label="Model types"),
+            # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            # ColumnFilter(
+            #     AutoEvalColumn.params.name,
+            #     type="slider",
+            #     min=0.01,
+            #     max=150,
+            #     label="Select the number of parameters (B)",
+            # ),
+            # ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
+        ],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )
 
 
 demo = gr.Blocks(css=custom_css)
@@ -86,8 +90,19 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        # with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-        #     leaderboard = init_leaderboard(LEADERBOARD_DF)
+        with gr.TabItem("🏅 FormulaOne Leaderboard", elem_id="formulaone-leaderboar-tab-table", id=0):
+            # TODO: activate
+            # leaderboard_df = get_leaderboard_df
+            # dummy df
+            leaderboard_df = pd.DataFrame(
+                {
+                    AutoEvalColumn.system.name: ["Model A", "Model B", "Model C"],  # AutoEvalColumn.model.name
+                    AutoEvalColumn.system_type.name: ["LLM", "LLM+Agent", "N/A"],  # AutoEvalColumn.model_type.name
+                    AutoEvalColumn.organization.name: ["Org A", "Org B", "Org C"],  # AutoEvalColumn.organization.name
+                    AutoEvalColumn.success_rate.name: [0.01, 0.0, 0.005],
+                }
+            )
+            leaderboard = init_leaderboard(leaderboard_df)
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
             logger.info("Tab about")
@@ -149,7 +164,7 @@ with demo:
             #     interactive=True,
            # )
 
-            # with gr.Column():
+            # with gr.Column():
            submission_file = gr.File(label="JSONL solutions file", file_types=[".jsonl"])
            # precision = gr.Dropdown(
            #     choices=[i.value.name for i in Precision if i != Precision.Unknown],
@@ -170,8 +185,10 @@
            logger.info("Submut button")
            submit_button = gr.Button("Submit")
            submission_result = gr.Markdown()
+
            def add_solution_cbk(submitter, submission_path):
                return add_new_solutions(lbdb, submitter, submission_path)
+
            submit_button.click(
                add_solution_cbk,
                [
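
For context, a minimal standalone sketch of the wiring this commit revives: a dummy DataFrame rendered through gradio_leaderboard. This is a hedged sketch, not the Space's code; it assumes gradio_leaderboard is installed, and the column-name constants are stand-ins for the AutoEvalColumn display names rather than imports from src/display/utils.py.

# Hedged sketch only: mirrors app.py's init_leaderboard() call pattern, not part of the commit.
import gradio as gr
import pandas as pd
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns

# Stand-ins for the AutoEvalColumn display names defined in src/display/utils.py.
SYSTEM, SYSTEM_TYPE, ORG, SUCCESS = "System Name", "System Type", "Organization", "Success Rate (%)"

dummy_df = pd.DataFrame(
    {
        SYSTEM: ["Model A", "Model B", "Model C"],
        SYSTEM_TYPE: ["LLM", "LLM+Agent", "N/A"],
        ORG: ["Org A", "Org B", "Org C"],
        SUCCESS: [0.01, 0.0, 0.005],
    }
)

with gr.Blocks() as sketch_demo:
    Leaderboard(
        value=dummy_df,
        datatype=["markdown", "str", "str", "number"],
        select_columns=SelectColumns(
            default_selection=list(dummy_df.columns),
            cant_deselect=[SYSTEM, ORG],
            label="Select Columns to Display:",
        ),
        search_columns=[SYSTEM, SYSTEM_TYPE],
        filter_columns=[ColumnFilter(SYSTEM_TYPE, type="checkboxgroup", label="Model types")],
        interactive=False,
    )

if __name__ == "__main__":
    sketch_demo.launch()

Per the commit's own TODO comment, the dummy DataFrame is a placeholder until it is replaced by the output of src.populate.get_leaderboard_df.
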
src/about.py CHANGED
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
 from enum import Enum
 
+
 @dataclass
 class Task:
     benchmark: str
@@ -8,17 +9,17 @@ class Task:
     col_name: str
 
 
-# # Select your tasks here
-# # ---------------------------------------------------
-# class Tasks(Enum):
-#     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-#     task0 = Task("anli_r1", "acc", "ANLI")
-#     task1 = Task("logiqa", "acc_norm", "LogiQA")
-
-NUM_FEWSHOT = 0 # Change with your few shot
+# Select your tasks here
 # ---------------------------------------------------
+class Tasks(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("FormulaOne", "success_rate", "Success Rate (%)")
+    # task1 = Task("logiqa", "acc_norm", "LogiQA")
 
 
+NUM_FEWSHOT = 0 # Change with your few shot
+# ---------------------------------------------------
+
 
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">AAI FormulaOne Leaderboard</h1>"""
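
The re-enabled Tasks enum is what downstream code iterates to turn raw results into display columns (see the commented-out loop and the BENCHMARK_COLS comment in src/display/utils.py). A small illustrative sketch, assuming the template's usual middle field name metric for Task and a hypothetical results-dict shape:

# Illustrative only; the real results schema lives in src/populate.py and is not shown in this diff.
from src.about import Tasks

for task in Tasks:
    # Each entry carries the task key, the metric key, and the column header to display.
    print(task.name, task.value.benchmark, task.value.metric, task.value.col_name)
# task0 FormulaOne success_rate Success Rate (%)

raw_result = {"FormulaOne": {"success_rate": 0.01}}  # hypothetical per-submission payload
row = {t.value.col_name: raw_result[t.value.benchmark][t.value.metric] for t in Tasks}
# row == {"Success Rate (%)": 0.01}
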
src/display/utils.py CHANGED
@@ -1,10 +1,12 @@
-from dataclasses import dataclass, make_dataclass
+from dataclasses import dataclass, field, make_dataclass
+from typing import ClassVar
 from enum import Enum
 
 import pandas as pd
 
 from src.about import Tasks
 
+
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
@@ -20,28 +22,49 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
 
+
 ## Leaderboard columns
-auto_eval_column_dict = []
-# Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-
-# We use make dataclass to dynamically fill the scores from Tasks
-AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+# auto_eval_column_fields = []
+# # Init
+# auto_eval_column_fields.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+# auto_eval_column_fields.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+# # Scores
+# auto_eval_column_fields.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+# for task in Tasks:
+#     auto_eval_column_fields.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# # Model information
+# auto_eval_column_fields.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+# auto_eval_column_fields.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+# auto_eval_column_fields.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+# auto_eval_column_fields.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+# auto_eval_column_fields.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+# auto_eval_column_fields.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+# auto_eval_column_fields.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+# auto_eval_column_fields.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+# auto_eval_column_fields.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+#
+#
+#
+# def make_classvar_dataclass(name: str, spec: list):
+#     ns = {"__annotations__": {}}
+#     for field_name, field_type, default in spec:
+#         # Mark as ClassVar so dataclass doesn't treat it as an instance field
+#         ns["__annotations__"][field_name] = ClassVar[field_type]
+#         ns[field_name] = default
+#     # No instance fields; just class-level descriptors
+#     return make_dataclass(name, [], frozen=True, namespace=ns)
+#
+# # We use make dataclass to dynamically fill the scores from Tasks
+# AutoEvalColumn = make_classvar_dataclass("AutoEvalColumn", auto_eval_column_fields)
+
+@dataclass(frozen=True)
+class AutoEvalColumn:
+    system = ColumnContent("System Name", "markdown", True, never_hidden=True)
+    system_type = ColumnContent("System Type", "str", True)
+    organization = ColumnContent("Organization", "str", True, never_hidden=True)
+    success_rate = ColumnContent("Success Rate (%)", "number", True)
+
+
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
@@ -53,12 +76,13 @@ class EvalQueueColumn: # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
+
 ## All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
     display_name: str = ""
-    symbol: str = "" # emoji
+    symbol: str = "" # emoji
 
 
 class ModelType(Enum):
@@ -83,11 +107,13 @@ class ModelType(Enum):
             return ModelType.IFT
         return ModelType.Unknown
 
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
 
+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
@@ -100,11 +126,11 @@ class Precision(Enum):
             return Precision.bfloat16
         return Precision.Unknown
 
+
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
+# BENCHMARK_COLS = [t.value.col_name for t in Tasks]
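
Worth noting: the module's fields() helper reads attributes off the class __dict__ rather than using dataclasses.fields, which is why the new hand-written AutoEvalColumn (plain class-level ColumnContent values, no annotations) works without the commented-out ClassVar/make_dataclass detour. A self-contained sketch of that behaviour, assuming the ColumnContent definition shown above:

# Sketch only: reproduces the fields()/COLS mechanics from src/display/utils.py.
from dataclasses import dataclass


@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


def fields(raw_class):
    # Collect non-dunder class attributes; for AutoEvalColumn these are the ColumnContent instances.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]


@dataclass(frozen=True)
class AutoEvalColumn:
    system = ColumnContent("System Name", "markdown", True, never_hidden=True)
    system_type = ColumnContent("System Type", "str", True)
    organization = ColumnContent("Organization", "str", True, never_hidden=True)
    success_rate = ColumnContent("Success Rate (%)", "number", True)


COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
print(COLS)  # ['System Name', 'System Type', 'Organization', 'Success Rate (%)']

With no annotated fields, @dataclass(frozen=True) generates no instance fields here, so the ColumnContent values remain ordinary class attributes that fields() picks up; COLS then carries the display names that the dummy DataFrame keys in app.py must match.
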