annamonica committed
Commit bca9693 · 1 Parent(s): d4cb50a

add basic static leaderboard

Files changed (2)
  1. app.py +30 -106
  2. src/populate.py +54 -10
app.py CHANGED
@@ -22,7 +22,7 @@ from src.display.utils import (
     ModelType,
     fields,
     WeightType,
-    Precision
+    Precision,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
@@ -32,24 +32,40 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
+
 ### Space initialisation
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=RESULTS_REPO,
+        local_dir=EVAL_RESULTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
    )
 except Exception:
     restart_space()
 
 
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+LEADERBOARD_DF = get_leaderboard_df(
+    EVAL_RESULTS_PATH + "/" + "BOOM_leaderboard.csv", EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
+)
+LEADERBOARD_DF_DOMAIN = get_leaderboard_df(
+    EVAL_RESULTS_PATH + "/" + "BOOM_leaderboard.csv", EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
+)
 
 (
     finished_eval_queue_df,
@@ -57,7 +73,9 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
+
 def init_leaderboard(dataframe):
+    # TODO: merge results df with model info df
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
     return Leaderboard(
@@ -68,21 +86,10 @@ def init_leaderboard(dataframe):
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        search_columns=[AutoEvalColumn.model.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
@@ -95,98 +102,15 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+        with gr.TabItem("🏅 Overall", elem_id="boom-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(LEADERBOARD_DF)
 
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+        # TODO - add other tabs if needed
+        with gr.TabItem("🏅 By Domain - TODO", elem_id="boom-benchmark-tab-table", id=1):
+            leaderboard = init_leaderboard(LEADERBOARD_DF_DOMAIN)  # TODO - update table data
 
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-                submit_button = gr.Button("Submit Eval")
-                submission_result = gr.Markdown()
-                submit_button.click(
-                    add_new_eval,
-                    [
-                        model_name_textbox,
-                        base_model_name_textbox,
-                        revision_name_textbox,
-                        precision,
-                        weight_type,
-                        model_type,
-                    ],
-                    submission_result,
-                )
+        with gr.TabItem("📝 About - TODO", elem_id="boom-benchmark-tab-table", id=2):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
@@ -201,4 +125,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
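Note (not part of the commit): with this change the Overall tab is driven entirely by the static BOOM_leaderboard.csv shipped in the results snapshot, so the new path can be sanity-checked without Gradio. A minimal sketch, assuming it runs from the Space root with the results dataset already snapshot-downloaded into EVAL_RESULTS_PATH; the last three arguments are stubbed because the rewritten get_leaderboard_df (see src/populate.py below) ignores them:

from src.envs import EVAL_RESULTS_PATH
from src.populate import get_leaderboard_df

# Load the static leaderboard CSV the same way app.py now does.
df = get_leaderboard_df(
    EVAL_RESULTS_PATH + "/" + "BOOM_leaderboard.csv",
    requests_path=None,  # unused by the static CSV implementation
    cols=[],             # unused
    benchmark_cols=[],   # unused
)
print(df.head())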
src/populate.py CHANGED
@@ -2,23 +2,65 @@ import json
 import os
 
 import pandas as pd
-
+from dataclasses import fields
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
+from src.display.utils import ModelType
+
+
+# def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+#     """Creates a dataframe from all the individual experiment results"""
+#     raw_data = get_raw_eval_results(results_path, requests_path)
+#     all_data_json = [v.to_dict() for v in raw_data]
+
+#     df = pd.DataFrame.from_records(all_data_json)
+#     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+#     df = df[cols].round(decimals=2)
+
+#     # filter out if any of the benchmarks have not been produced
+#     df = df[has_no_nan_values(df, benchmark_cols)]
+#     return df
 
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
+    """
+    Processes a STATIC results CSV file to generate a leaderboard DataFrame with formatted columns and sorted values.
+    Args:
+        results_path (str): The file path to the results CSV file.
+    Returns:
+        pd.DataFrame: A processed DataFrame with renamed columns, additional formatting, and sorted values.
+    Notes:
+        - The function reads a CSV file from the given `results_path`.
+        - Internal column names are mapped to display names using `AutoEvalColumn`.
+        - A new column for model type symbols is created by parsing the `model_type` column.
+        - The `model_type` column is updated to prepend the model type symbol.
+        - The DataFrame is sorted by the `Rank_6750_scaled` column in ascending order.
+    """
+
+    df = pd.read_csv(results_path)
+    # Create the mapping from internal column name to display name
+
+    column_mapping = {field.name: getattr(AutoEvalColumn, field.name).name for field in fields(AutoEvalColumn)}
+    # Assuming `df` is your DataFrame:
+    df.rename(columns=column_mapping, inplace=True)
+
+    # Create a new column for model type symbol by parsing the model_type column
+    df[AutoEvalColumn.model_type_symbol.name] = df[AutoEvalColumn.model_type.name].apply(
+        lambda x: ModelType.from_str(x).value.symbol
+    )
+    # Prepend the value of model_type_symbol to the value of model_type
+    df[AutoEvalColumn.model_type.name] = (
+        df[AutoEvalColumn.model_type_symbol.name] + " " + df[AutoEvalColumn.model_type.name]
+    )
 
-    df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)
+    # Move the model_type_symbol column to the beginning
+    cols = [AutoEvalColumn.model_type_symbol.name] + [
+        col for col in df.columns if col != AutoEvalColumn.model_type_symbol.name
+    ]
+    df = df[cols]
 
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
+    df = df.sort_values(by=[AutoEvalColumn.Rank_6750_scaled.name], ascending=True)
     return df
 
 
@@ -39,7 +81,9 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             all_evals.append(data)
         elif ".md" not in entry:
             # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
+            sub_entries = [
+                e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")
+            ]
             for sub_entry in sub_entries:
                 file_path = os.path.join(save_path, entry, sub_entry)
                 with open(file_path) as fp:
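Note (illustrative only): the reworked get_leaderboard_df boils down to three pandas steps: rename raw CSV columns to display names, derive a symbol column from model_type and prepend it to the label, then reorder the columns and sort by rank. A self-contained sketch with made-up column names and symbols (the real ones come from AutoEvalColumn and ModelType in the Space):

import pandas as pd

# Toy stand-in for BOOM_leaderboard.csv; column names and symbols here are hypothetical.
raw = pd.DataFrame(
    {
        "model": ["model-a", "model-b"],
        "model_type": ["pretrained", "fine-tuned"],
        "Rank_6750_scaled": [2, 1],
    }
)

# 1) map internal column names to display names (the AutoEvalColumn mapping in the Space)
df = raw.rename(columns={"model": "Model", "model_type": "Type", "Rank_6750_scaled": "Rank"})

# 2) derive a type symbol and prepend it to the type label (ModelType.from_str in the Space)
symbols = {"pretrained": "🟢", "fine-tuned": "🔶"}
df["T"] = df["Type"].map(symbols)
df["Type"] = df["T"] + " " + df["Type"]

# 3) move the symbol column first and sort by rank ascending, as the new function does
df = df[["T"] + [c for c in df.columns if c != "T"]].sort_values("Rank")
print(df)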