cdminix committed
Commit 026ee6b
1 Parent(s): 9c2d40e

setup leaderboard

README.md CHANGED
@@ -17,9 +17,9 @@ Results files should have the following format and be stored as json files:
 ```json
 {
     "config": {
-        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
-        "model_name": "path of the model on the hub: org/model",
-        "model_sha": "revision on the hub",
+        "model_name": "name of the model",
+        "model_url": "url of the model",
+        "tags": ["tag1", "tag2"], // e.g. ["flow", "diffusion", "autoregressive", "end-to-end"]
    },
    "results": {
        "task_name": {
@@ -41,4 +41,4 @@ If you encounter problem on the space, don't hesitate to restart it to remove th
 You'll find
 - the main table' columns names and properties in `src/display/utils.py`
 - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
-- teh logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
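Taken together, a results file in the new format could be produced as in the minimal sketch below. Only the `config` keys come from the diff above; the task and metric names under `results` are placeholders, since the diff truncates that part of the schema.

```python
# Hypothetical sketch of writing a results file with the new "config" schema.
# The task/metric names inside "results" are placeholders, not real benchmarks.
import json

result = {
    "config": {
        "model_name": "name of the model",
        "model_url": "url of the model",
        "tags": ["diffusion", "autoregressive"],
    },
    "results": {
        "task_name": {
            "metric_name": 0.0,  # placeholder metric/score pair
        },
    },
}

with open("my_model_results.json", "w") as f:
    json.dump(result, f, indent=2)
```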
app.py CHANGED
@@ -19,10 +19,7 @@ from src.display.utils import (
19
  EVAL_COLS,
20
  EVAL_TYPES,
21
  AutoEvalColumn,
22
- ModelType,
23
  fields,
24
- WeightType,
25
- Precision,
26
  )
27
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
@@ -80,11 +77,9 @@ def init_leaderboard(dataframe):
80
  cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
81
  label="Select Columns to Display:",
82
  ),
83
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
84
  hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
85
  filter_columns=[
86
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
87
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
88
  ColumnFilter(
89
  AutoEvalColumn.params.name,
90
  type="slider",
@@ -92,7 +87,6 @@ def init_leaderboard(dataframe):
92
  max=150,
93
  label="Select the number of parameters (B)",
94
  ),
95
- ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
96
  ],
97
  bool_checkboxgroup_label="Hide models",
98
  interactive=False,
@@ -100,129 +94,97 @@ def init_leaderboard(dataframe):
100
 
101
 
102
  def show_leaderboard(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None):
103
- global demo
104
-
105
  if profile or True:
106
- print(f"Logged in as {profile.name}")
107
- with demo:
108
- gr.HTML(TITLE)
109
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
110
 
111
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
112
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
113
- leaderboard = init_leaderboard(LEADERBOARD_DF)
114
 
115
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
116
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
117
 
118
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
119
- with gr.Column():
120
- with gr.Row():
121
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
122
-
123
- with gr.Column():
124
- with gr.Accordion(
125
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
126
- open=False,
127
- ):
128
- with gr.Row():
129
- finished_eval_table = gr.components.Dataframe(
130
- value=finished_eval_queue_df,
131
- headers=EVAL_COLS,
132
- datatype=EVAL_TYPES,
133
- row_count=5,
134
- )
135
- with gr.Accordion(
136
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
137
- open=False,
138
- ):
139
- with gr.Row():
140
- running_eval_table = gr.components.Dataframe(
141
- value=running_eval_queue_df,
142
- headers=EVAL_COLS,
143
- datatype=EVAL_TYPES,
144
- row_count=5,
145
- )
146
-
147
- with gr.Accordion(
148
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
149
- open=False,
150
- ):
151
- with gr.Row():
152
- pending_eval_table = gr.components.Dataframe(
153
- value=pending_eval_queue_df,
154
- headers=EVAL_COLS,
155
- datatype=EVAL_TYPES,
156
- row_count=5,
157
- )
158
  with gr.Row():
159
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
160
 
161
- with gr.Row():
162
- with gr.Column():
163
- model_name_textbox = gr.Textbox(label="Model name")
164
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
165
- model_type = gr.Dropdown(
166
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
167
- label="Model type",
168
- multiselect=False,
169
- value=None,
170
- interactive=True,
171
- )
172
-
173
- with gr.Column():
174
- precision = gr.Dropdown(
175
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
176
- label="Precision",
177
- multiselect=False,
178
- value="float16",
179
- interactive=True,
180
- )
181
- weight_type = gr.Dropdown(
182
- choices=[i.value.name for i in WeightType],
183
- label="Weights type",
184
- multiselect=False,
185
- value="Original",
186
- interactive=True,
187
- )
188
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
189
-
190
- submit_button = gr.Button("Submit Eval")
191
- submission_result = gr.Markdown()
192
- # submit_button.click(
193
- # add_new_eval,
194
- # [
195
- # model_name_textbox,
196
- # base_model_name_textbox,
197
- # revision_name_textbox,
198
- # precision,
199
- # weight_type,
200
- # model_type,
201
- # ],
202
- # submission_result,
203
- # )
204
-
205
- with gr.Row():
206
- with gr.Accordion("📙 Citation", open=False):
207
- citation_button = gr.Textbox(
208
- value=CITATION_BUTTON_TEXT,
209
- label=CITATION_BUTTON_LABEL,
210
- lines=20,
211
- elem_id="citation-button",
212
- show_copy_button=True,
213
- )
214
 
215
 
216
  demo = gr.Blocks(css=custom_css)
217
 
218
  with demo:
219
- gr.LoginButton()
220
  m1 = gr.Markdown("Please login to see the leaderboard.")
221
- demo.load(show_leaderboard, inputs=None, outputs=m1)
 
222
 
223
 
224
  scheduler = BackgroundScheduler()
225
  scheduler.add_job(restart_space, "interval", seconds=1800)
226
  scheduler.start()
227
- # demo.queue(default_concurrency_limit=40).launch()
228
- demo.launch()
 
 
19
  EVAL_COLS,
20
  EVAL_TYPES,
21
  AutoEvalColumn,
 
22
  fields,
 
 
23
  )
24
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
25
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
 
77
  cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
78
  label="Select Columns to Display:",
79
  ),
80
+ search_columns=[AutoEvalColumn.model.name],
81
  hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
82
  filter_columns=[
 
 
83
  ColumnFilter(
84
  AutoEvalColumn.params.name,
85
  type="slider",
 
87
  max=150,
88
  label="Select the number of parameters (B)",
89
  ),
 
90
  ],
91
  bool_checkboxgroup_label="Hide models",
92
  interactive=False,
 
94
 
95
 
96
  def show_leaderboard(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None):
 
 
97
  if profile or True:
98
+ gr.HTML(TITLE)
99
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
 
100
 
101
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
102
+ with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
103
+ leaderboard = init_leaderboard(LEADERBOARD_DF)
104
 
105
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
106
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
107
 
108
+ with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
109
+ with gr.Column():
 
110
  with gr.Row():
111
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
112
 
113
+ with gr.Column():
114
+ with gr.Accordion(
115
+ f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
116
+ open=False,
117
+ ):
118
+ with gr.Row():
119
+ finished_eval_table = gr.components.Dataframe(
120
+ value=finished_eval_queue_df,
121
+ headers=EVAL_COLS,
122
+ datatype=EVAL_TYPES,
123
+ row_count=5,
124
+ )
125
+ with gr.Accordion(
126
+ f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
127
+ open=False,
128
+ ):
129
+ with gr.Row():
130
+ running_eval_table = gr.components.Dataframe(
131
+ value=running_eval_queue_df,
132
+ headers=EVAL_COLS,
133
+ datatype=EVAL_TYPES,
134
+ row_count=5,
135
+ )
136
+
137
+ with gr.Accordion(
138
+ f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
139
+ open=False,
140
+ ):
141
+ with gr.Row():
142
+ pending_eval_table = gr.components.Dataframe(
143
+ value=pending_eval_queue_df,
144
+ headers=EVAL_COLS,
145
+ datatype=EVAL_TYPES,
146
+ row_count=5,
147
+ )
148
+ with gr.Row():
149
+ gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
150
+
151
+ with gr.Row():
152
+ with gr.Column():
153
+ model_name_textbox = gr.Textbox(label="Model name")
154
+
155
+ submit_button = gr.Button("Submit Eval")
156
+ submission_result = gr.Markdown()
157
+ submit_button.click(
158
+ add_new_eval,
159
+ [
160
+ model_name_textbox,
161
+ ],
162
+ submission_result,
163
+ )
164
+
165
+ with gr.Row():
166
+ with gr.Accordion("📙 Citation", open=False):
167
+ citation_button = gr.Textbox(
168
+ value=CITATION_BUTTON_TEXT,
169
+ label=CITATION_BUTTON_LABEL,
170
+ lines=20,
171
+ elem_id="citation-button",
172
+ show_copy_button=True,
173
+ )
174
 
175
 
176
  demo = gr.Blocks(css=custom_css)
177
 
178
  with demo:
179
+ # gr.LoginButton()
180
  m1 = gr.Markdown("Please login to see the leaderboard.")
181
+ # demo.load(show_leaderboard, inputs=None, outputs=m1)
182
+ show_leaderboard(None, None)
183
 
184
 
185
  scheduler = BackgroundScheduler()
186
  scheduler.add_job(restart_space, "interval", seconds=1800)
187
  scheduler.start()
188
+
189
+ demo.queue(default_concurrency_limit=40).launch()
190
+ # demo.launch()
new/app.py ADDED
@@ -0,0 +1,270 @@
1
+ from pathlib import Path
2
+ import json
3
+
4
+ import gradio as gr
5
+ from huggingface_hub import snapshot_download
6
+ from gradio_leaderboard import Leaderboard, SelectColumns
7
+ import pandas as pd
8
+ from apscheduler.schedulers.background import BackgroundScheduler
9
+ from ttsdb.benchmarks.benchmark import BenchmarkCategory
10
+ from ttsdb import BenchmarkSuite
11
+
12
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN, TAGS
13
+ from src.texts import LLM_BENCHMARKS_TEXT, EVALUATION_QUEUE_TEXT
14
+ from src.css_html_js import custom_css
15
+
16
+
17
+ def filter_dfs(tags, lb):
18
+ global f_b_df, f_a_df
19
+ is_agg = False
20
+ if "Environment" in lb.columns:
21
+ is_agg = True
22
+ if is_agg:
23
+ lb = f_a_df.copy()
24
+ else:
25
+ lb = f_b_df.copy()
26
+ if tags and len(lb) > 0:
27
+ lb = lb[lb["Tags"].apply(lambda x: any(tag in x for tag in tags))]
28
+ return lb
29
+
30
+
31
+ def restart_space():
32
+ API.restart_space(repo_id=REPO_ID)
33
+
34
+
35
+ def submit_eval(model_name, model_tags, web_url, hf_url, code_url, paper_url, inference_details, file_path):
36
+ model_id = model_name.lower().replace(" ", "_")
37
+ # check if model already exists
38
+ if Path(f"{EVAL_REQUESTS_PATH}/{model_id}.json").exists():
39
+ return "Model already exists in the evaluation queue"
40
+ # check which urls are valid
41
+ if web_url and not web_url.startswith("http"):
42
+ return "Please enter a valid URL"
43
+ if hf_url and not hf_url.startswith("http"):
44
+ return "Please enter a valid URL"
45
+ if code_url and not code_url.startswith("http"):
46
+ return "Please enter a valid URL"
47
+ if paper_url and not paper_url.startswith("http"):
48
+ return "Please enter a valid URL"
49
+ # move file to correct location
50
+ if not file_path.endswith(".tar.gz"):
51
+ return "Please upload a .tar.gz file"
52
+ Path(file_path).rename(f"{EVAL_REQUESTS_PATH}/{model_id}.tar.gz")
53
+ # build display name - use web_url to link text if available, and emojis for the other urls
54
+ display_name = model_name
55
+ if web_url:
56
+ display_name = f"[{display_name}]({web_url}) "
57
+ if hf_url:
58
+ display_name += f"[🤗]({hf_url})"
59
+ if code_url:
60
+ display_name += f"[💻]({code_url})"
61
+ if paper_url:
62
+ display_name += f"[📄]({paper_url})"
63
+ request_obj = {
64
+ "model_name": model_name,
65
+ "display_name": display_name,
66
+ "model_tags": model_tags,
67
+ "web_url": web_url,
68
+ "hf_url": hf_url,
69
+ "code_url": code_url,
70
+ "paper_url": paper_url,
71
+ "inference_details": inference_details,
72
+ "status": "pending",
73
+ }
74
+ with open(f"{EVAL_REQUESTS_PATH}/{model_id}.json", "w") as f:
75
+ json.dump(request_obj, f)
76
+ API.upload_file(
77
+ path_or_fileobj=f"{EVAL_REQUESTS_PATH}/{model_id}.json",
78
+ path_in_repo=f"{model_id}.json",
79
+ repo_id=QUEUE_REPO,
80
+ repo_type="dataset",
81
+ commit_message=f"Add {model_name} to evaluation queue",
82
+ )
83
+ API.upload_file(
84
+ path_or_fileobj=f"{EVAL_REQUESTS_PATH}/{model_id}.tar.gz",
85
+ path_in_repo=f"{model_id}.tar.gz",
86
+ repo_id=QUEUE_REPO,
87
+ repo_type="dataset",
88
+ commit_message=f"Add {model_name} to evaluation queue",
89
+ )
90
+ return "Model submitted successfully 🎉"
91
+
92
+
93
+ ### Space initialisation
94
+ try:
95
+ print(EVAL_REQUESTS_PATH)
96
+ snapshot_download(
97
+ repo_id=QUEUE_REPO,
98
+ local_dir=EVAL_REQUESTS_PATH,
99
+ repo_type="dataset",
100
+ tqdm_class=None,
101
+ etag_timeout=30,
102
+ token=TOKEN,
103
+ )
104
+ except Exception:
105
+ restart_space()
106
+ try:
107
+ print(EVAL_RESULTS_PATH)
108
+ snapshot_download(
109
+ repo_id=RESULTS_REPO,
110
+ local_dir=EVAL_RESULTS_PATH,
111
+ repo_type="dataset",
112
+ tqdm_class=None,
113
+ etag_timeout=30,
114
+ token=TOKEN,
115
+ )
116
+ except Exception:
117
+ restart_space()
118
+
119
+
120
+ results_df = pd.read_csv(EVAL_RESULTS_PATH + "/results.csv")
121
+
122
+ agg_df = BenchmarkSuite.aggregate_df(results_df)
123
+ agg_df = agg_df.pivot(index="dataset", columns="benchmark_category", values="score")
124
+ agg_df.rename(columns={"OVERALL": "General"}, inplace=True)
125
+ agg_df.columns = [x.capitalize() for x in agg_df.columns]
126
+ agg_df["Mean"] = agg_df.mean(axis=1)
127
+ # make sure mean is the first column
128
+ agg_df = agg_df[["Mean"] + [col for col in agg_df.columns if col != "Mean"]]
129
+ for col in agg_df.columns:
130
+ agg_df[col] = agg_df[col].apply(lambda x: round(x, 2))
131
+ agg_df["Tags"] = ""
132
+ agg_df.reset_index(inplace=True)
133
+ agg_df.rename(columns={"dataset": "Model"}, inplace=True)
134
+ agg_df.sort_values("Mean", ascending=False, inplace=True)
135
+
136
+ benchmark_df = results_df.pivot(index="dataset", columns="benchmark_name", values="score")
137
+
138
+ # get benchmark name order by category
139
+ benchmark_order = list(results_df.sort_values("benchmark_category")["benchmark_name"].unique())
140
+ benchmark_df = benchmark_df[benchmark_order]
141
+ benchmark_df = benchmark_df.reset_index()
142
+ benchmark_df.rename(columns={"dataset": "Model"}, inplace=True)
143
+ # set index
144
+ benchmark_df.set_index("Model", inplace=True)
145
+ benchmark_df["Mean"] = benchmark_df.mean(axis=1)
146
+ # make sure mean is the first column
147
+ benchmark_df = benchmark_df[["Mean"] + [col for col in benchmark_df.columns if col != "Mean"]]
148
+ # round all
149
+ for col in benchmark_df.columns:
150
+ benchmark_df[col] = benchmark_df[col].apply(lambda x: round(x, 2))
151
+ benchmark_df["Tags"] = ""
152
+ benchmark_df.reset_index(inplace=True)
153
+ benchmark_df.sort_values("Mean", ascending=False, inplace=True)
154
+
155
+ # get details for each model
156
+ model_detail_files = Path(EVAL_REQUESTS_PATH).glob("*.json")
157
+ model_details = {}
158
+ for model_detail_file in model_detail_files:
159
+ with open(model_detail_file) as f:
160
+ model_detail = json.load(f)
161
+ model_details[model_detail_file.stem] = model_detail
162
+
163
+ # replace .tar.gz
164
+ benchmark_df["Model"] = benchmark_df["Model"].apply(lambda x: x.replace(".tar.gz", ""))
165
+ agg_df["Model"] = agg_df["Model"].apply(lambda x: x.replace(".tar.gz", ""))
166
+
167
+ benchmark_df["Tags"] = benchmark_df["Model"].apply(lambda x: model_details.get(x, {}).get("model_tags", ""))
168
+ agg_df["Tags"] = agg_df["Model"].apply(lambda x: model_details.get(x, {}).get("model_tags", ""))
169
+
170
+ benchmark_df["Model"] = benchmark_df["Model"].apply(lambda x: model_details.get(x, {}).get("display_name", x))
171
+ agg_df["Model"] = agg_df["Model"].apply(lambda x: model_details.get(x, {}).get("display_name", x))
172
+
173
+ f_b_df = benchmark_df.copy()
174
+ f_a_df = agg_df.copy()
175
+
176
+
177
+ def init_leaderboard(dataframe):
178
+ if dataframe is None or dataframe.empty:
179
+ raise ValueError("Leaderboard DataFrame is empty or None.")
180
+ df_types = []
181
+ for col in dataframe.columns:
182
+ if col == "Model":
183
+ df_types.append("markdown")
184
+ elif col == "Tags":
185
+ df_types.append("markdown")
186
+ else:
187
+ df_types.append("number")
188
+ return Leaderboard(
189
+ value=dataframe,
190
+ select_columns=SelectColumns(
191
+ default_selection=list(dataframe.columns),
192
+ cant_deselect=["Model", "Mean"],
193
+ label="Select Columns to Display:",
194
+ ),
195
+ search_columns=["Model", "Tags"],
196
+ filter_columns=[],
197
+ hide_columns=["Tags"],
198
+ interactive=False,
199
+ datatype=df_types,
200
+ )
201
+
202
+
203
+ app = gr.Blocks(css=custom_css, title="TTS Benchmark Leaderboard")
204
+
205
+ with app:
206
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
207
+ with gr.TabItem("🏅 TTSDB Scores", elem_id="llm-benchmark-tab-table", id=0):
208
+ tags = gr.Dropdown(
209
+ TAGS,
210
+ value=[],
211
+ multiselect=True,
212
+ label="Tags",
213
+ info="Select tags to filter the leaderboard. You can suggest new tags here: https://huggingface.co/spaces/ttsds/benchmark/discussions/1",
214
+ )
215
+ leaderboard = init_leaderboard(f_a_df)
216
+ tags.change(filter_dfs, [tags, leaderboard], [leaderboard])
217
+ with gr.TabItem("🏅 Individual Benchmarks", elem_id="llm-benchmark-tab-table", id=1):
218
+ tags = gr.Dropdown(
219
+ TAGS,
220
+ value=[],
221
+ multiselect=True,
222
+ label="Tags",
223
+ info="Select tags to filter the leaderboard",
224
+ )
225
+ leaderboard = init_leaderboard(f_b_df)
226
+ tags.change(filter_dfs, [tags, leaderboard], [leaderboard])
227
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
228
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
229
+ with gr.TabItem("🚀 Submit here!", elem_id="llm-benchmark-tab-table", id=3):
230
+ with gr.Column():
231
+ with gr.Row():
232
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
233
+ with gr.Row():
234
+ gr.Markdown("# ✉️✨ Submit a TTS dataset here!", elem_classes="markdown-text")
235
+ with gr.Row():
236
+ with gr.Column():
237
+ model_name_textbox = gr.Textbox(label="Model name")
238
+ model_tags_dropdown = gr.Dropdown(
239
+ label="Model tags",
240
+ choices=TAGS,
241
+ multiselect=True,
242
+ )
243
+ website_url_textbox = gr.Textbox(label="Website URL (optional)")
244
+ hf_url_textbox = gr.Textbox(label="Huggingface URL (optional)")
245
+ code_url_textbox = gr.Textbox(label="Code URL (optional)")
246
+ paper_url_textbox = gr.Textbox(label="Paper URL (optional)")
247
+ inference_details_textbox = gr.TextArea(label="Inference details (optional)")
248
+ file_input = gr.File(file_types=[".gz"], interactive=True, label=".tar.gz TTS dataset")
249
+ submit_button = gr.Button("Submit Eval")
250
+ submission_result = gr.Markdown()
251
+ submit_button.click(
252
+ submit_eval,
253
+ [
254
+ model_name_textbox,
255
+ model_tags_dropdown,
256
+ website_url_textbox,
257
+ hf_url_textbox,
258
+ code_url_textbox,
259
+ paper_url_textbox,
260
+ inference_details_textbox,
261
+ file_input,
262
+ ],
263
+ submission_result,
264
+ )
265
+
266
+ scheduler = BackgroundScheduler()
267
+ scheduler.add_job(restart_space, "interval", seconds=1800)
268
+ scheduler.start()
269
+
270
+ app.queue(default_concurrency_limit=40).launch()
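As a note on inputs, the aggregation in `new/app.py` assumes `results.csv` exposes at least the columns used by its pivot calls: `dataset`, `benchmark_name`, `benchmark_category` and `score`. A small, self-contained sketch of the per-benchmark pivot, with made-up rows only for illustration:

```python
# Illustrative only: fabricated rows in the shape new/app.py expects from
# results.csv, pivoted to one row per model and one column per benchmark.
import pandas as pd

results_df = pd.DataFrame(
    [
        {"dataset": "model_a", "benchmark_name": "bench_1", "benchmark_category": "SPEAKER", "score": 88.7},
        {"dataset": "model_a", "benchmark_name": "bench_2", "benchmark_category": "PROSODY", "score": 91.2},
        {"dataset": "model_b", "benchmark_name": "bench_1", "benchmark_category": "SPEAKER", "score": 90.1},
        {"dataset": "model_b", "benchmark_name": "bench_2", "benchmark_category": "PROSODY", "score": 87.4},
    ]
)

# One row per model, one column per benchmark, plus a Mean column.
benchmark_df = results_df.pivot(index="dataset", columns="benchmark_name", values="score")
benchmark_df["Mean"] = benchmark_df.mean(axis=1)
print(benchmark_df.round(2))
```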
new/requirements.txt ADDED
@@ -0,0 +1,17 @@
1
+ APScheduler
2
+ black
3
+ datasets
4
+ gradio
5
+ gradio[oauth]
6
+ gradio_leaderboard==0.0.9
7
+ gradio_client
8
+ huggingface-hub>=0.18.0
9
+ matplotlib
10
+ numpy
11
+ pandas
12
+ python-dateutil
13
+ tqdm
14
+ transformers
15
+ tokenizers>=0.15.0
16
+ sentencepiece
17
+ markdown
new/src/css_html_js.py ADDED
@@ -0,0 +1,101 @@
1
+ custom_css = """
2
+
3
+ .markdown-text {
4
+ font-size: 16px !important;
5
+ }
6
+
7
+ #models-to-add-text {
8
+ font-size: 18px !important;
9
+ }
10
+
11
+ #citation-button span {
12
+ font-size: 16px !important;
13
+ }
14
+
15
+ #citation-button textarea {
16
+ font-size: 16px !important;
17
+ }
18
+
19
+ #citation-button > label > button {
20
+ margin: 6px;
21
+ transform: scale(1.3);
22
+ }
23
+
24
+ #leaderboard-table {
25
+ margin-top: 15px
26
+ }
27
+
28
+ #leaderboard-table-lite {
29
+ margin-top: 15px
30
+ }
31
+
32
+ #search-bar-table-box > div:first-child {
33
+ background: none;
34
+ border: none;
35
+ }
36
+
37
+ #search-bar {
38
+ padding: 0px;
39
+ }
40
+
41
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
42
+ table td:first-child,
43
+ table th:first-child {
44
+ max-width: 400px;
45
+ overflow: auto;
46
+ white-space: nowrap;
47
+ }
48
+
49
+ .tab-buttons button {
50
+ font-size: 20px;
51
+ }
52
+
53
+ #scale-logo {
54
+ border-style: none !important;
55
+ box-shadow: none;
56
+ display: block;
57
+ margin-left: auto;
58
+ margin-right: auto;
59
+ max-width: 600px;
60
+ }
61
+
62
+ #scale-logo .download {
63
+ display: none;
64
+ }
65
+ #filter_type{
66
+ border: 0;
67
+ padding-left: 0;
68
+ padding-top: 0;
69
+ }
70
+ #filter_type label {
71
+ display: flex;
72
+ }
73
+ #filter_type label > span{
74
+ margin-top: var(--spacing-lg);
75
+ margin-right: 0.5em;
76
+ }
77
+ #filter_type label > .wrap{
78
+ width: 103px;
79
+ }
80
+ #filter_type label > .wrap .wrap-inner{
81
+ padding: 2px;
82
+ }
83
+ #filter_type label > .wrap .wrap-inner input{
84
+ width: 1px
85
+ }
86
+ #filter-columns-type{
87
+ border:0;
88
+ padding:0.5;
89
+ }
90
+ #filter-columns-size{
91
+ border:0;
92
+ padding:0.5;
93
+ }
94
+ #box-filter > .form{
95
+ border: 0
96
+ }
97
+
98
+ .svelte-1m1obck:nth-of-type(2) {
99
+ display: none !important;
100
+ }
101
+ """
new/src/envs.py ADDED
@@ -0,0 +1,38 @@
1
+ import os
2
+
3
+ from huggingface_hub import HfApi
4
+
5
+ # Info to change for your repository
6
+ # ----------------------------------
7
+ TOKEN = os.environ.get("TOKEN") # A read/write token for your org
8
+
9
+ OWNER = "ttsds" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
+ # ----------------------------------
11
+
12
+ REPO_ID = f"{OWNER}/leaderboard"
13
+ QUEUE_REPO = f"{OWNER}/requests"
14
+ RESULTS_REPO = f"{OWNER}/results"
15
+
16
+ # If you setup a cache later, just change HF_HOME
17
+ CACHE_PATH = os.getenv("HF_HOME", ".")
18
+
19
+ # Local caches
20
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
+ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
22
+ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
+ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
+
25
+ API = HfApi(token=TOKEN)
26
+
27
+ TAGS = [
28
+ "Normalizing Flow",
29
+ "Reference-based (Speaker)",
30
+ "Prompt-based (Speaker)",
31
+ "Prosodic Correlates",
32
+ "Adversarial",
33
+ "Diffusion",
34
+ "Audio Tokens",
35
+ "Autoregressive",
36
+ "Non-autoregressive",
37
+ "Pretrained Text Encoder",
38
+ ]
new/src/texts.py ADDED
@@ -0,0 +1,37 @@
1
+ LLM_BENCHMARKS_TEXT = f"""
2
+ ## How it works
3
+
4
+ ## Reproducibility
5
+ To reproduce our results, check out our repository [here](https://github.com/ttsds/ttsds).
6
+
7
+ """
8
+
9
+ EVALUATION_QUEUE_TEXT = """
10
+ ## How to submit a TTS model to the leaderboard
11
+
12
+ ### 1) download the evaluation dataset
13
+ The evaluation dataset consists of wav / text pairs.
14
+ You can download it [here](https://huggingface.co/ttsds/eval).
15
+
16
+ The format of the dataset is as follows:
17
+ ```
18
+ eval/
19
+ ├── 0001.wav
20
+ ├── 0001.txt
21
+ ├── 0002.wav
22
+ ├── 0002.txt
23
+ ├── ...
24
+ ```
25
+
26
+ ### 2) create your TTS dataset
27
+ Create a dataset with your TTS model and the evaluation dataset.
28
+ Use the wav files as speaker reference and the text as the prompt.
29
+ Create a .tar.gz file with the dataset, and make sure to include .wav files and .txt files.
30
+
31
+ ### 3) submit your TTS dataset
32
+ Submit your dataset below.
33
+ """
34
+
35
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
36
+ CITATION_BUTTON_TEXT = r"""
37
+ """
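To make step 2) of the submission instructions above concrete, here is a minimal packaging sketch; the `synth/` directory name is an assumption, the only stated requirement being matching `.wav`/`.txt` pairs inside a `.tar.gz` archive.

```python
# Hypothetical packaging step: collect matching .wav/.txt pairs from a local
# directory (name assumed) into the .tar.gz archive the submission form expects.
import tarfile
from pathlib import Path

synth_dir = Path("synth")  # e.g. synth/0001.wav, synth/0001.txt, ...

with tarfile.open("my_tts_system.tar.gz", "w:gz") as tar:
    for path in sorted(synth_dir.iterdir()):
        if path.suffix in {".wav", ".txt"}:
            tar.add(str(path), arcname=path.name)
```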
src/about.py CHANGED
@@ -1,23 +1,25 @@
1
  from dataclasses import dataclass
2
  from enum import Enum
3
 
 
4
  @dataclass
5
  class Task:
6
  benchmark: str
7
  metric: str
8
  col_name: str
 
9
 
10
 
11
  # Select your tasks here
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
- # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- task0 = Task("anli_r1", "acc", "ANLI")
16
- task1 = Task("logiqa", "acc_norm", "LogiQA")
17
 
18
- NUM_FEWSHOT = 0 # Change with your few shot
19
- # ---------------------------------------------------
20
 
 
 
21
 
22
 
23
  # Your leaderboard name
@@ -33,38 +35,24 @@ LLM_BENCHMARKS_TEXT = f"""
33
  ## How it works
34
 
35
  ## Reproducibility
36
- To reproduce our results, here is the commands you can run:
37
 
38
  """
39
 
40
  EVALUATION_QUEUE_TEXT = """
41
- ## Some good practices before submitting a model
42
-
43
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
44
- ```python
45
- from transformers import AutoConfig, AutoModel, AutoTokenizer
46
- config = AutoConfig.from_pretrained("your model name", revision=revision)
47
- model = AutoModel.from_pretrained("your model name", revision=revision)
48
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
49
- ```
50
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
51
-
52
- Note: make sure your model is public!
53
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
54
-
55
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
56
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
57
 
58
- ### 3) Make sure your model has an open license!
59
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
 
60
 
61
- ### 4) Fill up your model card
62
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
 
 
63
 
64
- ## In case of model failure
65
- If your model is displayed in the `FAILED` category, its execution stopped.
66
- Make sure you have followed the above steps first.
67
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
68
  """
69
 
70
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 
1
  from dataclasses import dataclass
2
  from enum import Enum
3
 
4
+
5
  @dataclass
6
  class Task:
7
  benchmark: str
8
  metric: str
9
  col_name: str
10
+ category: str
11
 
12
 
13
  # Select your tasks here
14
  # ---------------------------------------------------
15
  class Tasks(Enum):
16
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
17
+ task0 = Task("anli_r1", "acc", "ANLI", "")
18
+ task1 = Task("logiqa", "acc_norm", "LogiQA", "")
19
 
 
 
20
 
21
+ NUM_FEWSHOT = 0 # Change with your few shot
22
+ # ---------------------------------------------------
23
 
24
 
25
  # Your leaderboard name
 
35
  ## How it works
36
 
37
  ## Reproducibility
38
+ To reproduce our results, check out our repository [here](https://github.com/ttsds/ttsds).
39
 
40
  """
41
 
42
  EVALUATION_QUEUE_TEXT = """
43
+ ## How to submit a TTS model to the leaderboard
44
 
45
+ ### 1) download the evaluation dataset
46
+ The evaluation dataset consists of wav / text pairs.
47
+ You can download it [here](https://huggingface.co/ttsds/eval).
48
 
49
+ ### 2) create your TTS dataset
50
+ Create a dataset with your TTS model and the evaluation dataset.
51
+ Use the wav files as speaker reference and the text as the prompt.
52
+ Create a .tar.gz file with the dataset, and make sure to include .wav files and .txt files.
53
 
54
+ ### 3) submit your TTS dataset
55
+ Submit your dataset below.
 
 
56
  """
57
 
58
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
src/display/utils.py CHANGED
@@ -22,32 +22,22 @@ class ColumnContent:
22
  never_hidden: bool = False
23
 
24
 
25
- @dataclass
26
  class AutoEvalColumn:
27
- model_type_symbol = ColumnContent("model_type_symbol", "str", True, never_hidden=True)
28
  model = ColumnContent("model", "markdown", True, never_hidden=True)
29
  average = ColumnContent("average", "number", True)
30
- anli = ColumnContent("ANLI", "number", True)
31
- logiqa = ColumnContent("LogiQA", "number", True)
32
- model_type = ColumnContent("model_type", "str", False)
33
- architecture = ColumnContent("architecture", "str", False)
34
- weight_type = ColumnContent("weight_type", "str", False, True)
35
- precision = ColumnContent("precision", "str", False)
36
- license = ColumnContent("license", "str", False)
37
- params = ColumnContent("#Params (B)", "number", False)
38
- likes = ColumnContent("Hub ❤️", "number", False)
39
- still_on_hub = ColumnContent("Available on the hub", "bool", False)
40
- revision = ColumnContent("Model sha", "str", False, False)
41
 
42
 
43
  ## For the queue columns in the submission tab
44
  @dataclass(frozen=True)
45
  class EvalQueueColumn: # Queue column
46
  model = ColumnContent("model", "markdown", True)
47
- revision = ColumnContent("revision", "str", True)
48
- private = ColumnContent("private", "bool", True)
49
- precision = ColumnContent("precision", "str", True)
50
- weight_type = ColumnContent("weight_type", "str", "Original")
51
  status = ColumnContent("status", "str", True)
52
 
53
 
@@ -59,64 +49,10 @@ class ModelDetails:
59
  symbol: str = "" # emoji
60
 
61
 
62
- class ModelType(Enum):
63
- PT = ModelDetails(name="pretrained", symbol="🟢")
64
- FT = ModelDetails(name="fine-tuned", symbol="🔶")
65
- IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
66
- RL = ModelDetails(name="RL-tuned", symbol="🟦")
67
- Unknown = ModelDetails(name="", symbol="?")
68
-
69
- def to_str(self, separator=" "):
70
- return f"{self.value.symbol}{separator}{self.value.name}"
71
-
72
- @staticmethod
73
- def from_str(type):
74
- if "fine-tuned" in type or "🔶" in type:
75
- return ModelType.FT
76
- if "pretrained" in type or "🟢" in type:
77
- return ModelType.PT
78
- if "RL-tuned" in type or "🟦" in type:
79
- return ModelType.RL
80
- if "instruction-tuned" in type or "⭕" in type:
81
- return ModelType.IFT
82
- return ModelType.Unknown
83
-
84
-
85
- class WeightType(Enum):
86
- Adapter = ModelDetails("Adapter")
87
- Original = ModelDetails("Original")
88
- Delta = ModelDetails("Delta")
89
-
90
-
91
- class Precision(Enum):
92
- float16 = ModelDetails("float16")
93
- bfloat16 = ModelDetails("bfloat16")
94
- float32 = ModelDetails("float32")
95
- # qt_8bit = ModelDetails("8bit")
96
- # qt_4bit = ModelDetails("4bit")
97
- # qt_GPTQ = ModelDetails("GPTQ")
98
- Unknown = ModelDetails("?")
99
-
100
- def from_str(precision):
101
- if precision in ["torch.float16", "float16"]:
102
- return Precision.float16
103
- if precision in ["torch.bfloat16", "bfloat16"]:
104
- return Precision.bfloat16
105
- if precision in ["float32"]:
106
- return Precision.float32
107
- # if precision in ["8bit"]:
108
- # return Precision.qt_8bit
109
- # if precision in ["4bit"]:
110
- # return Precision.qt_4bit
111
- # if precision in ["GPTQ", "None"]:
112
- # return Precision.qt_GPTQ
113
- return Precision.Unknown
114
-
115
-
116
  # Column selection
117
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
118
 
119
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
120
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
121
 
122
- BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 
22
  never_hidden: bool = False
23
 
24
 
25
+ @dataclass(frozen=True)
26
  class AutoEvalColumn:
 
27
  model = ColumnContent("model", "markdown", True, never_hidden=True)
28
  average = ColumnContent("average", "number", True)
29
+ general = ColumnContent("general", "number", True)
30
+ speaker = ColumnContent("speaker", "number", True)
31
+ prosody = ColumnContent("prosody", "number", True)
32
+ intelligibility = ColumnContent("intelligibility", "number", True)
33
+ environment = ColumnContent("environment", "number", True)
34
+ tags = ColumnContent("tags", "str", False)
35
 
36
 
37
  ## For the queue columns in the submission tab
38
  @dataclass(frozen=True)
39
  class EvalQueueColumn: # Queue column
40
  model = ColumnContent("model", "markdown", True)
 
 
 
 
41
  status = ColumnContent("status", "str", True)
42
 
43
 
 
49
  symbol: str = "" # emoji
50
 
51
 
52
  # Column selection
53
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
54
 
55
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
56
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
57
 
58
+ BENCHMARK_COLS = ["general", "speaker", "prosody", "intelligibility", "environment"]
src/leaderboard/read_evals.py CHANGED
@@ -8,28 +8,16 @@ import dateutil
8
  import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
- from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
12
 
13
 
14
  @dataclass
15
  class EvalResult:
16
  """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
17
 
18
- eval_name: str # org_model_precision (uid)
19
- full_model: str # org/model (path on hub)
20
- org: str
21
- model: str
22
- revision: str # commit hash, "" if main
23
  results: dict
24
- precision: Precision = Precision.Unknown
25
- model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
26
- weight_type: WeightType = WeightType.Original # Original or Adapter
27
- architecture: str = "Unknown"
28
- license: str = "?"
29
- likes: int = 0
30
- num_params: int = 0
31
  date: str = "" # submission date of request file
32
- still_on_hub: bool = False
33
 
34
  @classmethod
35
  def init_from_json_file(self, json_filepath):
@@ -39,22 +27,8 @@ class EvalResult:
39
 
40
  config = data.get("config")
41
 
42
- # Precision
43
- precision = Precision.from_str(config.get("model_dtype"))
44
-
45
- # Get model and org
46
- org_and_model = config.get("model_name", config.get("model_args", None))
47
- org_and_model = org_and_model.split("/", 1)
48
-
49
- if len(org_and_model) == 1:
50
- org = None
51
- model = org_and_model[0]
52
- result_key = f"{model}_{precision.value.name}"
53
- else:
54
- org = org_and_model[0]
55
- model = org_and_model[1]
56
- result_key = f"{org}_{model}_{precision.value.name}"
57
- full_model = "/".join(org_and_model)
58
 
59
  # Extract results available in this file (some results are split in several files)
60
  results = {}
@@ -70,28 +44,19 @@ class EvalResult:
70
  results[task.benchmark] = mean_acc
71
 
72
  return self(
73
- eval_name=result_key,
74
- full_model=full_model,
75
- org=org,
76
- model=model,
77
  results=results,
78
- precision=precision,
79
- revision=config.get("model_sha", ""),
80
  )
81
 
82
  def update_with_request_file(self, requests_path):
83
  """Finds the relevant request file for the current model and updates info with it"""
84
- request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
85
 
86
  try:
87
  with open(request_file, "r") as f:
88
  request = json.load(f)
89
- self.model_type = ModelType.from_str(request.get("model_type", ""))
90
- self.weight_type = WeightType[request.get("weight_type", "Original")]
91
- self.license = request.get("license", "?")
92
- self.likes = request.get("likes", 0)
93
- self.num_params = request.get("params", 0)
94
- self.date = request.get("submitted_time", "")
95
  except Exception:
96
  print(
97
  f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
@@ -99,30 +64,11 @@ class EvalResult:
99
 
100
  def to_dict(self):
101
  """Converts the Eval Result to a dict compatible with our dataframe display"""
102
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
103
  data_dict = {
104
- "eval_name": self.eval_name, # not a column, just a save name,
105
- AutoEvalColumn.precision.name: self.precision.value.name,
106
- AutoEvalColumn.model_type.name: self.model_type.value.name,
107
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
108
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
109
- AutoEvalColumn.architecture.name: self.architecture,
110
- AutoEvalColumn.model.name: make_clickable_model(self.full_model),
111
- AutoEvalColumn.revision.name: self.revision,
112
- AutoEvalColumn.average.name: average,
113
- AutoEvalColumn.license.name: self.license,
114
- AutoEvalColumn.likes.name: self.likes,
115
- AutoEvalColumn.params.name: self.num_params,
116
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
117
- }
118
-
119
- for task in Tasks:
120
- data_dict[task.value.col_name] = self.results[task.value.benchmark]
121
 
122
- return data_dict
123
 
124
 
125
- def get_request_file_for_model(requests_path, model_name, precision):
126
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
127
  request_files = os.path.join(
128
  requests_path,
@@ -130,13 +76,13 @@ def get_request_file_for_model(requests_path, model_name, precision):
130
  )
131
  request_files = glob.glob(request_files)
132
 
133
- # Select correct request file (precision)
134
  request_file = ""
135
  request_files = sorted(request_files, reverse=True)
136
  for tmp_request_file in request_files:
137
  with open(tmp_request_file, "r") as f:
138
  req_content = json.load(f)
139
- if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
140
  request_file = tmp_request_file
141
  return request_file
142
 
 
8
  import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
+ from src.display.utils import AutoEvalColumn, Tasks
12
 
13
 
14
  @dataclass
15
  class EvalResult:
16
  """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
17
 
18
+ model_id: str
19
  results: dict
20
  date: str = "" # submission date of request file
 
21
 
22
  @classmethod
23
  def init_from_json_file(self, json_filepath):
 
27
 
28
  config = data.get("config")
29
 
30
+ # Extract model info
31
+ model = config.get("model_name", "")
32
 
33
  # Extract results available in this file (some results are split in several files)
34
  results = {}
 
44
  results[task.benchmark] = mean_acc
45
 
46
  return self(
47
+ model_id=model,
 
 
 
48
  results=results,
 
 
49
  )
50
 
51
  def update_with_request_file(self, requests_path):
52
  """Finds the relevant request file for the current model and updates info with it"""
53
+ request_file = get_request_file_for_model(requests_path, self.full_model)
54
 
55
  try:
56
  with open(request_file, "r") as f:
57
  request = json.load(f)
58
+ self.model_id = request.get("model", self.model_id)
59
+ self.results
60
  except Exception:
61
  print(
62
  f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
 
64
 
65
  def to_dict(self):
66
  """Converts the Eval Result to a dict compatible with our dataframe display"""
 
67
  data_dict = {
68
 
 
69
 
70
 
71
+ def get_request_file_for_model(requests_path, model_name):
72
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
73
  request_files = os.path.join(
74
  requests_path,
 
76
  )
77
  request_files = glob.glob(request_files)
78
 
79
+ # Select correct request file
80
  request_file = ""
81
  request_files = sorted(request_files, reverse=True)
82
  for tmp_request_file in request_files:
83
  with open(tmp_request_file, "r") as f:
84
  req_content = json.load(f)
85
+ if req_content["status"] in ["FINISHED"]:
86
  request_file = tmp_request_file
87
  return request_file
88
 
src/submission/submit.py CHANGED
@@ -1,6 +1,7 @@
1
  import json
2
  import os
3
  from datetime import datetime, timezone
 
4
 
5
  from src.display.formatting import styled_error, styled_message, styled_warning
6
  from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
@@ -13,7 +14,7 @@ USERS_TO_SUBMISSION_DATES = None
13
 
14
  def add_new_eval(
15
  model: str,
16
- revision: str,
17
  ):
18
  global REQUESTED_MODELS
19
  global USERS_TO_SUBMISSION_DATES
@@ -34,7 +35,6 @@ def add_new_eval(
34
 
35
  eval_entry = {
36
  "model": model,
37
- "revision": revision,
38
  "status": "PENDING",
39
  "submitted_time": current_time,
40
  "private": False,
@@ -47,7 +47,7 @@ def add_new_eval(
47
  print("Creating eval file")
48
  OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
49
  os.makedirs(OUT_DIR, exist_ok=True)
50
- out_path = f"{OUT_DIR}/{model_name}_eval_request_False_{precision}_{weight_type}.json"
51
 
52
  with open(out_path, "w") as f:
53
  f.write(json.dumps(eval_entry))
 
1
  import json
2
  import os
3
  from datetime import datetime, timezone
4
+ from typing import List
5
 
6
  from src.display.formatting import styled_error, styled_message, styled_warning
7
  from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
 
14
 
15
  def add_new_eval(
16
  model: str,
17
+ tags: List[str],
18
  ):
19
  global REQUESTED_MODELS
20
  global USERS_TO_SUBMISSION_DATES
 
35
 
36
  eval_entry = {
37
  "model": model,
 
38
  "status": "PENDING",
39
  "submitted_time": current_time,
40
  "private": False,
 
47
  print("Creating eval file")
48
  OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
49
  os.makedirs(OUT_DIR, exist_ok=True)
50
+ out_path = f"{OUT_DIR}/{model_name}_eval_request_False.json"
51
 
52
  with open(out_path, "w") as f:
53
  f.write(json.dumps(eval_entry))
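With the revision and precision arguments removed, calling the submission entrypoint reduces to a model name plus a list of tags. A hedged usage sketch (the model name is a placeholder, the tag values are taken from `TAGS` in `new/src/envs.py`, and the return value is assumed to be a styled status string from the imported formatting helpers):

```python
# Hedged usage sketch of the updated add_new_eval signature (model, tags).
from src.submission.submit import add_new_eval

status = add_new_eval(
    model="my-org/my-tts-model",               # placeholder model name
    tags=["Diffusion", "Non-autoregressive"],  # placeholder tags from envs.TAGS
)
print(status)  # assumed to be a styled success/error message
```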