support dynamic refresh page
#15
by AppleSwing · opened
- LICENSE +0 -201
- README.md +1 -1
- app.py +261 -296
- backend-cli.py +30 -121
- requirements.txt +3 -7
- src/backend/envs.py +1 -4
- src/backend/hflm_with_measurement.py +18 -167
- src/backend/manage_requests.py +5 -5
- src/backend/moe_infinity.py +4 -9
- src/backend/run_eval_suite.py +1 -9
- src/backend/tasks/arena_hard/__init__.py +0 -0
- src/backend/tasks/arena_hard/arena_hard.yaml +0 -2
- src/backend/tasks/arena_hard/arena_judgment.py +0 -256
- src/backend/tasks/arena_hard/arena_utils.py +0 -349
- src/backend/tasks/arena_hard/configs/api_config.yaml +0 -17
- src/backend/tasks/arena_hard/configs/judge_config.yaml +0 -26
- src/backend/tasks/arena_hard/model_answer/gpt-4-0314.jsonl +0 -0
- src/backend/tasks/arena_hard/question.jsonl +0 -0
- src/backend/tasks/arena_hard/task.py +0 -220
- src/backend/tasks/gsm8k/gsm8k-custom.yaml +0 -47
- src/backend/tasks/measurement_task_utils.py +0 -9
- src/backend/tasks/selfcheckgpt/task.py +2 -2
- src/display/about.py +2 -15
- src/display/imgs/Netmind.AI_LOGO.jpg +0 -0
- src/display/utils.py +37 -99
- src/leaderboard/read_evals.py +22 -45
- src/populate.py +7 -5
- src/submission/check_validity.py +2 -3
- src/submission/submit.py +3 -5
- src/utils.py +0 -177
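The diff below is long, but the heart of the change is small: `app.py` gains a set of `update_*` helpers, a `refresh_leaderboard()` callback, a `periodic_init()` loop that re-runs `init_space()` every 60 seconds from a daemon thread, and a Refresh button that pushes the latest dataframes into the UI. As a rough orientation before reading the diff, here is a minimal, self-contained sketch of that pattern; every name in it is illustrative and not taken from the repository:

```python
import random
import threading
import time

import gradio as gr
import pandas as pd

# Module-level state, refreshed in the background (mirrors the PR's use of globals).
df = pd.DataFrame({"model": ["a", "b"], "score": [0.1, 0.2]})

def fetch_data() -> pd.DataFrame:
    # Stand-in for init_space(): pretend new results arrived.
    return pd.DataFrame({"model": ["a", "b"], "score": [random.random(), random.random()]})

def poll_forever(interval: int = 60) -> None:
    # Background loop in the spirit of periodic_init().
    global df
    while True:
        time.sleep(interval)
        df = fetch_data()

def refresh():
    # One gr.update(...) per output component, in the same order as outputs=[...].
    return gr.update(value=df), gr.update(value=len(df))

with gr.Blocks() as demo:
    table = gr.Dataframe(value=df, interactive=False)
    count = gr.Number(value=len(df), label="Rows", interactive=False)
    refresh_button = gr.Button("Refresh")
    refresh_button.click(refresh, outputs=[table, count])

# Daemon thread keeps polling while the app serves requests.
threading.Thread(target=poll_forever, daemon=True).start()

if __name__ == "__main__":
    demo.queue().launch()
```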
LICENSE
DELETED
@@ -1,201 +0,0 @@
- [Entire text of the Apache License, Version 2.0 (January 2004, http://www.apache.org/licenses/) removed: 201 lines]
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: 🔥
 colorFrom: green
 colorTo: indigo
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.9.0
 app_file: app.py
 pinned: true
 license: apache-2.0
app.py
CHANGED
@@ -1,8 +1,8 @@
 #!/usr/bin/env python
+
 import os
 import datetime
 import socket
-import base64
 from threading import Thread
 
 import gradio as gr
@@ -11,7 +11,6 @@ import time
 from apscheduler.schedulers.background import BackgroundScheduler
 
 from huggingface_hub import snapshot_download
-from pytz import utc
 
 from src.display.about import (
     CITATION_BUTTON_LABEL,
@@ -22,7 +21,6 @@ from src.display.about import (
     LLM_BENCHMARKS_DETAILS,
     FAQ_TEXT,
     TITLE,
-    ACKNOWLEDGEMENT_TEXT,
 )
 
 from src.display.css_html_js import custom_css
@@ -39,7 +37,6 @@ from src.display.utils import (
     fields,
     WeightType,
     Precision,
-    GPUType
 )
 
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, \
@@ -76,7 +73,7 @@ def restart_space():
 
 
 def init_space():
-
+    dataset_df = get_dataset_summary_table(file_path="blog/Hallucination-Leaderboard-Summary.csv")
 
     if socket.gethostname() not in {"neuromancer"}:
         # sync model_type with open-llm-leaderboard
@@ -91,19 +88,7 @@ def init_space():
     finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
         EVAL_REQUESTS_PATH, EVAL_COLS
     )
-
-    return None, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
-
-
-def add_benchmark_columns(shown_columns):
-    benchmark_columns = []
-    for benchmark in BENCHMARK_COLS:
-        if benchmark in shown_columns:
-            for c in COLS:
-                if benchmark in c and benchmark != c:
-                    benchmark_columns.append(c)
-    return benchmark_columns
-
+    return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
 
 # Searching and filtering
 def update_table(
@@ -111,8 +96,7 @@ def update_table(
 ):
     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query)
     filtered_df = filter_queries(query, filtered_df)
-
-    df = select_columns(filtered_df, columns + benchmark_columns)
+    df = select_columns(filtered_df, columns)
     return df
 
 
@@ -160,7 +144,6 @@ def filter_models(df: pd.DataFrame, type_query: list, size_query: list, precisio
     type_emoji = [t[0] for t in type_query]
     filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
     filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.inference_framework.name].isin(size_query)]
 
     # numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
     # params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
@@ -173,176 +156,154 @@ shown_columns = None
 dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
 leaderboard_df = original_df.copy()
 
[… removed: the old module-level Gradio UI for this tab (the "Tasks" column selector, the leaderboard table built with benchmark_columns, the hidden search table, and the module-level search_bar.submit / demo.load / selector.change wiring); most of the removed lines are truncated in this view …]
+def update_leaderboard_table():
+    global leaderboard_df, shown_columns
+    print("Updating leaderboard table")
+    return leaderboard_df[
+        [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+        + shown_columns.value
+        + [AutoEvalColumn.dummy.name]
+    ] if not leaderboard_df.empty else leaderboard_df
+
+
+def update_hidden_leaderboard_table():
+    global original_df
+    return original_df[COLS] if original_df.empty is False else original_df
+
+def update_dataset_table():
+    global dataset_df
+    return dataset_df
+
+def update_finish_table():
+    global finished_eval_queue_df
+    return finished_eval_queue_df
+
+def update_running_table():
+    global running_eval_queue_df
+    return running_eval_queue_df
+
+def update_pending_table():
+    global pending_eval_queue_df
+    return pending_eval_queue_df
+
+def update_finish_num():
+    global finished_eval_queue_df
+    return len(finished_eval_queue_df)
+
+def update_running_num():
+    global running_eval_queue_df
+    return len(running_eval_queue_df)
+
+def update_pending_num():
+    global pending_eval_queue_df
+    return len(pending_eval_queue_df)
 
 # triggered only once at startup => read query parameter if it exists
 def load_query(request: gr.Request):
     query = request.query_params.get("query") or ""
     return query
+
+def refresh_leaderboard():
+    return gr.update(value=update_leaderboard_table()), gr.update(value=update_hidden_leaderboard_table()), \
+        gr.update(value=update_dataset_table()), gr.update(value=update_finish_table()), \
+        gr.update(value=update_running_table()), gr.update(value=update_pending_table()), \
+        gr.update(value=update_finish_num()), gr.update(value=update_running_num()), gr.update(value=update_pending_num())
+
+def periodic_init():
+    global dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, leaderboard_df
+    while True:
+        time.sleep(60)
+        dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
+        leaderboard_df = original_df.copy()
+
+def block_launch():
+    global dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, leaderboard_df, shown_columns
+    demo = gr.Blocks(css=custom_css)
+    with demo:
+        gr.HTML(TITLE)
+        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+        with gr.Tabs(elem_classes="tab-buttons") as tabs:
+            with gr.TabItem("MOE-LLM-GPU-Poor-Leaderboard Benchmark", elem_id="llm-benchmark-tab-table", id=0):
[… search bar, "Select columns to show" checkbox group, and checkbox-group filters for inference framework, model type, and precision, re-indented into block_launch(); added lines condensed …]
+                # breakpoint()
+                refresh_button = gr.Button("Refresh", visible=True)
+                leaderboard_table = gr.components.Dataframe(
+                    value=(
+                        leaderboard_df[
+                            [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+                            + shown_columns.value
+                            + [AutoEvalColumn.dummy.name]
+                        ]
+                        if leaderboard_df.empty is False
+                        else leaderboard_df
+                    ),
+                    headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                    datatype=TYPES,
+                    elem_id="leaderboard-table",
+                    interactive=False,
+                    visible=True,
+                )  # column_widths=["2%", "20%"]
+                # Dummy leaderboard for handling the case when the user uses backspace key
+                hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                    value=original_df[COLS] if original_df.empty is False else original_df,
+                    headers=COLS,
+                    datatype=TYPES,
+                    visible=False,
+                )
+                # refresh_button.click(fn=update_leaderboard_tables, outputs=[leaderboard_table, hidden_leaderboard_table_for_search])
+                search_bar.submit(
                     update_table,
                     [
                         hidden_leaderboard_table_for_search,
@@ -353,133 +314,137 @@ with demo:
                         search_bar,
                     ],
                     leaderboard_table,
-                    queue=True,
                 )
[… removed: the old evaluation-queue accordions (counts embedded in the accordion titles, row_count=5), the gpu_type dropdown (default "NVIDIA-A100-PCIe-80GB"), and the old submit-form layout; most of the removed lines are truncated in this view …]
+                # Check query parameter once at startup and update search bar
+                demo.load(load_query, inputs=[], outputs=[search_bar])
+                for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size]:
+                    selector.change(
+                        update_table,
+                        [
+                            hidden_leaderboard_table_for_search,
+                            shown_columns,
+                            filter_columns_type,
+                            filter_columns_precision,
+                            filter_columns_size,
+                            search_bar,
+                        ],
+                        leaderboard_table,
+                        queue=True,
+                    )
+            with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
+                gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+                dataset_table = gr.components.Dataframe(
+                    value=dataset_df,
+                    headers=list(dataset_df.columns),
+                    datatype=["str", "markdown", "str", "str", "str"],
+                    elem_id="dataset-table",
+                    interactive=False,
+                    visible=True,
+                    column_widths=["15%", "20%"],
+                )
+                gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
+                gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
+                # refresh_button.click(fn=update_dataset_table, outputs=[dataset_table])
+            with gr.TabItem("Submit a model ", elem_id="llm-benchmark-tab-table", id=3):
                 with gr.Column():
+                    with gr.Row():
+                        gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+                    with gr.Column():
+                        with gr.Accordion(f"✅ Finished Evaluations", open=False):
+                            with gr.Column():
+                                num_fin = gr.Number(len(finished_eval_queue_df), label="Number of finished evaluations", visible=True, interactive=False)
+                            with gr.Row():
+                                finished_eval_table = gr.components.Dataframe(
+                                    value=finished_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=6
+                                )
+                        with gr.Accordion(f"🔄 Running Evaluation Queue", open=False):
+                            with gr.Column():
+                                num_run = gr.Number(len(running_eval_queue_df), label="Number of running evaluations", visible=True, interactive=False)
+                            with gr.Row():
+                                running_eval_table = gr.components.Dataframe(
+                                    value=running_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=6
+                                )
+                        with gr.Accordion(f"⏳ Scheduled Evaluation Queue", open=False):
+                            with gr.Column():
+                                num_sche = gr.Number(len(pending_eval_queue_df), label="Number of scheduled evaluations", visible=True, interactive=False)
+                            with gr.Row():
+                                pending_eval_table = gr.components.Dataframe(
+                                    value=pending_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=6
+                                )
+                    # refresh_button.click(fn=update_submit_tables,
+                    #                      outputs=[finished_eval_table, running_eval_table, pending_eval_table])
+                    with gr.Row():
+                        gr.Markdown("# Submit your model here", elem_classes="markdown-text")
+                    with gr.Row():
+                        inference_framework = gr.Dropdown(
+                            choices=[t.to_str() for t in InferenceFramework],
+                            label="Inference framework",
                             multiselect=False,
                             value=None,
                             interactive=True,
                         )
[… model name / revision commit / private checkbox / model type dropdown, precision (default "float32") and weights type (default "Original") dropdowns, base model textbox, the "Submit Eval" button, a hidden debug checkbox, and the submit_button.click(add_new_eval, [...], submission_result) wiring; added lines condensed …]
+            refresh_button.click(refresh_leaderboard,
+                                 outputs=[leaderboard_table, hidden_leaderboard_table_for_search, dataset_table,
+                                          finished_eval_table, running_eval_table, pending_eval_table, num_fin, num_run, num_sche])
+
+        with gr.Row():
+            with gr.Accordion("Citing this leaderboard", open=False):
+                citation_button = gr.Textbox(
+                    value=CITATION_BUTTON_TEXT,
+                    label=CITATION_BUTTON_LABEL,
+                    lines=20,
+                    elem_id="citation-button",
+                    show_copy_button=True,
+                )
+    demo.queue(default_concurrency_limit=40).launch()
+
+scheduler = BackgroundScheduler()
 
 scheduler.add_job(restart_space, "interval", hours=6)
 
@@ -490,9 +455,9 @@ def launch_backend():
     if DEVICE not in {"cpu"}:
         _ = subprocess.run(["python", "backend-cli.py"])
 
-
+Thread(target=periodic_init, daemon=True).start()
 # scheduler.add_job(launch_backend, "interval", seconds=120)
 if __name__ == "__main__":
     scheduler.start()
-
+    block_launch()
 
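One detail for reviewers: `periodic_init()` rewrites the module-level dataframes from a plain daemon thread while the refresh callbacks read them, so a refresh can observe a partially updated set of frames. A hedged sketch of the same loop with an explicit lock; the lock and all names below are my illustration, not part of this PR:

```python
import threading
import time

_lock = threading.Lock()   # not in the PR; shown here only to illustrate atomic swaps
_state = {}                # stands in for the module-level dataframes

def periodic_init_sketch(fetch, interval=60):
    # Background loop in the spirit of periodic_init(): refetch, then swap under the lock.
    while True:
        time.sleep(interval)
        fresh = fetch()
        with _lock:
            _state.update(fresh)

def read_state():
    # What a refresh callback would call instead of touching globals directly.
    with _lock:
        return dict(_state)

if __name__ == "__main__":
    threading.Thread(target=periodic_init_sketch, args=(lambda: {"t": time.time()}, 1), daemon=True).start()
    time.sleep(2.5)
    print(read_state())
```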
backend-cli.py
CHANGED
@@ -6,7 +6,6 @@ import argparse
 
 import socket
 import random
-import threading
 from datetime import datetime
 
 from src.backend.run_eval_suite import run_evaluation
@@ -16,20 +15,18 @@ from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PAT
 from src.backend.manage_requests import EvalRequest
 from src.leaderboard.read_evals import EvalResult
 
-from src.envs import QUEUE_REPO, RESULTS_REPO, API
-from src.utils import my_snapshot_download
+from src.envs import QUEUE_REPO, RESULTS_REPO, API
+from src.utils import my_snapshot_download
 
 from src.leaderboard.read_evals import get_raw_eval_results
 
 from typing import Optional
-
+
 import time
 
 import pprint
 import logging
 
-from lm_eval.filters.extraction import RegexFilter
-
 
 # Configure the root logger
 logging.basicConfig(
@@ -44,20 +41,6 @@ eval_logger = logging.getLogger("lm-eval")
 # Explicitly set the level for 'lm-eval' logger to WARNING
 eval_logger.setLevel(logging.WARNING)
 
-def tuple_input_decorator(func):
-    def wrapper(self, resps, docs):
-        stripped_resps = [[resp_data[0] for resp_data in group] for group in resps]
-
-        filtered_resps = func(self, stripped_resps, docs)
-
-        combined_resps = []
-        for original_group, new_group in zip(resps, filtered_resps):
-            combined_group = [(new_resp,) + rest_of_data[1:] for new_resp, rest_of_data in zip(new_group, original_group)]
-            combined_resps.append(combined_group)
-
-        return combined_resps
-    return wrapper
-
 
 def my_set_eval_request(api, eval_request, set_to_status, hf_repo, local_dir):
     for i in range(10):
@@ -140,23 +123,7 @@ def request_to_result_name(request: EvalRequest) -> str:
 
 
 def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[int] = None) -> dict:
-    batch_size =
-    batch_size = eval_request.batch_size
-
-    init_gpu_info = analyze_gpu_stats(parse_nvidia_smi())
-    # if init_gpu_info['Mem(M)'] > 500:
-    #     assert False, f"This machine is not empty: {init_gpu_info}"
-    gpu_stats_list = []
-    stop_event = threading.Event()
-    monitor_thread = threading.Thread(target=monitor_gpus, args=(stop_event, 5, gpu_stats_list))
-    monitor_thread.start()
-
-    original_apply = RegexFilter.apply
-    if task.benchmark in ["gsm8k", "gsm8k_cot", "gsm8k_cot_self_consistency", "gsm8k_custom"]:
-        RegexFilter.apply = tuple_input_decorator(RegexFilter.apply)
-    else:
-        RegexFilter.apply = original_apply
-
+    batch_size = 4
     try:
         results = run_evaluation(
             eval_request=eval_request,
@@ -183,20 +150,6 @@ def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[in
         raise
 
     # print("RESULTS", results)
-    stop_event.set()
-    monitor_thread.join()
-    gpu_info = analyze_gpu_stats(gpu_stats_list)
-    for task_name in results['results'].keys():
-        for key, value in gpu_info.items():
-            if "GPU" not in key:
-                results['results'][task_name][f"{key},none"] = int(value)
-            else:
-                results['results'][task_name][f"{key},none"] = value
-
-        results['results'][task_name]['batch_size,none'] = batch_size
-        results['results'][task_name]['precision,none'] = eval_request.precision
-    print(f"gpu_stats_list: {gpu_stats_list}")
-    print("GPU Usage:", gpu_info)
 
     dumped = json.dumps(results, indent=2, default=lambda o: "<not serializable>")
     # print(dumped)
@@ -217,8 +170,6 @@ def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[in
         repo_id=RESULTS_REPO,
         repo_type="dataset",
     )
-
-    RegexFilter.apply = original_apply
     return results
 
 
@@ -387,9 +338,10 @@ def maybe_refresh_results(thr: int, hard_task_lst: Optional[list[str]] = None) -
 
     return False
 
+
 def process_pending_requests() -> bool:
     sanity_checks()
-
+
     current_pending_status = [PENDING_STATUS]
 
     # Get all eval request that are PENDING, if you want to run other evals, change this parameter
@@ -408,12 +360,6 @@ def process_pending_requests() -> bool:
 
     eval_request = eval_requests[0]
     pp.pprint(eval_request)
-
-    gpu_type = eval_request.gpu_type
-    curr_gpu_type = get_gpu_details()
-    if gpu_type != curr_gpu_type:
-        print(f"GPU type mismatch: {gpu_type} vs {curr_gpu_type}")
-        return False
 
     my_snapshot_download(
         repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
@@ -450,15 +396,11 @@ def get_args():
     parser = argparse.ArgumentParser(description="Run the backend")
     parser.add_argument("--debug", action="store_true", help="Run in debug mode")
     # debug parameters
-    parser.add_argument("--task", type=str, default="selfcheckgpt
-    parser.add_argument("--model", type=str, default="
-    parser.add_argument("--precision", type=str, default="
+    parser.add_argument("--task", type=str, default="selfcheckgpt", help="Task to debug")
+    parser.add_argument("--model", type=str, default="facebook/opt-1.3b", help="Model to debug")
+    parser.add_argument("--precision", type=str, default="float16", help="Precision to debug")
     parser.add_argument("--inference-framework", type=str, default="hf-chat", help="Inference framework to debug")
    parser.add_argument("--limit", type=int, default=None, help="Limit for the number of samples")
-    parser.add_argument("--gpu-type", type=str, default="NVIDIA-A100-PCIe-80GB",
-                        help="GPU type. NVIDIA-A100-PCIe-80GB; NVIDIA-RTX-A5000-24GB; NVIDIA-H100-PCIe-80GB")
-    parser.add_argument("--debug_repo", action="store_true", help="Use debug repo")
-    parser.add_argument("--model_type", type=str, default="chat", help="Model type")
     return parser.parse_args()
 
 
@@ -466,76 +408,43 @@ if __name__ == "__main__":
     args = get_args()
     local_debug = args.debug
     # debug specific task by ping
[… removed: the old debug branch (comma-separated --model/--task/--precision lists, a GPU-type check against get_gpu_details() that raised "GPU type mismatch", model_type=args.model_type), the args.debug_repo branch that switched to DEBUG_QUEUE_REPO / DEBUG_RESULTS_REPO, the separate non-debug polling loop, and the final "Cannot use debug_repo without local debug flag" error; many of these lines are truncated in this view …]
+    if local_debug:
+        debug_model_names = [args.model]  # Use model from arguments
+        debug_task_name = args.task  # Use task from arguments
         task_lst = TASKS_HARNESS.copy()
+        for task in task_lst:
             for debug_model_name in debug_model_names:
+                task_name = task.benchmark
+                if task_name != debug_task_name:
+                    continue
+                eval_request = EvalRequest(
+                    model=debug_model_name,
+                    private=False,
+                    status="",
+                    json_filepath="",
+                    precision=args.precision,  # Use precision from arguments
+                    inference_framework=args.inference_framework  # Use inference framework from arguments
+                )
+                results = process_evaluation(task, eval_request, limit=args.limit)
+    else:
         while True:
             res = False
+
             # if random.randint(0, 10) == 0:
             res = process_pending_requests()
             print(f"waiting for 60 seconds")
             time.sleep(60)
+
             # if res is False:
             #     if random.randint(0, 5) == 0:
             #         res = maybe_refresh_results(100)
             #     else:
             #         res = process_finished_requests(100)
+
            #     time.sleep(60)
+
             # if res is False:
             #     if random.randint(0, 5) == 0:
             #         res = maybe_refresh_results(0)
             #     else:
             #         res = process_finished_requests(0)
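The rewritten debug path above boils down to: match `--task` against the harness task list, build a single `EvalRequest` from the CLI arguments, and run it once through `process_evaluation`. A compact, self-contained sketch of that dispatch; the `Task`, `EvalRequest`, and `process_evaluation` stand-ins below are illustrative, not the repo's real classes:

```python
import argparse
from dataclasses import dataclass
from typing import Optional

@dataclass
class Task:
    benchmark: str

@dataclass
class EvalRequest:
    model: str
    precision: str
    inference_framework: str

# Stand-in for TASKS_HARNESS in src/backend/envs.py.
TASKS = [Task("selfcheckgpt"), Task("mmlu")]

def process_evaluation(task: Task, req: EvalRequest, limit: Optional[int] = None) -> None:
    # Placeholder for the real evaluation call.
    print(f"running {task.benchmark} on {req.model} "
          f"({req.precision}, {req.inference_framework}), limit={limit}")

def main() -> None:
    p = argparse.ArgumentParser()
    p.add_argument("--debug", action="store_true")
    p.add_argument("--task", default="selfcheckgpt")
    p.add_argument("--model", default="facebook/opt-1.3b")
    p.add_argument("--precision", default="float16")
    p.add_argument("--inference-framework", default="hf-chat")
    p.add_argument("--limit", type=int, default=None)
    args = p.parse_args()
    if args.debug:
        for task in TASKS:
            if task.benchmark != args.task:
                continue
            req = EvalRequest(args.model, args.precision, args.inference_framework)
            process_evaluation(task, req, limit=args.limit)

if __name__ == "__main__":
    main()
```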
requirements.txt
CHANGED
@@ -4,7 +4,7 @@ APScheduler
 black
 click
 datasets
-gradio
+gradio
 gradio_client
 huggingface-hub
 matplotlib
@@ -16,7 +16,7 @@ requests
 semantic-version
 tqdm
 wandb
-transformers
+transformers>=4.36.0
 tokenizers>=0.15.0
 lm_eval[ifeval] @ git+https://github.com/EleutherAI/[email protected]
 accelerate
@@ -27,10 +27,6 @@ cchardet
 rouge_score
 bert-score
 evaluate
-spacy
+spacy
 selfcheckgpt
 immutabledict
-gputil
-bitsandbytes
-openai
-scikit-learn
src/backend/envs.py
CHANGED
@@ -57,13 +57,10 @@ class Tasks(Enum):
 
     # task20 = Task("race", "acc", "RACE", 0)
     task21 = Task("mmlu", "acc", "MMLU", 5)
-    task22 = Task("gsm8k_custom", "em", "GSM8K", 5)
-    # task23 = Task("gsm8k_cot", "em", "GSM8K", 8)
-    task24 = Task("arena_hard", "score", "Arena Hard", 0)
 
 
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 EVAL_REQUESTS_PATH_BACKEND_SYNC = os.path.join(CACHE_PATH, "eval-queue-bk-sync")
 EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
-DEVICE = "cuda
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
src/backend/hflm_with_measurement.py
CHANGED
@@ -24,7 +24,7 @@ from transformers.models.auto.modeling_auto import (
     MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
 )
 from transformers import TextStreamer
-
+
 from lm_eval import utils
 from lm_eval.api.instance import Instance
 from lm_eval.api.model import TemplateLM
@@ -37,9 +37,6 @@ from lm_eval.models.utils import (
     stop_sequences_criteria,
 )
 from lm_eval.models.huggingface import HFLM
-from src.utils import get_gpu_details, get_peak_bw, transfer_precision2bytes, get_peak_flops
-from src.submission.check_validity import get_model_size
-from src.envs import API
 
 
 class StopWatch(TextStreamer):
@@ -60,31 +57,16 @@ class StopWatch(TextStreamer):
         self.start_decoding = time()
         self.decoding_iterations += 1
         return
-
     def end(self):
         if self.decoding_time is None and self.start_decoding is not None:
             self.decoding_time = time() - self.start_decoding
         return
-
 
 class HFLMWithMeasurement(HFLM):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        self.pretrained = kwargs.get("pretrained", None)
-        self.revision = kwargs.get("revision", None)
-        self.precision = kwargs.get("dtype", None)
-        self.num_gpus = None
-
-    def _detect_num_gpus_used(self):
-        if self.num_gpus is not None:
-            return self.num_gpus
-        gpus = []
-        for p in self.model.parameters():
-            if p.device.type == "cuda":
-                gpus.append(p.device.index)
-
-        self.num_gpus = len(set(gpus))
-        return self.num_gpus
 
     def _loglikelihood_tokens(
         self,
@@ -297,7 +279,7 @@ class HFLMWithMeasurement(HFLM):
             # Answer: (log prob, is-exact-match)
             answer = (float(logits.sum()), bool(max_equal))
 
-            res.append((answer, per_sample_time, 0, 0
 
             self.cache_hook.add_partial("loglikelihood", request_str, answer)
             pbar.update(1)
@@ -305,16 +287,14 @@ class HFLMWithMeasurement(HFLM):
         pbar.close()
 
         return re_ord.get_original(res)
-
-    def _model_generate(self, context,
         # temperature = 0.0 if not set
         # if do_sample is false and temp==0.0:
         # remove temperature, as do_sample=False takes care of this
         # and we don't want a warning from HF
         generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
         do_sample = generation_kwargs.get("do_sample", None)
-
-        # is_gsm8k = generation_kwargs.get("is_gsm8k", False)
 
         # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
         if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
@@ -322,52 +302,7 @@ class HFLMWithMeasurement(HFLM):
 
         if do_sample is False and generation_kwargs.get("temperature") == 0.0:
             generation_kwargs.pop("temperature")
-
-        # if is_gsm8k:
-        #     generation_kwargs.pop("is_gsm8k")
-
-        context_length = context.shape[1]
-
-        if self.model.__class__.__name__ == "MoE":
-            model_config = self.model.model.config
-        else:
-            model_config = self.model.config
-
-        if not self.precision:
-            if model_config.quantization_config._load_in_4bit:
-                self.precision = "4bit"
-            elif model_config.quantization_config._load_in_8bit:
-                self.precision = "8bit"
-            else:
-                raise ValueError("Unknown precision")
-
-        # print(self.model)
-        linear_count = 0
-        element_wise_mul = 0
-        for name, module in self.model.named_modules():
-            if ('layers.0.' in name or "transformer.blocks.0" in name) and ('attn' not in name):
-                if 'experts.0.' in name or "ffn.experts" in name:
-                    if "linear_v" in name:
-                        element_wise_mul = 1
-                if isinstance(module, torch.nn.Linear):
-                    # print(name, module)
-                    linear_count += 1
-                elif isinstance(module, DbrxExpertGLU):
-                    linear_count = 3
-                    element_wise_mul = 1
-            # elif 'experts' not in name:
-            #     if ("gate" not in name and "router" not in name) or "gate_proj" in name:
-            #         if "gate_proj" in name:
-            #             element_wise_mul = 1
-            #         if isinstance(module, torch.nn.Linear):
-            #             # print(name, module)
-            #             linear_count += 1
            else:
                 continue
-        print(f"linear_count: {linear_count}")
-        print(f"element_wise_mul: {element_wise_mul}")
-        print(f"GPU usage: {self._detect_num_gpus_used()}")
-
         stopping_criteria = stop_sequences_criteria(
             self.tokenizer, stop, context.shape[1], context.shape[0]
         )
@@ -375,7 +310,7 @@ class HFLMWithMeasurement(HFLM):
         start = time()
         res = self.model.generate(
             input_ids=context,
-
             stopping_criteria=stopping_criteria,
             pad_token_id=self.tokenizer.pad_token_id,
             use_cache=True,
@@ -383,86 +318,15 @@ class HFLMWithMeasurement(HFLM):
             **generation_kwargs,
         )
         end = time()
-
         batch_size = context.shape[0]
         output_length = stop_watch.decoding_iterations
-
-        precision_bytes = transfer_precision2bytes(self.precision)
-
-        model_size_param = sum(p.numel() for p in self.model.parameters())
-
-        n_layers = model_config.num_hidden_layers if hasattr(model_config, "num_hidden_layers") else \
-            (model_config.num_layers if hasattr(model_config, "num_layers") else model_config.n_layers)
-
-        d_model = model_config.hidden_size if hasattr(model_config, "hidden_size") else model_config.d_model
-
-        if hasattr(model_config, "num_experts_per_tok"):
-            n_experts_per_tok = model_config.num_experts_per_tok
-        elif hasattr(model_config, "num_selected_experts"):
-            n_experts_per_tok = model_config.num_selected_experts
-        elif hasattr(model_config, "ffn_config"):
-            n_experts_per_tok = model_config.ffn_config.moe_top_k
-        else:
-            n_experts_per_tok = 1
-
-        if hasattr(model_config, "ffn_dim"):
|
409 |
-
d_ff = model_config.ffn_dim
|
410 |
-
elif hasattr(model_config, "intermediate_size"):
|
411 |
-
d_ff = model_config.intermediate_size
|
412 |
-
elif hasattr(model_config, "d_ff"):
|
413 |
-
d_ff = model_config.d_ff
|
414 |
-
elif hasattr(model_config, "ff_ratio"):
|
415 |
-
d_ff = d_model * model_config.ff_ratio
|
416 |
-
elif hasattr(model_config, "ffn_config"):
|
417 |
-
d_ff = model_config.ffn_config.ffn_hidden_size
|
418 |
-
else:
|
419 |
-
raise ValueError("Unknown FFN dimension")
|
420 |
-
|
421 |
-
if hasattr(model_config, "num_local_experts"):
|
422 |
-
num_experts = model_config.num_local_experts
|
423 |
-
elif hasattr(model_config, "num_experts"):
|
424 |
-
num_experts = model_config.num_experts
|
425 |
-
elif hasattr(model_config, "ffn_config"):
|
426 |
-
num_experts = model_config.ffn_config.moe_num_experts
|
427 |
-
else:
|
428 |
-
num_experts = 1
|
429 |
-
|
430 |
-
ffn_params = n_layers * d_ff * linear_count * d_model
|
431 |
-
|
432 |
-
shared_params = model_size_param - num_experts * ffn_params
|
433 |
-
|
434 |
-
model_size = shared_params + n_experts_per_tok * ffn_params
|
435 |
-
|
436 |
-
per_token_kv_size = 2 * n_layers * d_model * precision_bytes
|
437 |
-
|
438 |
-
peak_bw_single = get_peak_bw(get_gpu_details())
|
439 |
-
peak_bw = peak_bw_single * self._detect_num_gpus_used()
|
440 |
-
|
441 |
-
context_prefill_size = context_length
|
442 |
-
kv_size = context_prefill_size * per_token_kv_size + (output_length - 1) * per_token_kv_size / 2
|
443 |
-
|
444 |
-
kv_size = kv_size / 1e9
|
445 |
-
|
446 |
-
n_vocab = model_config.vocab_size
|
447 |
|
448 |
end_to_end_time = (end - start) / batch_size
|
449 |
prefilling_time = stop_watch.prefilling_time / batch_size
|
450 |
decoding_time = stop_watch.decoding_time / batch_size
|
451 |
token_per_sec = output_length / decoding_time
|
452 |
-
|
453 |
-
|
454 |
-
avg_context_length = context_length + (output_length - 1) / 2
|
455 |
-
flops_per_token = 2 * model_size + ((linear_count + element_wise_mul) * n_layers * avg_context_length * d_model) + 4 * d_model + 2 * d_model * n_vocab
|
456 |
-
peak_flops_single = get_peak_flops(get_gpu_details(), self.precision)
|
457 |
-
peak_flops = peak_flops_single * self._detect_num_gpus_used()
|
458 |
-
|
459 |
-
## TODO only support llama-type decoder only models and moe models of switch transformer and mixtrial
|
460 |
-
mfu = token_per_sec * flops_per_token / peak_flops
|
461 |
-
mbu = achieve_mem_bw / peak_bw
|
462 |
-
|
463 |
-
print(f"mfu: {mfu}, mbu: {mbu}")
|
464 |
-
|
465 |
-
return res, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu
|
466 |
|
467 |
def generate_until(
|
468 |
self, requests: List[Instance], disable_tqdm: bool = False
|
@@ -539,19 +403,11 @@ class HFLMWithMeasurement(HFLM):
|
|
539 |
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
|
540 |
)
|
541 |
# add EOS token to stop sequences
|
542 |
-
eos =
|
543 |
if not until:
|
544 |
until = [eos]
|
545 |
else:
|
546 |
until.append(eos)
|
547 |
-
|
548 |
-
# is_gsm8k = kwargs.get("is_gsm8k", False)
|
549 |
-
# if is_gsm8k:
|
550 |
-
# until = ["Question:", "Question", "</s>"]
|
551 |
-
# eos_ids = [self.tokenizer.eos_token_id,
|
552 |
-
# self.tokenizer.convert_tokens_to_ids("<|eot_id|>")]
|
553 |
-
|
554 |
-
|
555 |
if "max_gen_toks" in kwargs.keys():
|
556 |
max_gen_toks = kwargs.pop("max_gen_toks")
|
557 |
else:
|
@@ -571,16 +427,14 @@ class HFLMWithMeasurement(HFLM):
|
|
571 |
left_truncate_len=max_ctx_len,
|
572 |
truncation=self.truncation,
|
573 |
)
|
574 |
-
|
575 |
-
# print("context: ", self.tok_decode(context_enc[0]))
|
576 |
context_enc = context_enc.to(self.device)
|
577 |
attn_masks = attn_masks.to(self.device)
|
578 |
|
579 |
-
if "
|
580 |
-
kwargs["
|
581 |
|
582 |
# perform batched generation
|
583 |
-
cont, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu = self._model_generate(
|
584 |
context=context_enc,
|
585 |
attention_mask=attn_masks,
|
586 |
stop=until,
|
@@ -591,21 +445,18 @@ class HFLMWithMeasurement(HFLM):
|
|
591 |
for cont_toks, context in zip(cont_toks_list, contexts):
|
592 |
# discard context + left-padding toks if using causal decoder-only LM
|
593 |
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
|
594 |
-
# print("After Generation: ", self.tok_decode(cont_toks))
|
595 |
cont_toks = cont_toks[context_enc.shape[1] :]
|
596 |
-
|
597 |
s = self.tok_decode(cont_toks)
|
598 |
|
599 |
-
# use secondary stop seqs to cut off should-have-been-stopped content post-hoc
|
600 |
-
# if not is_gsm8k:
|
601 |
for term in until:
|
602 |
if len(term) > 0:
|
603 |
# ignore '' separator,
|
604 |
# for seq2seq case where self.tok_decode(self.eot_token_id) = ''
|
605 |
s = s.split(term)[0]
|
606 |
-
|
607 |
-
|
608 |
-
res.append((s, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu))
|
609 |
|
610 |
self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
|
611 |
pbar.update(1)
|
|
|
24 |
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
|
25 |
)
|
26 |
from transformers import TextStreamer
|
27 |
+
|
28 |
from lm_eval import utils
|
29 |
from lm_eval.api.instance import Instance
|
30 |
from lm_eval.api.model import TemplateLM
|
|
|
37 |
stop_sequences_criteria,
|
38 |
)
|
39 |
from lm_eval.models.huggingface import HFLM
|
40 |
|
41 |
|
42 |
class StopWatch(TextStreamer):
|
|
|
57 |
self.start_decoding = time()
|
58 |
self.decoding_iterations += 1
|
59 |
return
|
60 |
+
|
61 |
def end(self):
|
62 |
if self.decoding_time is None and self.start_decoding is not None:
|
63 |
self.decoding_time = time() - self.start_decoding
|
64 |
return
|
65 |
+
|
66 |
|
67 |
class HFLMWithMeasurement(HFLM):
|
68 |
def __init__(self, **kwargs):
|
69 |
super().__init__(**kwargs)
|
|
|
|
|
|
70 |
|
71 |
def _loglikelihood_tokens(
|
72 |
self,
|
|
|
279 |
# Answer: (log prob, is-exact-match)
|
280 |
answer = (float(logits.sum()), bool(max_equal))
|
281 |
|
282 |
+
res.append((answer, per_sample_time, 0, 0))
|
283 |
|
284 |
self.cache_hook.add_partial("loglikelihood", request_str, answer)
|
285 |
pbar.update(1)
|
|
|
287 |
pbar.close()
|
288 |
|
289 |
return re_ord.get_original(res)
|
290 |
+
|
291 |
+
def _model_generate(self, context, max_length, stop, **generation_kwargs):
|
292 |
# temperature = 0.0 if not set
|
293 |
# if do_sample is false and temp==0.0:
|
294 |
# remove temperature, as do_sample=False takes care of this
|
295 |
# and we don't want a warning from HF
|
296 |
generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
|
297 |
do_sample = generation_kwargs.get("do_sample", None)
|
|
|
|
|
298 |
|
299 |
# The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
|
300 |
if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
|
|
|
302 |
|
303 |
if do_sample is False and generation_kwargs.get("temperature") == 0.0:
|
304 |
generation_kwargs.pop("temperature")
|
305 |
+
# build stopping criteria
|
|
|
|
|
|
306 |
stopping_criteria = stop_sequences_criteria(
|
307 |
self.tokenizer, stop, context.shape[1], context.shape[0]
|
308 |
)
|
|
|
310 |
start = time()
|
311 |
res = self.model.generate(
|
312 |
input_ids=context,
|
313 |
+
max_length=max_length,
|
314 |
stopping_criteria=stopping_criteria,
|
315 |
pad_token_id=self.tokenizer.pad_token_id,
|
316 |
use_cache=True,
|
|
|
318 |
**generation_kwargs,
|
319 |
)
|
320 |
end = time()
|
321 |
+
|
322 |
batch_size = context.shape[0]
|
323 |
output_length = stop_watch.decoding_iterations
|
|
|
|
|
|
324 |
|
325 |
end_to_end_time = (end - start) / batch_size
|
326 |
prefilling_time = stop_watch.prefilling_time / batch_size
|
327 |
decoding_time = stop_watch.decoding_time / batch_size
|
328 |
token_per_sec = output_length / decoding_time
|
329 |
+
return res, end_to_end_time, prefilling_time, token_per_sec
|
|
|
|
|
330 |
|
331 |
def generate_until(
|
332 |
self, requests: List[Instance], disable_tqdm: bool = False
|
|
|
403 |
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
|
404 |
)
|
405 |
# add EOS token to stop sequences
|
406 |
+
eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False)
|
407 |
if not until:
|
408 |
until = [eos]
|
409 |
else:
|
410 |
until.append(eos)
|
|
|
|
|
|
411 |
if "max_gen_toks" in kwargs.keys():
|
412 |
max_gen_toks = kwargs.pop("max_gen_toks")
|
413 |
else:
|
|
|
427 |
left_truncate_len=max_ctx_len,
|
428 |
truncation=self.truncation,
|
429 |
)
|
|
|
|
|
430 |
context_enc = context_enc.to(self.device)
|
431 |
attn_masks = attn_masks.to(self.device)
|
432 |
|
433 |
+
if "max_length" not in kwargs:
|
434 |
+
kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
|
435 |
|
436 |
# perform batched generation
|
437 |
+
cont, end_to_end_time, prefilling_time, token_per_sec = self._model_generate(
|
438 |
context=context_enc,
|
439 |
attention_mask=attn_masks,
|
440 |
stop=until,
|
|
|
445 |
for cont_toks, context in zip(cont_toks_list, contexts):
|
446 |
# discard context + left-padding toks if using causal decoder-only LM
|
447 |
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
|
|
|
448 |
cont_toks = cont_toks[context_enc.shape[1] :]
|
449 |
+
|
450 |
s = self.tok_decode(cont_toks)
|
451 |
|
452 |
+
# use secondary stop seqs to cut off should-have-been-stopped content post-hoc
|
|
|
453 |
for term in until:
|
454 |
if len(term) > 0:
|
455 |
# ignore '' separator,
|
456 |
# for seq2seq case where self.tok_decode(self.eot_token_id) = ''
|
457 |
s = s.split(term)[0]
|
458 |
+
|
459 |
+
res.append((s, end_to_end_time, prefilling_time, token_per_sec))
|
|
|
460 |
|
461 |
self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
|
462 |
pbar.update(1)
|
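For reference, the measurement that survives this simplification is the StopWatch streamer: prefill time is taken up to the first generated token, and decoding throughput comes from the number of decoding iterations, which are exactly the three extra values the new _model_generate returns. A self-contained sketch of that pattern follows; the parts of put() not shown in the hunks above are reconstructed here, and the gpt2 checkpoint is only an illustration:

from time import time
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

class StopWatch(TextStreamer):
    """Times prefill (until the first generated token) and decoding separately."""
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_prefilling = None
        self.prefilling_time = None
        self.start_decoding = None
        self.decoding_time = None
        self.decoding_iterations = 0

    def put(self, value):
        if self.start_prefilling is None:
            # first call: the prompt tokens are pushed through the streamer
            self.start_prefilling = time()
            return
        if self.prefilling_time is None:
            # first generated token marks the end of prefill
            self.prefilling_time = time() - self.start_prefilling
            self.start_decoding = time()
        self.decoding_iterations += 1
        return

    def end(self):
        if self.decoding_time is None and self.start_decoding is not None:
            self.decoding_time = time() - self.start_decoding
        return

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tok("The capital of France is", return_tensors="pt")
watch = StopWatch(tok)

start = time()
model.generate(**inputs, max_new_tokens=32, streamer=watch)
end = time()

end_to_end_time = end - start
token_per_sec = watch.decoding_iterations / watch.decoding_time
print(end_to_end_time, watch.prefilling_time, token_per_sec)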
src/backend/manage_requests.py
CHANGED
@@ -27,24 +27,24 @@ class EvalRequest:
|
|
27 |
likes: Optional[int] = 0
|
28 |
params: Optional[int] = None
|
29 |
license: Optional[str] = ""
|
30 |
-
batch_size: Optional[int] = 1
|
31 |
-
gpu_type: Optional[str] = "NVIDIA-A100-PCIe-80GB"
|
32 |
|
33 |
def get_model_args(self) -> str:
|
34 |
model_args = f"pretrained={self.model},revision={self.revision},parallelize=True" # ,max_length=4096"
|
35 |
-
|
36 |
if self.precision in ["float16", "float32", "bfloat16"]:
|
37 |
model_args += f",dtype={self.precision}"
|
38 |
# Quantized models need some added config, the install of bits and bytes, etc
|
39 |
# elif self.precision == "8bit":
|
40 |
# model_args += ",load_in_8bit=True"
|
41 |
-
|
42 |
-
|
43 |
# elif self.precision == "GPTQ":
|
44 |
# A GPTQ model does not need dtype to be specified,
|
45 |
# it will be inferred from the config
|
|
|
46 |
elif self.precision == "8bit":
|
47 |
model_args += ",load_in_8bit=True"
|
|
|
48 |
else:
|
49 |
raise Exception(f"Unknown precision {self.precision}.")
|
50 |
return model_args
|
|
|
27 |
likes: Optional[int] = 0
|
28 |
params: Optional[int] = None
|
29 |
license: Optional[str] = ""
|
|
|
|
|
30 |
|
31 |
def get_model_args(self) -> str:
|
32 |
model_args = f"pretrained={self.model},revision={self.revision},parallelize=True" # ,max_length=4096"
|
33 |
+
|
34 |
if self.precision in ["float16", "float32", "bfloat16"]:
|
35 |
model_args += f",dtype={self.precision}"
|
36 |
# Quantized models need some added config, the install of bits and bytes, etc
|
37 |
# elif self.precision == "8bit":
|
38 |
# model_args += ",load_in_8bit=True"
|
39 |
+
# elif self.precision == "4bit":
|
40 |
+
# model_args += ",load_in_4bit=True"
|
41 |
# elif self.precision == "GPTQ":
|
42 |
# A GPTQ model does not need dtype to be specified,
|
43 |
# it will be inferred from the config
|
44 |
+
pass
|
45 |
elif self.precision == "8bit":
|
46 |
model_args += ",load_in_8bit=True"
|
47 |
+
model_args += ",trust_remote_code=True"
|
48 |
else:
|
49 |
raise Exception(f"Unknown precision {self.precision}.")
|
50 |
return model_args
|
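As a quick check of the precision handling above, this is the model_args string a request would hand to lm-eval for a hypothetical 8-bit submission (the model name is made up, and the GPTQ pass-through branch is omitted):

# Stand-alone reproduction of the new get_model_args() logic for precision="8bit";
# the three fields below are placeholders, not a real queued request.
model, revision, precision = "some-org/some-moe-model", "main", "8bit"

model_args = f"pretrained={model},revision={revision},parallelize=True"
if precision in ["float16", "float32", "bfloat16"]:
    model_args += f",dtype={precision}"
elif precision == "8bit":
    model_args += ",load_in_8bit=True"
    model_args += ",trust_remote_code=True"
else:
    raise Exception(f"Unknown precision {precision}.")

print(model_args)
# pretrained=some-org/some-moe-model,revision=main,parallelize=True,load_in_8bit=True,trust_remote_code=True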
src/backend/moe_infinity.py
CHANGED
@@ -31,20 +31,15 @@ class MoEHFLM(HFLMWithMeasurement):
|
|
31 |
self.use_chat_template = use_chat_template
|
32 |
if "device" in kwargs:
|
33 |
kwargs.pop("device")
|
34 |
-
if os.path.exists(os.path.join(self.offload_path, "moe-infinity-offloads")):
|
35 |
-
shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads"))
|
36 |
-
kwargs["device_map"] = "cuda:0"
|
37 |
super().__init__(
|
38 |
-
*args, **kwargs, pretrained=pretrained
|
39 |
) # Assuming HFLM accepts a 'pretrained' arg and handles it
|
40 |
# self._create_model()
|
|
|
41 |
|
42 |
def __del__(self):
|
43 |
-
|
44 |
-
|
45 |
-
if os.path.exists(os.path.join(self.offload_path, "moe-infinity-offloads")):
|
46 |
-
shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads")) # clean up offload model
|
47 |
-
|
48 |
|
49 |
def _create_model(self, *args, **kwargs):
|
50 |
"""
|
|
|
31 |
self.use_chat_template = use_chat_template
|
32 |
if "device" in kwargs:
|
33 |
kwargs.pop("device")
|
|
|
|
|
|
|
34 |
super().__init__(
|
35 |
+
*args, **kwargs, pretrained=pretrained, device_map="cuda:0"
|
36 |
) # Assuming HFLM accepts a 'pretrained' arg and handles it
|
37 |
# self._create_model()
|
38 |
+
shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads"))
|
39 |
|
40 |
def __del__(self):
|
41 |
+
# Clean up offloaded models from self.offload_path
|
42 |
+
shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads"))
|
|
|
|
|
|
|
43 |
|
44 |
def _create_model(self, *args, **kwargs):
|
45 |
"""
|
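One behavioural note on this hunk: the removed lines only deleted the offload directory when it already existed, while the new __init__/__del__ call shutil.rmtree unconditionally, which raises FileNotFoundError for a missing directory. A small sketch of the guarded variant (the helper name is ours, not part of the repo):

import os
import shutil

def cleanup_offload_dir(offload_path: str) -> None:
    # Remove MoE-Infinity's offload directory only if it is present, so a fresh
    # worker or a repeated cleanup call does not raise FileNotFoundError.
    target = os.path.join(offload_path, "moe-infinity-offloads")
    if os.path.exists(target):
        shutil.rmtree(target)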
src/backend/run_eval_suite.py
CHANGED
@@ -13,20 +13,16 @@ orig_higher_is_better = ConfigurableTask.higher_is_better
|
|
13 |
def process_results_decorator(func):
|
14 |
def wrapper(self, doc, results, *args, **kwargs):
|
15 |
processed_results = [r[0] for r in results]
|
16 |
-
|
17 |
end_to_end_time = sum([r[1] for r in results]) / len(results)
|
18 |
prefilling_time = sum([r[2] for r in results]) / len(results)
|
19 |
decoding_throughput = sum([r[3] for r in results]) / len(results)
|
20 |
-
mfu = sum([r[4] for r in results]) / len(results)
|
21 |
-
mbu = sum([r[5] for r in results]) / len(results)
|
22 |
# print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
|
23 |
|
24 |
result_dict = func(self, doc, processed_results, *args, **kwargs)
|
25 |
result_dict["end_to_end_time"] = end_to_end_time
|
26 |
result_dict["prefilling_time"] = prefilling_time
|
27 |
result_dict["decoding_throughput"] = decoding_throughput
|
28 |
-
result_dict["mfu"] = mfu
|
29 |
-
result_dict["mbu"] = mbu
|
30 |
return result_dict
|
31 |
return wrapper
|
32 |
ConfigurableTask.process_results = process_results_decorator(orig_process_results)
|
@@ -37,8 +33,6 @@ def aggregation_decorator(func):
|
|
37 |
aggregation_list["end_to_end_time"] = mean
|
38 |
aggregation_list["prefilling_time"] = mean
|
39 |
aggregation_list["decoding_throughput"] = mean
|
40 |
-
aggregation_list["mfu"] = mean
|
41 |
-
aggregation_list["mbu"] = mean
|
42 |
return aggregation_list
|
43 |
return wrapper
|
44 |
ConfigurableTask.aggregation = aggregation_decorator(orig_aggregation)
|
@@ -49,8 +43,6 @@ def higher_is_better_decorator(func):
|
|
49 |
higher_is_better_dict["end_to_end_time"] = False
|
50 |
higher_is_better_dict["prefilling_time"] = False
|
51 |
higher_is_better_dict["decoding_throughput"] = True
|
52 |
-
higher_is_better_dict["mfu"] = True
|
53 |
-
higher_is_better_dict["mbu"] = True
|
54 |
return higher_is_better_dict
|
55 |
return wrapper
|
56 |
ConfigurableTask.higher_is_better = higher_is_better_decorator(orig_higher_is_better)
|
|
|
13 |
def process_results_decorator(func):
|
14 |
def wrapper(self, doc, results, *args, **kwargs):
|
15 |
processed_results = [r[0] for r in results]
|
16 |
+
|
17 |
end_to_end_time = sum([r[1] for r in results]) / len(results)
|
18 |
prefilling_time = sum([r[2] for r in results]) / len(results)
|
19 |
decoding_throughput = sum([r[3] for r in results]) / len(results)
|
|
20 |
# print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
|
21 |
|
22 |
result_dict = func(self, doc, processed_results, *args, **kwargs)
|
23 |
result_dict["end_to_end_time"] = end_to_end_time
|
24 |
result_dict["prefilling_time"] = prefilling_time
|
25 |
result_dict["decoding_throughput"] = decoding_throughput
|
|
|
|
|
26 |
return result_dict
|
27 |
return wrapper
|
28 |
ConfigurableTask.process_results = process_results_decorator(orig_process_results)
|
|
|
33 |
aggregation_list["end_to_end_time"] = mean
|
34 |
aggregation_list["prefilling_time"] = mean
|
35 |
aggregation_list["decoding_throughput"] = mean
|
|
|
|
|
36 |
return aggregation_list
|
37 |
return wrapper
|
38 |
ConfigurableTask.aggregation = aggregation_decorator(orig_aggregation)
|
|
|
43 |
higher_is_better_dict["end_to_end_time"] = False
|
44 |
higher_is_better_dict["prefilling_time"] = False
|
45 |
higher_is_better_dict["decoding_throughput"] = True
|
|
|
|
|
46 |
return higher_is_better_dict
|
47 |
return wrapper
|
48 |
ConfigurableTask.higher_is_better = higher_is_better_decorator(orig_higher_is_better)
|
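The hunks above all follow one monkey-patching pattern: wrap the original ConfigurableTask hook, strip the measurements off each result tuple, and merge the averaged timings into the returned dictionary. A minimal sketch of that pattern with a stand-in class instead of lm-eval's ConfigurableTask (all names below are illustrative):

class DummyTask:
    def process_results(self, doc, results):
        return {"acc": float(results[0] == doc["answer"])}

orig_process_results = DummyTask.process_results

def process_results_decorator(func):
    def wrapper(self, doc, results, *args, **kwargs):
        processed_results = [r[0] for r in results]                   # plain model outputs
        end_to_end_time = sum(r[1] for r in results) / len(results)   # averaged timings
        prefilling_time = sum(r[2] for r in results) / len(results)
        decoding_throughput = sum(r[3] for r in results) / len(results)
        result_dict = func(self, doc, processed_results, *args, **kwargs)
        result_dict["end_to_end_time"] = end_to_end_time
        result_dict["prefilling_time"] = prefilling_time
        result_dict["decoding_throughput"] = decoding_throughput
        return result_dict
    return wrapper

DummyTask.process_results = process_results_decorator(orig_process_results)

print(DummyTask().process_results({"answer": "B"}, [("B", 1.2, 0.3, 41.0)]))
# {'acc': 1.0, 'end_to_end_time': 1.2, 'prefilling_time': 0.3, 'decoding_throughput': 41.0}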
src/backend/tasks/arena_hard/__init__.py
DELETED
File without changes
|
src/backend/tasks/arena_hard/arena_hard.yaml
DELETED
@@ -1,2 +0,0 @@
|
|
1 |
-
task: arena_hard
|
2 |
-
class: !function task.ArenaHard
|
|
|
|
|
|
src/backend/tasks/arena_hard/arena_judgment.py
DELETED
@@ -1,256 +0,0 @@
|
|
1 |
-
'''
|
2 |
-
This file is part of Open-MoE-LLM-Leaderboard and is modified based on work
|
3 |
-
under the Apache 2.0 License from the arena-hard project.
|
4 |
-
(https://github.com/lm-sys/arena-hard)
|
5 |
-
Original Copyright (c) 2024 Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica
|
6 |
-
See the NOTICE file distributed with this work for additional
|
7 |
-
information regarding copyright ownership.
|
8 |
-
'''
|
9 |
-
|
10 |
-
import pandas as pd
|
11 |
-
from tqdm import tqdm
|
12 |
-
import numpy as np
|
13 |
-
from sklearn.linear_model import LogisticRegression
|
14 |
-
import math
|
15 |
-
from collections import defaultdict
|
16 |
-
from tqdm import tqdm
|
17 |
-
|
18 |
-
from src.backend.tasks.arena_hard.arena_utils import (
|
19 |
-
chat_completion_openai,
|
20 |
-
load_questions,
|
21 |
-
load_model_answers,
|
22 |
-
get_endpoint,
|
23 |
-
make_config,
|
24 |
-
)
|
25 |
-
|
26 |
-
|
27 |
-
def get_score(judgment, pattern, pairwise=True):
|
28 |
-
matches = pattern.findall(judgment)
|
29 |
-
matches = [m for m in matches if m != ""]
|
30 |
-
if len(set(matches)) == 0:
|
31 |
-
return None, True
|
32 |
-
elif len(set(matches)) == 1:
|
33 |
-
if pairwise:
|
34 |
-
return matches[0].strip("\n"), False
|
35 |
-
return int(matches[0])
|
36 |
-
else:
|
37 |
-
return None, False
|
38 |
-
|
39 |
-
|
40 |
-
# get answer from model
|
41 |
-
def get_answer(model, conv, temperature, max_tokens, endpoint_dict=None):
|
42 |
-
api_dict = get_endpoint(endpoint_dict["endpoints"])
|
43 |
-
|
44 |
-
# if endpoint_dict["api_type"] == "anthropic":
|
45 |
-
# output = chat_completion_anthropic(model, conv, temperature, max_tokens)
|
46 |
-
# elif endpoint_dict["api_type"] == "azure":
|
47 |
-
# output = chat_completion_openai_azure(model, conv, temperature, max_tokens, api_dict)
|
48 |
-
|
49 |
-
output = chat_completion_openai(model, conv, temperature, max_tokens, api_dict)
|
50 |
-
return output
|
51 |
-
|
52 |
-
|
53 |
-
def judgment(**args):
|
54 |
-
question = args["question"]
|
55 |
-
answer = args["answer"]
|
56 |
-
reference = args["reference"]
|
57 |
-
baseline = args["baseline_answer"]
|
58 |
-
configs = args["configs"]
|
59 |
-
# output_file = args["output_file"]
|
60 |
-
model = configs["judge_model"]
|
61 |
-
|
62 |
-
num_games = 2 if configs["pairwise"] else 1
|
63 |
-
|
64 |
-
# output = {
|
65 |
-
# "question_id":question["question_id"],
|
66 |
-
# "judge": model,
|
67 |
-
# "model": "custom_model",
|
68 |
-
# "games":[]
|
69 |
-
# }
|
70 |
-
output = [question["question_id"]]
|
71 |
-
|
72 |
-
for game in range(num_games):
|
73 |
-
conv = [{"role": "system", "content": configs["system_prompt"]}]
|
74 |
-
|
75 |
-
for template in configs["prompt_template"]:
|
76 |
-
prompt_args = {}
|
77 |
-
|
78 |
-
prompt_args[f"question_{1}"] = question["content"]
|
79 |
-
base = 1
|
80 |
-
|
81 |
-
if baseline:
|
82 |
-
if game % 2 == 1: # swap position
|
83 |
-
temp = baseline
|
84 |
-
baseline = answer
|
85 |
-
answer = temp
|
86 |
-
|
87 |
-
if game == 0:
|
88 |
-
for i, turn in enumerate(baseline["choices"][0]["turns"]):
|
89 |
-
prompt_args[f"answer_{i+1}"] = turn["content"]
|
90 |
-
base += 1
|
91 |
-
|
92 |
-
if game == 1:
|
93 |
-
prompt_args[f"answer_{1}"] = baseline
|
94 |
-
base += 1
|
95 |
-
|
96 |
-
if answer:
|
97 |
-
prompt_args[f"answer_{base}"] = answer
|
98 |
-
|
99 |
-
if reference:
|
100 |
-
for j, ref_answer in enumerate(reference):
|
101 |
-
for i, turn in enumerate(ref_answer["choices"][0]["turns"]):
|
102 |
-
prompt_args[f"ref_answer_{i+j+1}"] = turn["content"]
|
103 |
-
|
104 |
-
user_prompt = template.format(**prompt_args)
|
105 |
-
conv.append({"role": "user", "content": user_prompt})
|
106 |
-
|
107 |
-
judgment = ""
|
108 |
-
for _ in range(2):
|
109 |
-
new_judgment = get_answer(
|
110 |
-
model,
|
111 |
-
conv,
|
112 |
-
configs["temperature"],
|
113 |
-
configs["max_tokens"],
|
114 |
-
args["endpoint_dict"],
|
115 |
-
)
|
116 |
-
|
117 |
-
judgment += ("\n" + new_judgment)
|
118 |
-
|
119 |
-
score, try_again = get_score(judgment, args["regex_pattern"])
|
120 |
-
|
121 |
-
conv.append({"role": "assistant", "content": new_judgment})
|
122 |
-
|
123 |
-
if not try_again:
|
124 |
-
break
|
125 |
-
|
126 |
-
conv.append({"role": "user", "content": "continue your judgment and finish by outputting a final verdict label"})
|
127 |
-
print("Finish judgment!!!")
|
128 |
-
# result = {
|
129 |
-
# "user_prompt": conv[1]["content"],
|
130 |
-
# "judgment": judgment,
|
131 |
-
# "score":score
|
132 |
-
# }
|
133 |
-
output.append(score)
|
134 |
-
|
135 |
-
return output
|
136 |
-
|
137 |
-
def get_battles_from_scores(score_list, first_game_only=False, WEIGHT=3):
|
138 |
-
arena_hard_battles = pd.DataFrame()
|
139 |
-
|
140 |
-
print("Turning score list into battles...")
|
141 |
-
|
142 |
-
for scores in tqdm(score_list):
|
143 |
-
question_id, score1, score2 = scores
|
144 |
-
|
145 |
-
# Process game 1
|
146 |
-
output = {"question_id": question_id,
|
147 |
-
"model_a": "gpt-4-0314",
|
148 |
-
"model_b": f"custom_model"} # Unique identifier for model
|
149 |
-
weight = 1
|
150 |
-
if score1 == "A=B":
|
151 |
-
output["winner"] = "tie"
|
152 |
-
elif score1 == "A>B":
|
153 |
-
output["winner"] = "model_a"
|
154 |
-
elif score1 == "A>>B":
|
155 |
-
output["winner"] = "model_a"
|
156 |
-
weight = WEIGHT
|
157 |
-
elif score1 == "B>A":
|
158 |
-
output["winner"] = "model_b"
|
159 |
-
elif score1 == "B>>A":
|
160 |
-
output["winner"] = "model_b"
|
161 |
-
weight = WEIGHT
|
162 |
-
else:
|
163 |
-
weight = 0
|
164 |
-
|
165 |
-
if weight:
|
166 |
-
arena_hard_battles = pd.concat([arena_hard_battles, pd.DataFrame([output] * weight)])
|
167 |
-
|
168 |
-
if not first_game_only:
|
169 |
-
# Process game 2
|
170 |
-
output = {"question_id": question_id,
|
171 |
-
"model_a": "gpt-4-0314",
|
172 |
-
"model_b": f"custom_model"} # Unique identifier for model
|
173 |
-
weight = 1
|
174 |
-
if score2 == "A=B":
|
175 |
-
output["winner"] = "tie"
|
176 |
-
elif score2 == "A>B":
|
177 |
-
output["winner"] = "model_b"
|
178 |
-
elif score2 == "A>>B":
|
179 |
-
output["winner"] = "model_b"
|
180 |
-
weight = WEIGHT
|
181 |
-
elif score2 == "B>A":
|
182 |
-
output["winner"] = "model_a"
|
183 |
-
elif score2 == "B>>A":
|
184 |
-
output["winner"] = "model_a"
|
185 |
-
weight = WEIGHT
|
186 |
-
else:
|
187 |
-
weight = 0
|
188 |
-
|
189 |
-
if weight:
|
190 |
-
arena_hard_battles = pd.concat([arena_hard_battles, pd.DataFrame([output] * weight)])
|
191 |
-
|
192 |
-
arena_hard_battles.to_json("./arena_hard_battles.jsonl", lines=True, orient="records")
|
193 |
-
return arena_hard_battles
|
194 |
-
|
195 |
-
def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
|
196 |
-
models = pd.concat([df["model_a"], df["model_b"]]).unique()
|
197 |
-
models = pd.Series(np.arange(len(models)), index=models)
|
198 |
-
|
199 |
-
LOW_RATING = 100
|
200 |
-
# duplicate battles
|
201 |
-
df = pd.concat([df, df], ignore_index=True)
|
202 |
-
p = len(models.index)
|
203 |
-
n = df.shape[0]
|
204 |
-
|
205 |
-
X = np.zeros([n, p])
|
206 |
-
X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
|
207 |
-
X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)
|
208 |
-
|
209 |
-
# one A win => two A win
|
210 |
-
Y = np.zeros(n)
|
211 |
-
Y[df["winner"] == "model_a"] = 1.0
|
212 |
-
|
213 |
-
# one tie => one A win + one B win
|
214 |
-
# find tie + tie (both bad) index
|
215 |
-
tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
|
216 |
-
tie_idx[len(tie_idx)//2:] = False
|
217 |
-
Y[tie_idx] = 1.0
|
218 |
-
|
219 |
-
if len(np.unique(Y)) == 1:
|
220 |
-
# If there's only one class in the data, assign default ratings
|
221 |
-
elo_scores = np.full(p, LOW_RATING)
|
222 |
-
elo_scores[models["gpt-4-0314"]] = INIT_RATING
|
223 |
-
else:
|
224 |
-
lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
|
225 |
-
lr.fit(X,Y)
|
226 |
-
|
227 |
-
elo_scores = SCALE * lr.coef_[0] + INIT_RATING
|
228 |
-
|
229 |
-
# set anchor as gpt-4-0314 = 1000
|
230 |
-
if "gpt-4-0314" in models.index:
|
231 |
-
elo_scores += 1000 - elo_scores[models["gpt-4-0314"]]
|
232 |
-
return pd.Series(elo_scores, index = models.index).sort_values(ascending=False)
|
233 |
-
|
234 |
-
def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
|
235 |
-
names = sorted(list(elo_ratings.keys()))
|
236 |
-
wins = defaultdict(lambda: defaultdict(lambda: 0))
|
237 |
-
for a in names:
|
238 |
-
for b in names:
|
239 |
-
ea = 1 / (1 + BASE ** ((elo_ratings[b] - elo_ratings[a]) / SCALE))
|
240 |
-
wins[a][b] = ea
|
241 |
-
wins[b][a] = 1 - ea
|
242 |
-
|
243 |
-
data = {
|
244 |
-
a: [wins[a][b] if a != b else np.NAN for b in names]
|
245 |
-
for a in names
|
246 |
-
}
|
247 |
-
|
248 |
-
df = pd.DataFrame(data, index=names)
|
249 |
-
df.index.name = "model_a"
|
250 |
-
df.columns.name = "model_b"
|
251 |
-
return df.T
|
252 |
-
|
253 |
-
def get_win_rate_column(df, column, baseline="gpt-4-0314"):
|
254 |
-
to_dict = df[["model", column]].set_index("model").to_dict()[column]
|
255 |
-
win_rate_table = predict_win_rate(to_dict)
|
256 |
-
return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x * 100, 2))
|
|
|
|
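The deleted judgment pipeline turns pairwise verdicts into weighted battles, fits Bradley-Terry style ratings with a logistic regression (compute_mle_elo), and then maps ratings back to win rates in predict_win_rate. That last step is just the logistic Elo formula; a tiny standalone check of it (the ratings below are made up):

BASE, SCALE = 10, 400

def expected_win_rate(rating_a: float, rating_b: float) -> float:
    # Probability that model A beats model B under the Elo/Bradley-Terry model,
    # matching the expression used inside the deleted predict_win_rate().
    return 1 / (1 + BASE ** ((rating_b - rating_a) / SCALE))

print(round(expected_win_rate(1100, 1000), 3))  # 0.64: a +100 Elo edge is roughly a 64% expected win rate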
src/backend/tasks/arena_hard/arena_utils.py
DELETED
@@ -1,349 +0,0 @@
|
|
1 |
-
'''
|
2 |
-
This file is part of Open-MoE-LLM-Leaderboard and is modified based on work
|
3 |
-
under the Apache 2.0 License from the arena-hard project.
|
4 |
-
(https://github.com/lm-sys/arena-hard)
|
5 |
-
Original Copyright (c) 2024 Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica
|
6 |
-
See the NOTICE file distributed with this work for additional
|
7 |
-
information regarding copyright ownership.
|
8 |
-
'''
|
9 |
-
|
10 |
-
|
11 |
-
import os
|
12 |
-
import json
|
13 |
-
import time
|
14 |
-
import yaml
|
15 |
-
import random
|
16 |
-
|
17 |
-
from typing import Optional
|
18 |
-
from glob import glob
|
19 |
-
|
20 |
-
# API setting constants
|
21 |
-
API_MAX_RETRY = 16
|
22 |
-
API_RETRY_SLEEP = 10
|
23 |
-
API_ERROR_OUTPUT = "$ERROR$"
|
24 |
-
|
25 |
-
|
26 |
-
OPENAI_MODEL_LIST = (
|
27 |
-
"gpt-3.5-turbo",
|
28 |
-
"gpt-3.5-turbo-0301",
|
29 |
-
"gpt-3.5-turbo-0613",
|
30 |
-
"gpt-3.5-turbo-0613-verbose",
|
31 |
-
"gpt-3.5-turbo-1106",
|
32 |
-
"gpt-3.5-turbo-0125",
|
33 |
-
"gpt-4",
|
34 |
-
"gpt-4-0314",
|
35 |
-
"gpt-4-0613",
|
36 |
-
"gpt-4-turbo",
|
37 |
-
"gpt-4-1106-preview",
|
38 |
-
"gpt-4-0125-preview",
|
39 |
-
)
|
40 |
-
|
41 |
-
|
42 |
-
temperature_config = {
|
43 |
-
"writing": 0.7,
|
44 |
-
"roleplay": 0.7,
|
45 |
-
"extraction": 0.0,
|
46 |
-
"math": 0.0,
|
47 |
-
"coding": 0.0,
|
48 |
-
"reasoning": 0.0,
|
49 |
-
"stem": 0.1,
|
50 |
-
"humanities": 0.1,
|
51 |
-
}
|
52 |
-
|
53 |
-
|
54 |
-
def load_questions(question_file: str):
|
55 |
-
"""Load questions from a file."""
|
56 |
-
questions = []
|
57 |
-
with open(question_file, "r") as ques_file:
|
58 |
-
for line in ques_file:
|
59 |
-
if line:
|
60 |
-
questions.append(json.loads(line))
|
61 |
-
return questions
|
62 |
-
|
63 |
-
|
64 |
-
def load_model_answers(answer_dir: str):
|
65 |
-
"""Load model answers.
|
66 |
-
|
67 |
-
The return value is a python dict of type:
|
68 |
-
Dict[model_name: str -> Dict[question_id: int -> answer: dict]]
|
69 |
-
"""
|
70 |
-
filenames = glob(os.path.join(answer_dir, "*.jsonl"))
|
71 |
-
filenames.sort()
|
72 |
-
model_answers = {}
|
73 |
-
|
74 |
-
for filename in filenames:
|
75 |
-
model_name = os.path.basename(filename)[:-6]
|
76 |
-
answer = {}
|
77 |
-
with open(filename) as fin:
|
78 |
-
for line in fin:
|
79 |
-
line = json.loads(line)
|
80 |
-
answer[line["question_id"]] = line
|
81 |
-
model_answers[model_name] = answer
|
82 |
-
|
83 |
-
return model_answers
|
84 |
-
|
85 |
-
|
86 |
-
def get_endpoint(endpoint_list):
|
87 |
-
if endpoint_list is None:
|
88 |
-
return None
|
89 |
-
assert endpoint_list is not None
|
90 |
-
# randomly pick one
|
91 |
-
api_dict = random.choices(
|
92 |
-
endpoint_list
|
93 |
-
)[0]
|
94 |
-
return api_dict
|
95 |
-
|
96 |
-
|
97 |
-
# load config args from config yaml files
|
98 |
-
def make_config(config_file: str) -> dict:
|
99 |
-
config_kwargs = {}
|
100 |
-
with open(config_file, "r") as f:
|
101 |
-
config_kwargs = yaml.load(f, Loader=yaml.SafeLoader)
|
102 |
-
|
103 |
-
return config_kwargs
|
104 |
-
|
105 |
-
|
106 |
-
def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None):
|
107 |
-
import openai
|
108 |
-
if api_dict:
|
109 |
-
client = openai.OpenAI(
|
110 |
-
base_url=api_dict["api_base"],
|
111 |
-
api_key=api_dict["api_key"],
|
112 |
-
)
|
113 |
-
else:
|
114 |
-
client = openai.OpenAI()
|
115 |
-
|
116 |
-
output = API_ERROR_OUTPUT
|
117 |
-
for _ in range(API_MAX_RETRY):
|
118 |
-
try:
|
119 |
-
# print(messages)
|
120 |
-
completion = client.chat.completions.create(
|
121 |
-
model=model,
|
122 |
-
messages=messages,
|
123 |
-
temperature=temperature,
|
124 |
-
max_tokens=max_tokens
|
125 |
-
)
|
126 |
-
output = completion.choices[0].message.content
|
127 |
-
break
|
128 |
-
except openai.RateLimitError as e:
|
129 |
-
print(type(e), e)
|
130 |
-
time.sleep(API_RETRY_SLEEP)
|
131 |
-
except openai.BadRequestError as e:
|
132 |
-
print(messages)
|
133 |
-
print(type(e), e)
|
134 |
-
except KeyError:
|
135 |
-
print(type(e), e)
|
136 |
-
break
|
137 |
-
|
138 |
-
return output
|
139 |
-
|
140 |
-
|
141 |
-
# def chat_completion_openai_azure(model, messages, temperature, max_tokens, api_dict=None):
|
142 |
-
# import openai
|
143 |
-
# from openai import AzureOpenAI
|
144 |
-
|
145 |
-
# api_base = api_dict["api_base"]
|
146 |
-
# client = AzureOpenAI(
|
147 |
-
# azure_endpoint = api_base,
|
148 |
-
# api_key= api_dict["api_key"],
|
149 |
-
# api_version=api_dict["api_version"],
|
150 |
-
# timeout=240,
|
151 |
-
# max_retries=2
|
152 |
-
# )
|
153 |
-
|
154 |
-
# output = API_ERROR_OUTPUT
|
155 |
-
# for _ in range(API_MAX_RETRY):
|
156 |
-
# try:
|
157 |
-
# response = client.chat.completions.create(
|
158 |
-
# model=model,
|
159 |
-
# messages=messages,
|
160 |
-
# n=1,
|
161 |
-
# temperature=temperature,
|
162 |
-
# max_tokens=max_tokens,
|
163 |
-
# seed=42,
|
164 |
-
# )
|
165 |
-
# output = response.choices[0].message.content
|
166 |
-
# break
|
167 |
-
# except openai.RateLimitError as e:
|
168 |
-
# print(type(e), e)
|
169 |
-
# time.sleep(API_RETRY_SLEEP)
|
170 |
-
# except openai.BadRequestError as e:
|
171 |
-
# print(type(e), e)
|
172 |
-
# break
|
173 |
-
# except KeyError:
|
174 |
-
# print(type(e), e)
|
175 |
-
# break
|
176 |
-
|
177 |
-
# return output
|
178 |
-
|
179 |
-
|
180 |
-
# def chat_completion_anthropic(model, messages, temperature, max_tokens, api_dict=None):
|
181 |
-
# import anthropic
|
182 |
-
|
183 |
-
# if api_dict:
|
184 |
-
# api_key = api_dict["api_key"]
|
185 |
-
# else:
|
186 |
-
# api_key = os.environ["ANTHROPIC_API_KEY"]
|
187 |
-
|
188 |
-
# sys_msg = ""
|
189 |
-
# if messages[0]["role"] == "system":
|
190 |
-
# sys_msg = messages[0]["content"]
|
191 |
-
# messages = messages[1:]
|
192 |
-
|
193 |
-
# output = API_ERROR_OUTPUT
|
194 |
-
# for _ in range(API_MAX_RETRY):
|
195 |
-
# try:
|
196 |
-
# # print(sys_msg)
|
197 |
-
# c = anthropic.Anthropic(api_key=api_key)
|
198 |
-
# response = c.messages.create(
|
199 |
-
# model=model,
|
200 |
-
# messages=messages,
|
201 |
-
# stop_sequences=[anthropic.HUMAN_PROMPT],
|
202 |
-
# max_tokens=max_tokens,
|
203 |
-
# temperature=temperature,
|
204 |
-
# system=sys_msg
|
205 |
-
# )
|
206 |
-
# output = response.content[0].text
|
207 |
-
# break
|
208 |
-
# except anthropic.APIError as e:
|
209 |
-
# print(type(e), e)
|
210 |
-
# time.sleep(API_RETRY_SLEEP)
|
211 |
-
# return output
|
212 |
-
|
213 |
-
|
214 |
-
# def chat_completion_mistral(model, messages, temperature, max_tokens):
|
215 |
-
# from mistralai.client import MistralClient
|
216 |
-
# from mistralai.models.chat_completion import ChatMessage
|
217 |
-
# from mistralai.exceptions import MistralException
|
218 |
-
|
219 |
-
# api_key = os.environ["MISTRAL_API_KEY"]
|
220 |
-
# client = MistralClient(api_key=api_key)
|
221 |
-
|
222 |
-
# prompts = [ChatMessage(role=message["role"], content=message["content"]) for message in messages]
|
223 |
-
|
224 |
-
# output = API_ERROR_OUTPUT
|
225 |
-
# for _ in range(API_MAX_RETRY):
|
226 |
-
# try:
|
227 |
-
# chat_response = client.chat(
|
228 |
-
# model=model,
|
229 |
-
# messages=prompts,
|
230 |
-
# temperature=temperature,
|
231 |
-
# max_tokens=max_tokens,
|
232 |
-
# )
|
233 |
-
# output = chat_response.choices[0].message.content
|
234 |
-
# break
|
235 |
-
# except MistralException as e:
|
236 |
-
# print(type(e), e)
|
237 |
-
# break
|
238 |
-
|
239 |
-
# return output
|
240 |
-
|
241 |
-
|
242 |
-
# def chat_completion_gemini(model, messages, temperature, max_tokens):
|
243 |
-
# import google.generativeai as genai
|
244 |
-
# genai.configure(api_key=os.environ["GEMINI_API_KEY"])
|
245 |
-
|
246 |
-
# safety_settings = [
|
247 |
-
# {
|
248 |
-
# "category": "HARM_CATEGORY_HARASSMENT",
|
249 |
-
# "threshold": "BLOCK_NONE"
|
250 |
-
# },
|
251 |
-
# {
|
252 |
-
# "category": "HARM_CATEGORY_HATE_SPEECH",
|
253 |
-
# "threshold": "BLOCK_NONE"
|
254 |
-
# },
|
255 |
-
# {
|
256 |
-
# "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
|
257 |
-
# "threshold": "BLOCK_NONE"
|
258 |
-
# },
|
259 |
-
# {
|
260 |
-
# "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
|
261 |
-
# "threshold": "BLOCK_NONE"
|
262 |
-
# },
|
263 |
-
# ]
|
264 |
-
|
265 |
-
# # Set up the model
|
266 |
-
# generation_config = {
|
267 |
-
# "temperature": temperature,
|
268 |
-
# "top_p": 1,
|
269 |
-
# "top_k": 1,
|
270 |
-
# "max_output_tokens": max_tokens,
|
271 |
-
# }
|
272 |
-
|
273 |
-
# output = API_ERROR_OUTPUT
|
274 |
-
# for _ in range(API_MAX_RETRY):
|
275 |
-
# try:
|
276 |
-
# gemini = genai.GenerativeModel(
|
277 |
-
# model_name=model,
|
278 |
-
# generation_config=generation_config,
|
279 |
-
# safety_settings=safety_settings)
|
280 |
-
|
281 |
-
# convo = gemini.start_chat(history=[])
|
282 |
-
|
283 |
-
# convo.send_message(messages)
|
284 |
-
# output = convo.last.text
|
285 |
-
# break
|
286 |
-
# except genai.types.generation_types.StopCandidateException as e:
|
287 |
-
# print(type(e), e)
|
288 |
-
# break
|
289 |
-
# except Exception as e:
|
290 |
-
# print(type(e), e)
|
291 |
-
# time.sleep(API_RETRY_SLEEP)
|
292 |
-
|
293 |
-
# return output
|
294 |
-
|
295 |
-
|
296 |
-
# def chat_completion_cohere(model, messages, temperature, max_tokens):
|
297 |
-
# import cohere
|
298 |
-
|
299 |
-
# co = cohere.Client(os.environ["COHERE_API_KEY"])
|
300 |
-
# assert len(messages) > 0
|
301 |
-
|
302 |
-
# template_map = {"system":"SYSTEM",
|
303 |
-
# "assistant":"CHATBOT",
|
304 |
-
# "user":"USER"}
|
305 |
-
|
306 |
-
# assert messages[-1]["role"] == "user"
|
307 |
-
# prompt = messages[-1]["content"]
|
308 |
-
|
309 |
-
# if len(messages) > 1:
|
310 |
-
# history = []
|
311 |
-
# for message in messages[:-1]:
|
312 |
-
# history.append({"role":template_map[message["role"]], "message":message["content"]})
|
313 |
-
# else:
|
314 |
-
# history = None
|
315 |
-
|
316 |
-
# output = API_ERROR_OUTPUT
|
317 |
-
# for _ in range(API_MAX_RETRY):
|
318 |
-
# try:
|
319 |
-
# response = co.chat(
|
320 |
-
# message=prompt,
|
321 |
-
# model=model,
|
322 |
-
# temperature=temperature,
|
323 |
-
# max_tokens=max_tokens,
|
324 |
-
# chat_history=history,
|
325 |
-
# )
|
326 |
-
# output = response.text
|
327 |
-
# break
|
328 |
-
# except cohere.core.api_error.ApiError as e:
|
329 |
-
# print(type(e), e)
|
330 |
-
# raise
|
331 |
-
# except Exception as e:
|
332 |
-
# print(type(e), e)
|
333 |
-
# break
|
334 |
-
|
335 |
-
# return output
|
336 |
-
|
337 |
-
|
338 |
-
def reorg_answer_file(answer_file):
|
339 |
-
"""Sort by question id and de-duplication"""
|
340 |
-
answers = {}
|
341 |
-
with open(answer_file, "r") as fin:
|
342 |
-
for l in fin:
|
343 |
-
qid = json.loads(l)["question_id"]
|
344 |
-
answers[qid] = l
|
345 |
-
|
346 |
-
qids = sorted(list(answers.keys()))
|
347 |
-
with open(answer_file, "w") as fout:
|
348 |
-
for qid in qids:
|
349 |
-
fout.write(answers[qid])
|
|
|
|
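All the judge backends in this deleted helper share one bounded-retry shape around the API call, using the constants at the top of the file. A generic sketch of that shape; call_api is a placeholder for the real client request, and the broad except stands in for the specific openai exceptions the original caught:

import time

API_MAX_RETRY = 16
API_RETRY_SLEEP = 10
API_ERROR_OUTPUT = "$ERROR$"

def with_retries(call_api, *args, **kwargs):
    # Return the first successful response, or the sentinel after API_MAX_RETRY failures.
    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            output = call_api(*args, **kwargs)
            break
        except Exception as e:
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)
    return output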
src/backend/tasks/arena_hard/configs/api_config.yaml
DELETED
@@ -1,17 +0,0 @@
|
|
1 |
-
# gpt-3.5-turbo:
|
2 |
-
# model_name: gpt-3.5-turbo
|
3 |
-
# endpoints: null
|
4 |
-
# api_type: openai
|
5 |
-
# parallel: 8
|
6 |
-
|
7 |
-
gpt-4-1106-preview:
|
8 |
-
model_name: gpt-4-1106-preview
|
9 |
-
endpoints: null
|
10 |
-
api_type: openai
|
11 |
-
parallel: 8
|
12 |
-
|
13 |
-
# llama3-7b:
|
14 |
-
# model_name: llama3-7b
|
15 |
-
# endpoints: null
|
16 |
-
# api_type: openai
|
17 |
-
# parallel: 8
|
|
|
src/backend/tasks/arena_hard/configs/judge_config.yaml
DELETED
@@ -1,26 +0,0 @@
|
|
1 |
-
name: judgment config file for Arena Hard
|
2 |
-
|
3 |
-
bench_name: arena-hard-v0.1
|
4 |
-
|
5 |
-
# Arena Hard default
|
6 |
-
judge_model: gpt-4-1106-preview
|
7 |
-
# judge_model: gpt-3.5-turbo
|
8 |
-
reference: False # Optional
|
9 |
-
ref_model: null
|
10 |
-
|
11 |
-
baseline: True
|
12 |
-
baseline_model: gpt-4-0314
|
13 |
-
|
14 |
-
pairwise: True
|
15 |
-
temperature: 0
|
16 |
-
max_tokens: 4096
|
17 |
-
|
18 |
-
regex_pattern: \[\[([AB<>=]+)\]\]
|
19 |
-
|
20 |
-
system_prompt: "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."
|
21 |
-
|
22 |
-
prompt_template: ["<|User Prompt|>\n{question_1}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>"]
|
23 |
-
|
24 |
-
# Add your model below for evaluation
|
25 |
-
# model_list:
|
26 |
-
# - gpt-3.5-turbo-0125
|
|
|
|
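The regex_pattern in this config is what the deleted arena_judgment.get_score() applies to the judge's free-form answer to pull out the verdict label; a quick illustration with a made-up judgment string:

import re

pattern = re.compile(r"\[\[([AB<>=]+)\]\]")  # same pattern as regex_pattern above
judgment = "Both answers are correct, but B is more concise. My final verdict is: [[B>A]]"
print(pattern.findall(judgment))  # ['B>A']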
src/backend/tasks/arena_hard/model_answer/gpt-4-0314.jsonl
DELETED
The diff for this file is too large to render.
See raw diff
|
|
src/backend/tasks/arena_hard/question.jsonl
DELETED
The diff for this file is too large to render.
See raw diff
|
|
src/backend/tasks/arena_hard/task.py
DELETED
@@ -1,220 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
from typing import Union, List
|
3 |
-
|
4 |
-
from lm_eval.api.task import ConfigurableTask
|
5 |
-
from lm_eval.api.instance import Instance
|
6 |
-
|
7 |
-
# from lm_eval.api.registry import register_task
|
8 |
-
from lm_eval.api.metrics import mean
|
9 |
-
|
10 |
-
from src.backend.envs import DEVICE
|
11 |
-
|
12 |
-
import pandas as pd
|
13 |
-
|
14 |
-
from src.backend.tasks.measurement_task_utils import measure_system_metrics
|
15 |
-
import json
|
16 |
-
|
17 |
-
from typing import (
|
18 |
-
Any,
|
19 |
-
Dict,
|
20 |
-
List,
|
21 |
-
Optional,
|
22 |
-
Union,
|
23 |
-
)
|
24 |
-
|
25 |
-
from datasets import Dataset
|
26 |
-
import re
|
27 |
-
|
28 |
-
from src.backend.tasks.arena_hard.arena_utils import (
|
29 |
-
load_questions,
|
30 |
-
load_questions,
|
31 |
-
load_model_answers,
|
32 |
-
make_config,
|
33 |
-
)
|
34 |
-
|
35 |
-
from src.backend.tasks.arena_hard.arena_judgment import (
|
36 |
-
judgment,
|
37 |
-
get_battles_from_scores,
|
38 |
-
compute_mle_elo,
|
39 |
-
predict_win_rate,
|
40 |
-
get_win_rate_column
|
41 |
-
)
|
42 |
-
|
43 |
-
def load_questions(question_file: str):
|
44 |
-
"""Load questions from a file."""
|
45 |
-
questions = []
|
46 |
-
with open(question_file, "r") as ques_file:
|
47 |
-
for line in ques_file:
|
48 |
-
if line:
|
49 |
-
questions.append(json.loads(line))
|
50 |
-
return questions
|
51 |
-
|
52 |
-
def download_wrapper(func):
|
53 |
-
def download(self, *args, **kwargs):
|
54 |
-
print("Using Arena Hard, No need to download")
|
55 |
-
return download
|
56 |
-
|
57 |
-
original_download = ConfigurableTask.download
|
58 |
-
ConfigurableTask.download = download_wrapper(original_download)
|
59 |
-
# @register_task("selfcheckgpt")
|
60 |
-
@measure_system_metrics
|
61 |
-
class ArenaHard(ConfigurableTask):
|
62 |
-
VERSION = 0.0
|
63 |
-
OUTPUT_TYPE = "generate_until"
|
64 |
-
data_path = os.path.join(os.path.dirname(__file__), 'question.jsonl')
|
65 |
-
judge_config_path = os.path.join(os.path.dirname(__file__), "configs/judge_config.yaml")
|
66 |
-
configs = make_config(judge_config_path)
|
67 |
-
model_ans_dir = os.path.join(os.path.dirname(__file__), "model_answer")
|
68 |
-
model_answers = load_model_answers(model_ans_dir)
|
69 |
-
data = load_questions(data_path)
|
70 |
-
|
71 |
-
def __init__(self):
|
72 |
-
super().__init__(config={"metadata": {"version": self.VERSION}})
|
73 |
-
# these end tokens are hard coded because of the current limitaion of the llm-eval.
|
74 |
-
# self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
|
75 |
-
self.generation_kwargs = {"until": ["</s>", "<|im_end|>"], "max_gen_toks": 4096}
|
76 |
-
# self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
|
77 |
-
# self.generation_kwargs_sampling = {
|
78 |
-
# "temperature": 0.99,
|
79 |
-
# "do_sample": True,
|
80 |
-
# "until": ["<im_end>", "<im_end>"],
|
81 |
-
# "max_length": 1024,
|
82 |
-
# }
|
83 |
-
|
84 |
-
def transform_data(self, data):
|
85 |
-
transformed_data = []
|
86 |
-
for i in range(len(data)):
|
87 |
-
if self.configs["baseline"]:
|
88 |
-
baseline_answer = self.model_answers[self.configs["baseline_model"]][data[i]["question_id"]]
|
89 |
-
else:
|
90 |
-
baseline_answer = None
|
91 |
-
transformed_item = {
|
92 |
-
"question_id": data[i]["question_id"],
|
93 |
-
"content": data[i]["turns"][0]["content"], # Assuming you want the first turn's content
|
94 |
-
"model_answer": baseline_answer
|
95 |
-
}
|
96 |
-
transformed_data.append(transformed_item)
|
97 |
-
return transformed_data
|
98 |
-
|
99 |
-
def has_training_docs(self):
|
100 |
-
return False
|
101 |
-
|
102 |
-
def has_validation_docs(self):
|
103 |
-
return True
|
104 |
-
|
105 |
-
def has_test_docs(self):
|
106 |
-
return False
|
107 |
-
|
108 |
-
def validation_docs(self):
|
109 |
-
self.dataset = self.transform_data(self.data)
|
110 |
-
self.dataset = Dataset.from_dict({"question_id": [item["question_id"] for item in self.dataset],
|
111 |
-
"content": [item["content"] for item in self.dataset],
|
112 |
-
"model_answer": [item["model_answer"] for item in self.dataset]})
|
113 |
-
return self.dataset
|
114 |
-
|
115 |
-
def doc_to_text(self, doc):
|
116 |
-
sentence = doc["content"]
|
117 |
-
doc_text = f"{sentence}\n"
|
118 |
-
return doc_text
|
119 |
-
|
120 |
-
def doc_to_target(self, doc):
|
121 |
-
q_id = doc["question_id"]
|
122 |
-
return q_id
|
123 |
-
|
124 |
-
def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
|
125 |
-
arguments = (ctx, self.generation_kwargs)
|
126 |
-
request_list = [
|
127 |
-
Instance(request_type="generate_until", doc=doc, arguments=arguments, idx=0, **kwargs),
|
128 |
-
]
|
129 |
-
# sampling_arguments = (ctx, self.generation_kwargs_sampling)
|
130 |
-
# request_list.extend(
|
131 |
-
# [
|
132 |
-
# Instance(request_type="generate_until", doc=doc, arguments=sampling_arguments, idx=idx, **kwargs)
|
133 |
-
# for idx in range(1, self.generation_kwargs_sampling_number + 1)
|
134 |
-
# ]
|
135 |
-
# )
|
136 |
-
return request_list
|
137 |
-
|
138 |
-
def process_results(self, doc, results):
|
139 |
-
response_temperature_0 = results[0]
|
140 |
-
# other_responses = results[1:]
|
141 |
-
api_config_path = os.path.join(os.path.dirname(__file__), "configs/api_config.yaml")
|
142 |
-
endpoint_list = make_config(api_config_path)
|
143 |
-
|
144 |
-
if self.configs["regex_pattern"]:
|
145 |
-
pattern = re.compile(self.configs["regex_pattern"])
|
146 |
-
|
147 |
-
ref_answer_dir = os.path.join(os.path.dirname(__file__), "reference_answer")
|
148 |
-
|
149 |
-
ref_answers = None
|
150 |
-
if self.configs["reference"]:
|
151 |
-
ref_answers = load_model_answers(ref_answer_dir)
|
152 |
-
ref_answers = [ref_answers[model] for model in self.configs["ref_model"]]
|
153 |
-
|
154 |
-
# output_files = {}
|
155 |
-
# models = ["custom_model"]
|
156 |
-
# output_dir = f"{os.path.join(os.path.dirname(__file__))}/model_judgments/{self.configs['judge_model']}"
|
157 |
-
# for model in models:
|
158 |
-
# output_files[model] = os.path.join(
|
159 |
-
# output_dir,
|
160 |
-
# f"{model}.jsonl",
|
161 |
-
# )
|
162 |
-
|
163 |
-
# for output_file in output_files.values():
|
164 |
-
# os.makedirs(os.path.dirname(output_file), exist_ok=True)
|
165 |
-
|
166 |
-
endpoint_info = endpoint_list[self.configs["judge_model"]]
|
167 |
-
|
168 |
-
question = doc
|
169 |
-
kwargs = {}
|
170 |
-
kwargs["question"] = question
|
171 |
-
kwargs["answer"] = response_temperature_0
|
172 |
-
if ref_answers:
|
173 |
-
kwargs["reference"] = [ref_answer[doc["question_id"]] for ref_answer in ref_answers]
|
174 |
-
assert len(kwargs["reference"]) == len(self.configs["ref_model"])
|
175 |
-
else:
|
176 |
-
kwargs["reference"] = None
|
177 |
-
|
178 |
-
if self.configs["baseline"]:
|
179 |
-
kwargs["baseline_answer"] = doc["model_answer"]
|
180 |
-
else:
|
181 |
-
kwargs["baseline_answer"] = None
|
182 |
-
kwargs["configs"] = self.configs
|
183 |
-
kwargs["endpoint_dict"] = endpoint_info
|
184 |
-
# kwargs["output_file"] = output_files["custom_model"]
|
185 |
-
kwargs["regex_pattern"] = pattern
|
186 |
-
|
187 |
-
scores = judgment(**kwargs)
|
188 |
-
return {"score": scores}
|
189 |
-
|
190 |
-
def aggregation(self):
|
191 |
-
"""
|
192 |
-
:returns: {str: [float] -> float}
|
193 |
-
A dictionary where keys are the names of submetrics and values are
|
194 |
-
functions that aggregate a list of metrics
|
195 |
-
"""
|
196 |
-
##TODO implement the aggregation function to calculate elo for score
|
197 |
-
def get_win_rate(score_list):
|
198 |
-
battles = get_battles_from_scores(score_list)
|
199 |
-
bootstrap_online_elo = compute_mle_elo(battles)
|
200 |
-
stats = pd.DataFrame()
|
201 |
-
stats["results"] = None
|
202 |
-
stats["results"] = stats['results'].astype('object')
|
203 |
-
for i, model in enumerate(bootstrap_online_elo.index):
|
204 |
-
stats.at[i, "model"] = model
|
205 |
-
stats.at[i, "score"] = bootstrap_online_elo[model]
|
206 |
-
|
207 |
-
stats.sort_values(by="model", inplace=True)
|
208 |
-
stats["score"] = get_win_rate_column(stats, "score", "gpt-4-0314").tolist()
|
209 |
-
|
210 |
-
return stats["score"][1]
|
211 |
-
|
212 |
-
return {k: get_win_rate for k in ["score"]}
|
213 |
-
|
214 |
-
def higher_is_better(self):
|
215 |
-
"""
|
216 |
-
:returns: {str: bool}
|
217 |
-
A dictionary where keys are the names of submetrics and values are
|
218 |
-
whether a higher value of the submetric is better
|
219 |
-
"""
|
220 |
-
return {k: True for k in ["score"]}
|
|
|
|
src/backend/tasks/gsm8k/gsm8k-custom.yaml
DELETED
@@ -1,47 +0,0 @@
-group:
-  - math_word_problems
-task: gsm8k_custom
-dataset_path: gsm8k
-dataset_name: main
-output_type: generate_until
-training_split: train
-fewshot_split: train
-test_split: test
-doc_to_text: "Question: {{question}}\nAnswer:"
-doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}"
-metric_list:
-  - metric: exact_match
-    aggregation: mean
-    higher_is_better: true
-    ignore_case: true
-    ignore_punctuation: false
-    regexes_to_ignore:
-      - ","
-      - "\\$"
-      - "(?s).*#### "
-      - "\\.$"
-generation_kwargs:
-  until:
-    - "Question:"
-    - "Question"
-    - "</s>"
-    - "<|im_end|>"
-  do_sample: false
-  temperature: 0.0
-# is_gsm8k: true
-repeats: 1
-num_fewshot: 5
-filter_list:
-  - name: "strict-match"
-    filter:
-      - function: "regex"
-        regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
-      - function: "take_first"
-  - name: "flexible-extract"
-    filter:
-      - function: "regex"
-        group_select: -1
-        regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"
-      - function: "take_first"
-metadata:
-  version: 3.0
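For context, the `strict-match` filter in the deleted config extracts the final number after the `#### ` marker that GSM8K reference solutions end with. A minimal illustration of that extraction (the sample completion below is invented):

```python
import re

# How a "strict-match" style regex pulls the final answer out of a
# GSM8K-style completion that ends with "#### <answer>".
completion = "Natalia sold 48 clips in April and 24 in May, so 48 + 24 = 72.\n#### 72"

strict_match = re.compile(r"#### (\-?[0-9\.\,]+)")
match = strict_match.search(completion)
print(match.group(1) if match else "[invalid]")  # 72
```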
src/backend/tasks/measurement_task_utils.py
CHANGED
@@ -12,9 +12,6 @@ def process_results_decorator(func):
         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
         decoding_throughput = sum([r[3] for r in results]) / len(results)
-        mfu = sum([r[4] for r in results]) / len(results)
-        mbu = sum([r[5] for r in results]) / len(results)
-
         # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")

         # Now call the original process_results with the processed results
@@ -22,8 +19,6 @@ def process_results_decorator(func):
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
         result_dict["decoding_throughput"] = decoding_throughput
-        result_dict["mfu"] = mfu
-        result_dict["mbu"] = mbu
         return result_dict
     return wrapper

@@ -35,8 +30,6 @@ def aggregation_decorator(func):
         aggregation_list["end_to_end_time"] = mean
         aggregation_list["prefilling_time"] = mean
         aggregation_list["decoding_throughput"] = mean
-        aggregation_list["mfu"] = mean
-        aggregation_list["mbu"] = mean
         return aggregation_list
     return wrapper

@@ -48,8 +41,6 @@ def higher_is_better_decorator(func):
         higher_is_better_dict["end_to_end_time"] = False
         higher_is_better_dict["prefilling_time"] = False
         higher_is_better_dict["decoding_throughput"] = True
-        higher_is_better_dict["mfu"] = True
-        higher_is_better_dict["mbu"] = True
         return higher_is_better_dict
     return wrapper

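For context, this module decorates a task's `process_results` so that per-request timing measurements are averaged and attached to the metric dict. A minimal sketch of that pattern, assuming each result tuple is `(lm_output, end_to_end_time, prefilling_time, decoding_throughput)` as the indexing above suggests (not a copy of the repository's wrapper):

```python
import functools

# Sketch of a process_results decorator: average the measured timings across
# the requests for one document, then forward only the raw model outputs to
# the original scoring function and attach the averages to its result dict.
def process_results_decorator(func):
    @functools.wraps(func)
    def wrapper(self, doc, results, *args, **kwargs):
        end_to_end_time = sum(r[1] for r in results) / len(results)
        prefilling_time = sum(r[2] for r in results) / len(results)
        decoding_throughput = sum(r[3] for r in results) / len(results)

        result_dict = func(self, doc, [r[0] for r in results], *args, **kwargs)
        result_dict["end_to_end_time"] = end_to_end_time
        result_dict["prefilling_time"] = prefilling_time
        result_dict["decoding_throughput"] = decoding_throughput
        return result_dict
    return wrapper
```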
src/backend/tasks/selfcheckgpt/task.py
CHANGED
@@ -27,12 +27,12 @@ class SelfCheckGPT(ConfigurableTask):
         super().__init__(config={"metadata": {"version": self.VERSION}})
         # these end tokens are hard coded because of the current limitaion of the llm-eval.
         # self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
-        self.generation_kwargs = {"until": ["
+        self.generation_kwargs = {"until": ["<im_end>"], "max_length": 1024}
         self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
         self.generation_kwargs_sampling = {
             "temperature": 0.99,
             "do_sample": True,
-            "until": ["
+            "until": ["<im_end>", "</s>"],
             "max_length": 1024,
         }

src/display/about.py
CHANGED
@@ -3,36 +3,23 @@ from src.display.utils import ModelType
 TITLE = """<h1 align="center" id="space-title">OPEN-MOE-LLM-LEADERBOARD</h1>"""

 INTRODUCTION_TEXT = """
-The OPEN-MOE-LLM-LEADERBOARD is specifically designed to assess the performance and efficiency of various Mixture of Experts (MoE) Large Language Models (LLMs).
-This initiative, driven by the open-source community, aims to comprehensively evaluate these advanced MoE LLMs.
+The OPEN-MOE-LLM-LEADERBOARD is specifically designed to assess the performance and efficiency of various Mixture of Experts (MoE) Large Language Models (LLMs). This initiative, driven by the open-source community, aims to comprehensively evaluate these advanced MoE LLMs. We extend our gratitude to the Huggingface for the GPU community grant that supported the initial debugging process, and to [NetMind.AI](https://netmind.ai/home) for their generous GPU donation, which ensures the continuous operation of the Leaderboard.

 The OPEN-MOE-LLM-LEADERBOARD includes generation and multiple choice tasks to measure the performance and efficiency of MOE LLMs.


 Tasks:
+- **Generation Self-consistancy** -- [SelfCheckGPT](https://github.com/potsawee/selfcheckgpt)
 - **Multiple Choice Performance** -- [MMLU](https://arxiv.org/abs/2009.03300)
-- **Mathematics Problem-Solving Performance** -- [GSM8K](https://arxiv.org/abs/2110.14168)
-- **AI Judgment Scores for Responses to Complex User Queries** -- [Arena_Hard](https://lmsys.org/blog/2024-04-19-arena-hard/)

 Columns and Metrics:
 - Method: The MOE LLMs inference framework.
 - E2E(s): Average End to End generation time in seconds.
 - PRE(s): Prefilling Time of input prompt in seconds.
 - T/s: Tokens throughout per second.
-- S-MBU(%): Sparse Model Bandwidth Utilization.
-- S-MFU(%): Sparse Model FLOPs Utilization.
 - Precision: The precison of used model.

 """
-
-ACKNOWLEDGEMENT_TEXT = """
-<div>
-    <h4>Acknowledgements</h4>
-    {image_html}
-    <p>We express our sincere gratitude to <a href="https://netmind.ai/home">NetMind.AI</a> for their generous donation of GPUs, which plays a crucial role in ensuring the continuous operation of our Leaderboard.</p>
-</div>
-"""
-
 LLM_BENCHMARKS_TEXT = f"""

 """

src/display/imgs/Netmind.AI_LOGO.jpg
DELETED
Binary file (6.92 kB)
src/display/utils.py
CHANGED
@@ -13,33 +13,6 @@ TS = "T/s" #Decoding throughput (tok/s)
 InFrame = "Method" #"Inference framework"
 MULTIPLE_CHOICEs = ["mmlu"]

-GPU_TEMP = 'Temp(C)'
-GPU_Power = 'Power(W)'
-GPU_Mem = 'Mem(G)'
-GPU_Name = "GPU"
-GPU_Util = 'Util(%)'
-MFU = 'S-MFU(%)'
-MBU = 'S-MBU(%)'
-BATCH_SIZE = 'bs'
-PRECISION = "Precision"
-system_metrics_to_name_map = {
-    "end_to_end_time": f"{E2Es}",
-    "prefilling_time": f"{PREs}",
-    "decoding_throughput": f"{TS}",
-    "mfu": f"{MFU}",
-    "mbu": f"{MBU}"
-}
-
-gpu_metrics_to_name_map = {
-    GPU_Util: GPU_Util,
-    GPU_TEMP: GPU_TEMP,
-    GPU_Power: GPU_Power,
-    GPU_Mem: GPU_Mem,
-    "batch_size": BATCH_SIZE,
-    "precision": PRECISION,
-    GPU_Name: GPU_Name
-}
-
 @dataclass
 class Task:
     benchmark: str
@@ -77,11 +50,8 @@ class Tasks(Enum):
     # halueval_dial = Task("halueval_dialogue", "acc", "HaluDial/Acc")

     # # XXX include me back at some point
-
+    selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
     mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)
-    gsm8k = Task("gsm8k_custom", "em", "GSM8K") #GSM8K/EM (5-shot)
-    # gsm8k_cot = Task("gsm8k_cot", "em", "GSM8K COT") #GSM8K COT/EM (5-shot)
-    arena_hard = Task("arena_hard", "score", "Arena Hard") #Arena Hard/Score


 # These classes are for user facing column names,
@@ -106,35 +76,27 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 # # auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])

 # Inference framework
-auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnContent(f"{InFrame}", "str", True
+auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnContent(f"{InFrame}", "str", True)])

 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
     # System performance metrics
-    auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name}
-    auto_eval_column_dict.append([f"{task.name}_batch_size", ColumnContent, ColumnContent(f"{task.value.col_name} {BATCH_SIZE}", "number", True, hidden=True)])
-    # auto_eval_column_dict.append([f"{task.name}_precision", ColumnContent, ColumnContent(f"{task.value.col_name} {PRECISION}", "str", True, hidden=True)])
-    # auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True, hidden=True)])
-    auto_eval_column_dict.append([f"{task.name}_gpu", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Name}", "str", True, hidden=True)])
-    # auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True, hidden=True)])
+    auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name}-{E2Es}", "number", True)])
     if task.value.benchmark in MULTIPLE_CHOICEs:
         continue
-
-    auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name}
-    auto_eval_column_dict.append([f"{task.name}_mbu", ColumnContent, ColumnContent(f"{task.value.col_name} {MBU}", "number", True, hidden=True)])
-    auto_eval_column_dict.append([f"{task.name}_mfu", ColumnContent, ColumnContent(f"{task.value.col_name} {MFU}", "number", True, hidden=True)])
-
+    auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name}-{PREs}", "number", True)])
+    auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name}-{TS}", "number", True)])

 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False
-
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True
-
-
-
-
-
+auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True)])
+auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 # Dummy column for the search bar (hidden by the custom CSS)
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])

@@ -160,10 +122,10 @@ class ModelDetails:


 class ModelType(Enum):
-
-
+    PT = ModelDetails(name="pretrained", symbol="🟢")
+    FT = ModelDetails(name="fine-tuned on domain-specific datasets", symbol="🔶")
     chat = ModelDetails(name="chat models (RLHF, DPO, IFT, ...)", symbol="💬")
-
+    merges = ModelDetails(name="base merges and moerges", symbol="🤝")
     Unknown = ModelDetails(name="", symbol="?")

     def to_str(self, separator=" "):
@@ -171,24 +133,21 @@ class ModelType(Enum):

     @staticmethod
     def from_str(type):
-
-
-
-
+        if "fine-tuned" in type or "🔶" in type:
+            return ModelType.FT
+        if "pretrained" in type or "🟢" in type:
+            return ModelType.PT
         if any([k in type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]):
             return ModelType.chat
-
-
+        if "merge" in type or "🤝" in type:
+            return ModelType.merges
         return ModelType.Unknown


 class InferenceFramework(Enum):
     # "moe-infinity", hf-chat
-
+    MoE_Infinity = ModelDetails("moe-infinity")
     HF_Chat = ModelDetails("hf-chat")
-    VLLM = ModelDetails("vllm_moe")
-    TRTLLM = ModelDetails("tensorrt_llm")
-    VLLM_FIX = ModelDetails("vllm_moe_fixbs")
     Unknown = ModelDetails("?")

     def to_str(self):
@@ -196,34 +155,13 @@ class InferenceFramework(Enum):

     @staticmethod
     def from_str(inference_framework: str):
-
-
-        if inference_framework in ["tensorrt_llm"]:
-            return InferenceFramework.TRTLLM
+        if inference_framework in ["moe-infinity"]:
+            return InferenceFramework.MoE_Infinity
         if inference_framework in ["hf-chat"]:
             return InferenceFramework.HF_Chat
-        if inference_framework in ["vllm_moe"]:
-            return InferenceFramework.VLLM
-        if inference_framework in ["vllm_moe_fixbs"]:
-            return InferenceFramework.VLLM_FIX
         return InferenceFramework.Unknown

-class GPUType(Enum):
-    A100_sxm = ModelDetails("NVIDIA-A100-SXM4-80GB")
-    A100_pcie = ModelDetails("NVIDIA-A100-PCIe-80GB")
-    Unknown = ModelDetails("?")

-    def to_str(self):
-        return self.value.name
-
-    @staticmethod
-    def from_str(gpu_type: str):
-        if gpu_type in ["NVIDIA-A100-PCIe-80GB"]:
-            return GPUType.A100_pcie
-        if gpu_type in ["NVIDIA-A100-SXM4-80GB"]:
-            return GPUType.A100_sxm
-        return GPUType.Unknown
-
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
@@ -231,34 +169,34 @@ class WeightType(Enum):


 class Precision(Enum):
-
-
+    float32 = ModelDetails("float32")
+    float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     qt_8bit = ModelDetails("8bit")
     qt_4bit = ModelDetails("4bit")
-
+    qt_GPTQ = ModelDetails("GPTQ")
     Unknown = ModelDetails("?")

     @staticmethod
     def from_str(precision: str):
-
-
-
-
+        if precision in ["torch.float32", "float32"]:
+            return Precision.float32
+        if precision in ["torch.float16", "float16"]:
+            return Precision.float16
         if precision in ["torch.bfloat16", "bfloat16"]:
             return Precision.bfloat16
         if precision in ["8bit"]:
             return Precision.qt_8bit
         if precision in ["4bit"]:
             return Precision.qt_4bit
-
-
+        if precision in ["GPTQ", "None"]:
+            return Precision.qt_GPTQ
         return Precision.Unknown


 # Column selection
-COLS = [c.name for c in fields(AutoEvalColumn)]
-TYPES = [c.type for c in fields(AutoEvalColumn)]
+COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
 COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]

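For context, `COLS` and `COLS_LITE` at the end of this file are derived from the `hidden` and `displayed_by_default` flags carried by each column definition. A self-contained, simplified sketch of that selection (the real `ColumnContent` lives in `src/display/utils.py`; the instances below are invented):

```python
from dataclasses import dataclass

# Simplified illustration of how hidden / displayed_by_default flags drive the
# COLS and COLS_LITE selections shown in the diff above.
@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    dummy: bool = False

columns = [
    ColumnContent("Model", "markdown", True),
    ColumnContent("Precision", "str", True),
    ColumnContent("Model sha", "str", False),
    ColumnContent("model_name_for_query", "str", False, dummy=True),
]

cols = [c.name for c in columns if not c.hidden]
cols_lite = [c.name for c in columns if c.displayed_by_default and not c.hidden]
print(cols)       # every column that is not hidden
print(cols_lite)  # only the columns shown by default
```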
src/leaderboard/read_evals.py
CHANGED
@@ -65,11 +65,11 @@ class EvalResult:
         if len(org_and_model) == 1:
             org = None
             model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}
+            result_key = f"{model}_{precision.value.name}"
         else:
             org = org_and_model[0]
             model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}
+            result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)

         still_on_hub, error, model_config = is_model_on_hub(
@@ -103,13 +103,6 @@ class EvalResult:

             if to_add is True:
                 multiplier = 100.0
-                if "GPU" in metric:
-                    results[benchmark][metric] = value
-                    continue
-                if "precision" in metric:
-                    results[benchmark][metric] = value
-                    continue
-
                 if "rouge" in metric and "truthful" not in benchmark:
                     multiplier = 1.0
                 if "squad" in benchmark:
@@ -118,17 +111,9 @@ class EvalResult:
                     multiplier = 1.0
                 if "throughput" in metric:
                     multiplier = 1.0
-                if "batch_" in metric or "Mem" in metric or "Util" in metric:
-                    multiplier = 1
-
                 # print('RESULTS', data['results'])
                 # print('XXX', benchmark, metric, value, multiplier)
-
-                results[benchmark][metric] = "-"
-                elif value == "auto":
-                    results[benchmark][metric] = "auto"
-                else:
-                    results[benchmark][metric] = value * multiplier
+                results[benchmark][metric] = value * multiplier

         res = EvalResult(
             eval_name=result_key,
@@ -140,7 +125,6 @@ class EvalResult:
             revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture,
-            model_type=ModelType.from_str(config.get("model_type", "")),
             inference_framework=inference_framework,
         )

@@ -175,22 +159,22 @@ class EvalResult:

         # breakpoint()
         # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
-
+
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
-
+            AutoEvalColumn.model_type.name: self.model_type.value.name,
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-
-
+            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.dummy.name: self.full_model,
-
-            #
-
-
-
-
+            AutoEvalColumn.revision.name: self.revision,
+            # AutoEvalColumn.average.name: average,
+            AutoEvalColumn.license.name: self.license,
+            AutoEvalColumn.likes.name: self.likes,
+            AutoEvalColumn.params.name: self.num_params,
+            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
             AutoEvalColumn.inference_framework.name: self.inference_framework,
         }

@@ -278,22 +262,15 @@ def get_raw_eval_results(results_path: str, requests_path: str, is_backend: bool

     eval_results = {}
     for model_result_filepath in tqdm(model_result_filepaths, desc="reading model_result_filepaths"):
-
-
-
-
-
-
-            eval_name
-
-
-            else:
-                eval_results[eval_name] = eval_result
-
-        except (FileNotFoundError, ValueError, KeyError, json.JSONDecodeError) as e:
-            # Log the error and continue with the next file
-            print(f"Error processing file {model_result_filepath}: {e}")
-            continue
+        # Creation of result
+        eval_result = EvalResult.init_from_json_file(model_result_filepath, is_backend=is_backend)
+        eval_result.update_with_request_file(requests_path)
+        # Store results of same eval together
+        eval_name = eval_result.eval_name
+        if eval_name in eval_results.keys():
+            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+        else:
+            eval_results[eval_name] = eval_result

     results = []
     for v in eval_results.values():
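For context, the merge added in `get_raw_eval_results` only lets a later result file for the same `eval_name` overwrite metrics that are actually present. A tiny illustration with made-up values:

```python
# Later files only fill in or replace metrics that are not None.
existing = {"mmlu": 0.62, "selfcheckgpt": None}
incoming = {"mmlu": None, "selfcheckgpt": 0.31}

existing.update({k: v for k, v in incoming.items() if v is not None})
print(existing)  # {'mmlu': 0.62, 'selfcheckgpt': 0.31}
```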
src/populate.py
CHANGED
@@ -12,7 +12,7 @@ from src.leaderboard.read_evals import get_raw_eval_results, EvalResult, update_

 from src.backend.envs import Tasks as BackendTasks
 from src.display.utils import Tasks
-from src.display.utils import
+from src.display.utils import E2Es, PREs, TS

 def get_leaderboard_df(
     results_path: str,
@@ -45,7 +45,12 @@ def get_leaderboard_df(
         bm = (task.benchmark, task.metric)
         name_to_bm_map[name] = bm

-
+    # bm_to_name_map = {bm: name for name, bm in name_to_bm_map.items()}
+    system_metrics_to_name_map = {
+        "end_to_end_time": f"{E2Es}",
+        "prefilling_time": f"{PREs}",
+        "decoding_throughput": f"{TS}",
+    }

     all_data_json = []
     for entry in all_data_json_:
@@ -58,9 +63,6 @@ def get_leaderboard_df(
             if sys_metric in entry[k]:
                 new_entry[f"{k} {metric_namne}"] = entry[k][sys_metric]

-            for gpu_metric, metric_namne in gpu_metrics_to_name_map.items():
-                if gpu_metric in entry[k]:
-                    new_entry[f"{k} {metric_namne}"] = entry[k][gpu_metric]
         all_data_json += [new_entry]

     # all_data_json.append(baseline_row)
src/submission/check_validity.py
CHANGED
@@ -74,7 +74,7 @@ def is_model_on_hub(


 def get_model_size(model_info: ModelInfo, precision: str):
-    size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
+    size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
     try:
         model_size = round(model_info.safetensors["total"] / 1e9, 3)
     except (AttributeError, TypeError):
@@ -130,8 +130,7 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
             continue
         with open(os.path.join(root, file), "r") as f:
             info = json.load(f)
-
-            file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}_{info['gpu_type']}")
+            file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}")

         # Select organisation
         if info["model"].count("/") == 0 or "submitted_time" not in info:
src/submission/submit.py
CHANGED
@@ -26,8 +26,7 @@ def add_new_eval(
     weight_type: str,
     model_type: str,
     inference_framework: str,
-    debug: bool = False
-    gpu_type: str = "NVIDIA-A100-PCIe-80GB",
+    debug: bool = False
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
@@ -115,18 +114,17 @@ def add_new_eval(
         "params": model_size,
         "license": license,
         "inference_framework": inference_framework,
-        "gpu_type": gpu_type
     }

     # Check for duplicate submission
-    if f"{model}_{revision}_{precision}_{inference_framework}
+    if f"{model}_{revision}_{precision}_{inference_framework}" in REQUESTED_MODELS:
         return styled_warning("This model has been already submitted.")

     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
     # out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}_{inference_framework}
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}_{inference_framework}.json"

     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
src/utils.py
CHANGED
@@ -1,56 +1,6 @@
 import pandas as pd
 from huggingface_hub import snapshot_download
-import subprocess
-import re
-import os
-import GPUtil

-try:
-    from src.display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
-except:
-    print("local debug: from display.utils")
-    from display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
-
-MEM_BW_DICT ={
-    "NVIDIA-A100-PCIe-80GB": 1935,
-    "NVIDIA-A100-SXM-80GB": 2039,
-    "NVIDIA-H100-PCIe-80GB": 2039,
-    "NVIDIA-RTX-A5000-24GB": 768
-}
-
-PEAK_FLOPS_DICT = {
-    "float32":{
-        "NVIDIA-A100-PCIe-80GB": 312e12,
-        "NVIDIA-A100-SXM-80GB": 312e12,
-        "NVIDIA-H100-PCIe-80GB": 756e12,
-        "NVIDIA-RTX-A5000-24GB": 222.2e12
-    },
-    "float16":{
-        "NVIDIA-A100-PCIe-80GB": 624e12,
-        "NVIDIA-A100-SXM-80GB": 624e12,
-        "NVIDIA-H100-PCIe-80GB": 1513e12,
-        "NVIDIA-RTX-A5000-24GB": 444.4e12
-    },
-    "bfloat16":{
-        "NVIDIA-A100-PCIe-80GB": 624e12,
-        "NVIDIA-A100-SXM-80GB": 624e12,
-        "NVIDIA-H100-PCIe-80GB": 1513e12,
-        "NVIDIA-RTX-A5000-24GB": 444.4e12
-    },
-    "8bit":{
-        "NVIDIA-A100-PCIe-80GB": 1248e12,
-        "NVIDIA-A100-SXM-80GB": 1248e12,
-        "NVIDIA-H100-PCIe-80GB": 3026e12,
-        "NVIDIA-RTX-A5000-24GB": 889e12
-    },
-    "4bit": {
-        "NVIDIA-A100-PCIe-80GB": 2496e12,
-        "NVIDIA-A100-SXM-80GB": 2496e12,
-        "NVIDIA-H100-PCIe-80GB": 6052e12,
-        "NVIDIA-RTX-A5000-24GB": 1778e12
-    }
-
-}

 def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
     for i in range(10):
@@ -82,130 +32,3 @@ def get_dataset_summary_table(file_path):
     df = df[["Category", "Benchmark", "Data Split", "Data Size", "Language"]]

     return df
-
-def parse_nvidia_smi():
-    visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
-    if visible_devices is not None:
-        gpu_indices = visible_devices.split(',')
-    else:
-        # Query all GPU indices if CUDA_VISIBLE_DEVICES is not set
-        result = subprocess.run(['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'], capture_output=True, text=True)
-        if result.returncode != 0:
-            print("Failed to query GPU indices.")
-            return []
-        gpu_indices = result.stdout.strip().split('\n')
-    # print(f"gpu_indices: {gpu_indices}")
-    gpu_stats = []
-
-    gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
-    # gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')
-    gpu_name_pattern = re.compile(r'NVIDIA\s+(RTX\s+)?([A-Z0-9]+)')
-
-    gpu_name = ""
-    for index in gpu_indices:
-        result = subprocess.run(['nvidia-smi', '-i', index], capture_output=True, text=True)
-        output = result.stdout.strip()
-        lines = output.split("\n")
-        for line in lines:
-            match = gpu_info_pattern.search(line)
-            name_match = gpu_name_pattern.search(line)
-            gpu_info = {}
-            if name_match:
-                gpu_name = ''.join(filter(None, name_match.groups())).strip()
-            if match:
-                temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
-                gpu_info.update({
-                    GPU_TEMP: temp,
-                    GPU_Power: power_usage,
-                    GPU_Mem: round(mem_usage / 1024, 2),
-                    GPU_Util: gpu_util
-                })
-
-            if len(gpu_info) >= 4:
-                gpu_stats.append(gpu_info)
-    # print(f"gpu_stats: {gpu_stats}")
-    gpu_name = f"{len(gpu_stats)}x{gpu_name}"
-    gpu_stats_total = {
-        GPU_TEMP: 0,
-        GPU_Power: 0,
-        GPU_Mem: 0,
-        GPU_Util: 0,
-        GPU_Name: gpu_name
-    }
-    for gpu_stat in gpu_stats:
-        gpu_stats_total[GPU_TEMP] += gpu_stat[GPU_TEMP]
-        gpu_stats_total[GPU_Power] += gpu_stat[GPU_Power]
-        gpu_stats_total[GPU_Mem] += gpu_stat[GPU_Mem]
-        gpu_stats_total[GPU_Util] += gpu_stat[GPU_Util]
-    gpu_stats_total[GPU_Mem] = gpu_stats_total[GPU_Mem] # G
-    gpu_stats_total[GPU_TEMP] /= len(gpu_stats)
-    gpu_stats_total[GPU_Power] /= len(gpu_stats)
-    gpu_stats_total[GPU_Util] /= len(gpu_stats)
-    return [gpu_stats_total]
-
-def monitor_gpus(stop_event, interval, stats_list):
-    while not stop_event.is_set():
-        gpu_stats = parse_nvidia_smi()
-        if gpu_stats:
-            stats_list.extend(gpu_stats)
-        stop_event.wait(interval)
-
-def analyze_gpu_stats(stats_list):
-    # Check if the stats_list is empty, and return None if it is
-    if not stats_list:
-        return None
-
-    # Initialize dictionaries to store the stats
-    avg_stats = {}
-    max_stats = {}
-
-    # Calculate average stats, excluding 'GPU_Mem'
-    for key in stats_list[0].keys():
-        if key != GPU_Mem and key != GPU_Name:
-            total = sum(d[key] for d in stats_list)
-            avg_stats[key] = total / len(stats_list)
-
-    # Calculate max stats for 'GPU_Mem'
-    max_stats[GPU_Mem] = max(d[GPU_Mem] for d in stats_list)
-    if GPU_Name in stats_list[0]:
-        avg_stats[GPU_Name] = stats_list[0][GPU_Name]
-    # Update average stats with max GPU memory usage
-    avg_stats.update(max_stats)
-
-    return avg_stats
-
-def get_gpu_details():
-    gpus = GPUtil.getGPUs()
-    gpu = gpus[0]
-    name = gpu.name.replace(" ", "-")
-    memory_gb = round(gpu.memoryTotal / 1024)
-    memory = f"{memory_gb}GB"
-
-    for part in name.split('-'):
-        if part.endswith("GB") and part[:-2].isdigit():
-            name = name.replace(f"-{part}", "").replace(part, "")
-
-    formatted_name = f"{name}-{memory}"
-
-    return formatted_name
-
-def get_peak_bw(gpu_name):
-    return MEM_BW_DICT[gpu_name]
-
-def get_peak_flops(gpu_name, precision):
-    return PEAK_FLOPS_DICT[precision][gpu_name]
-
-def transfer_precision2bytes(precision):
-    if precision == "float32":
-        return 4
-    elif precision in ["float16", "bfloat16"]:
-        return 2
-    elif precision == "8bit":
-        return 1
-    elif precision == "4bit":
-        return 0.5
-    else:
-        raise ValueError(f"Unsupported precision: {precision}")
-
-if __name__ == "__main__":
-    print(analyze_gpu_stats(parse_nvidia_smi()))
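For context, the deleted `MEM_BW_DICT`, `PEAK_FLOPS_DICT`, `get_peak_bw`, `get_peak_flops`, and `transfer_precision2bytes` fed the S-MBU/S-MFU columns that this PR removes. As background only, a generic utilization estimate from measured decoding throughput looks roughly like the sketch below; the sparse-MoE-specific S-MBU/S-MFU formulas lived in the backend and are not reproduced here. The example inputs are invented, except the A100-PCIe peak numbers, which come from the deleted tables.

```python
# Rough, generic bandwidth/FLOPs utilization estimates (not the repo's S-MBU/S-MFU).
def estimate_mbu(tokens_per_s: float, bytes_per_token: float, peak_bw_bytes_per_s: float) -> float:
    """Fraction of peak memory bandwidth used while decoding."""
    return tokens_per_s * bytes_per_token / peak_bw_bytes_per_s

def estimate_mfu(tokens_per_s: float, flops_per_token: float, peak_flops_per_s: float) -> float:
    """Fraction of peak FLOPs used while decoding."""
    return tokens_per_s * flops_per_token / peak_flops_per_s

# Hypothetical dense 13B-parameter model in float16 on an A100-PCIe-80GB
# (peaks from the deleted tables: 1935 GB/s and 624e12 FLOP/s).
print(estimate_mbu(tokens_per_s=50, bytes_per_token=13e9 * 2, peak_bw_bytes_per_s=1935e9))
print(estimate_mfu(tokens_per_s=50, flops_per_token=2 * 13e9, peak_flops_per_s=624e12))
```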