Add GSM8k dataset
#29
by
AppleSwing
- opened
- LICENSE +0 -201
- README.md +1 -1
- app.py +17 -20
- backend-cli.py +42 -71
- requirements.txt +3 -6
- src/backend/envs.py +0 -3
- src/backend/hflm_with_measurement.py +14 -163
- src/backend/moe_infinity.py +4 -9
- src/backend/run_eval_suite.py +0 -8
- src/backend/tasks/arena_hard/__init__.py +0 -0
- src/backend/tasks/arena_hard/arena_hard.yaml +0 -2
- src/backend/tasks/arena_hard/arena_judgment.py +0 -256
- src/backend/tasks/arena_hard/arena_utils.py +0 -349
- src/backend/tasks/arena_hard/configs/api_config.yaml +0 -17
- src/backend/tasks/arena_hard/configs/judge_config.yaml +0 -26
- src/backend/tasks/arena_hard/model_answer/gpt-4-0314.jsonl +0 -0
- src/backend/tasks/arena_hard/question.jsonl +0 -0
- src/backend/tasks/arena_hard/task.py +0 -220
- src/backend/tasks/gsm8k/gsm8k-custom.yaml +0 -47
- src/backend/tasks/measurement_task_utils.py +0 -9
- src/backend/tasks/selfcheckgpt/task.py +2 -2
- src/display/about.py +1 -4
- src/display/utils.py +42 -57
- src/leaderboard/read_evals.py +24 -36
- src/submission/check_validity.py +2 -3
- src/utils.py +4 -80
LICENSE
DELETED
@@ -1,201 +0,0 @@
|
|
1 |
-
Apache License
|
2 |
-
Version 2.0, January 2004
|
3 |
-
http://www.apache.org/licenses/
|
4 |
-
|
5 |
-
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
6 |
-
|
7 |
-
1. Definitions.
|
8 |
-
|
9 |
-
"License" shall mean the terms and conditions for use, reproduction,
|
10 |
-
and distribution as defined by Sections 1 through 9 of this document.
|
11 |
-
|
12 |
-
"Licensor" shall mean the copyright owner or entity authorized by
|
13 |
-
the copyright owner that is granting the License.
|
14 |
-
|
15 |
-
"Legal Entity" shall mean the union of the acting entity and all
|
16 |
-
other entities that control, are controlled by, or are under common
|
17 |
-
control with that entity. For the purposes of this definition,
|
18 |
-
"control" means (i) the power, direct or indirect, to cause the
|
19 |
-
direction or management of such entity, whether by contract or
|
20 |
-
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
21 |
-
outstanding shares, or (iii) beneficial ownership of such entity.
|
22 |
-
|
23 |
-
"You" (or "Your") shall mean an individual or Legal Entity
|
24 |
-
exercising permissions granted by this License.
|
25 |
-
|
26 |
-
"Source" form shall mean the preferred form for making modifications,
|
27 |
-
including but not limited to software source code, documentation
|
28 |
-
source, and configuration files.
|
29 |
-
|
30 |
-
"Object" form shall mean any form resulting from mechanical
|
31 |
-
transformation or translation of a Source form, including but
|
32 |
-
not limited to compiled object code, generated documentation,
|
33 |
-
and conversions to other media types.
|
34 |
-
|
35 |
-
"Work" shall mean the work of authorship, whether in Source or
|
36 |
-
Object form, made available under the License, as indicated by a
|
37 |
-
copyright notice that is included in or attached to the work
|
38 |
-
(an example is provided in the Appendix below).
|
39 |
-
|
40 |
-
"Derivative Works" shall mean any work, whether in Source or Object
|
41 |
-
form, that is based on (or derived from) the Work and for which the
|
42 |
-
editorial revisions, annotations, elaborations, or other modifications
|
43 |
-
represent, as a whole, an original work of authorship. For the purposes
|
44 |
-
of this License, Derivative Works shall not include works that remain
|
45 |
-
separable from, or merely link (or bind by name) to the interfaces of,
|
46 |
-
the Work and Derivative Works thereof.
|
47 |
-
|
48 |
-
"Contribution" shall mean any work of authorship, including
|
49 |
-
the original version of the Work and any modifications or additions
|
50 |
-
to that Work or Derivative Works thereof, that is intentionally
|
51 |
-
submitted to Licensor for inclusion in the Work by the copyright owner
|
52 |
-
or by an individual or Legal Entity authorized to submit on behalf of
|
53 |
-
the copyright owner. For the purposes of this definition, "submitted"
|
54 |
-
means any form of electronic, verbal, or written communication sent
|
55 |
-
to the Licensor or its representatives, including but not limited to
|
56 |
-
communication on electronic mailing lists, source code control systems,
|
57 |
-
and issue tracking systems that are managed by, or on behalf of, the
|
58 |
-
Licensor for the purpose of discussing and improving the Work, but
|
59 |
-
excluding communication that is conspicuously marked or otherwise
|
60 |
-
designated in writing by the copyright owner as "Not a Contribution."
|
61 |
-
|
62 |
-
"Contributor" shall mean Licensor and any individual or Legal Entity
|
63 |
-
on behalf of whom a Contribution has been received by Licensor and
|
64 |
-
subsequently incorporated within the Work.
|
65 |
-
|
66 |
-
2. Grant of Copyright License. Subject to the terms and conditions of
|
67 |
-
this License, each Contributor hereby grants to You a perpetual,
|
68 |
-
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
69 |
-
copyright license to reproduce, prepare Derivative Works of,
|
70 |
-
publicly display, publicly perform, sublicense, and distribute the
|
71 |
-
Work and such Derivative Works in Source or Object form.
|
72 |
-
|
73 |
-
3. Grant of Patent License. Subject to the terms and conditions of
|
74 |
-
this License, each Contributor hereby grants to You a perpetual,
|
75 |
-
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
76 |
-
(except as stated in this section) patent license to make, have made,
|
77 |
-
use, offer to sell, sell, import, and otherwise transfer the Work,
|
78 |
-
where such license applies only to those patent claims licensable
|
79 |
-
by such Contributor that are necessarily infringed by their
|
80 |
-
Contribution(s) alone or by combination of their Contribution(s)
|
81 |
-
with the Work to which such Contribution(s) was submitted. If You
|
82 |
-
institute patent litigation against any entity (including a
|
83 |
-
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
84 |
-
or a Contribution incorporated within the Work constitutes direct
|
85 |
-
or contributory patent infringement, then any patent licenses
|
86 |
-
granted to You under this License for that Work shall terminate
|
87 |
-
as of the date such litigation is filed.
|
88 |
-
|
89 |
-
4. Redistribution. You may reproduce and distribute copies of the
|
90 |
-
Work or Derivative Works thereof in any medium, with or without
|
91 |
-
modifications, and in Source or Object form, provided that You
|
92 |
-
meet the following conditions:
|
93 |
-
|
94 |
-
(a) You must give any other recipients of the Work or
|
95 |
-
Derivative Works a copy of this License; and
|
96 |
-
|
97 |
-
(b) You must cause any modified files to carry prominent notices
|
98 |
-
stating that You changed the files; and
|
99 |
-
|
100 |
-
(c) You must retain, in the Source form of any Derivative Works
|
101 |
-
that You distribute, all copyright, patent, trademark, and
|
102 |
-
attribution notices from the Source form of the Work,
|
103 |
-
excluding those notices that do not pertain to any part of
|
104 |
-
the Derivative Works; and
|
105 |
-
|
106 |
-
(d) If the Work includes a "NOTICE" text file as part of its
|
107 |
-
distribution, then any Derivative Works that You distribute must
|
108 |
-
include a readable copy of the attribution notices contained
|
109 |
-
within such NOTICE file, excluding those notices that do not
|
110 |
-
pertain to any part of the Derivative Works, in at least one
|
111 |
-
of the following places: within a NOTICE text file distributed
|
112 |
-
as part of the Derivative Works; within the Source form or
|
113 |
-
documentation, if provided along with the Derivative Works; or,
|
114 |
-
within a display generated by the Derivative Works, if and
|
115 |
-
wherever such third-party notices normally appear. The contents
|
116 |
-
of the NOTICE file are for informational purposes only and
|
117 |
-
do not modify the License. You may add Your own attribution
|
118 |
-
notices within Derivative Works that You distribute, alongside
|
119 |
-
or as an addendum to the NOTICE text from the Work, provided
|
120 |
-
that such additional attribution notices cannot be construed
|
121 |
-
as modifying the License.
|
122 |
-
|
123 |
-
You may add Your own copyright statement to Your modifications and
|
124 |
-
may provide additional or different license terms and conditions
|
125 |
-
for use, reproduction, or distribution of Your modifications, or
|
126 |
-
for any such Derivative Works as a whole, provided Your use,
|
127 |
-
reproduction, and distribution of the Work otherwise complies with
|
128 |
-
the conditions stated in this License.
|
129 |
-
|
130 |
-
5. Submission of Contributions. Unless You explicitly state otherwise,
|
131 |
-
any Contribution intentionally submitted for inclusion in the Work
|
132 |
-
by You to the Licensor shall be under the terms and conditions of
|
133 |
-
this License, without any additional terms or conditions.
|
134 |
-
Notwithstanding the above, nothing herein shall supersede or modify
|
135 |
-
the terms of any separate license agreement you may have executed
|
136 |
-
with Licensor regarding such Contributions.
|
137 |
-
|
138 |
-
6. Trademarks. This License does not grant permission to use the trade
|
139 |
-
names, trademarks, service marks, or product names of the Licensor,
|
140 |
-
except as required for reasonable and customary use in describing the
|
141 |
-
origin of the Work and reproducing the content of the NOTICE file.
|
142 |
-
|
143 |
-
7. Disclaimer of Warranty. Unless required by applicable law or
|
144 |
-
agreed to in writing, Licensor provides the Work (and each
|
145 |
-
Contributor provides its Contributions) on an "AS IS" BASIS,
|
146 |
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
147 |
-
implied, including, without limitation, any warranties or conditions
|
148 |
-
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
149 |
-
PARTICULAR PURPOSE. You are solely responsible for determining the
|
150 |
-
appropriateness of using or redistributing the Work and assume any
|
151 |
-
risks associated with Your exercise of permissions under this License.
|
152 |
-
|
153 |
-
8. Limitation of Liability. In no event and under no legal theory,
|
154 |
-
whether in tort (including negligence), contract, or otherwise,
|
155 |
-
unless required by applicable law (such as deliberate and grossly
|
156 |
-
negligent acts) or agreed to in writing, shall any Contributor be
|
157 |
-
liable to You for damages, including any direct, indirect, special,
|
158 |
-
incidental, or consequential damages of any character arising as a
|
159 |
-
result of this License or out of the use or inability to use the
|
160 |
-
Work (including but not limited to damages for loss of goodwill,
|
161 |
-
work stoppage, computer failure or malfunction, or any and all
|
162 |
-
other commercial damages or losses), even if such Contributor
|
163 |
-
has been advised of the possibility of such damages.
|
164 |
-
|
165 |
-
9. Accepting Warranty or Additional Liability. While redistributing
|
166 |
-
the Work or Derivative Works thereof, You may choose to offer,
|
167 |
-
and charge a fee for, acceptance of support, warranty, indemnity,
|
168 |
-
or other liability obligations and/or rights consistent with this
|
169 |
-
License. However, in accepting such obligations, You may act only
|
170 |
-
on Your own behalf and on Your sole responsibility, not on behalf
|
171 |
-
of any other Contributor, and only if You agree to indemnify,
|
172 |
-
defend, and hold each Contributor harmless for any liability
|
173 |
-
incurred by, or claims asserted against, such Contributor by reason
|
174 |
-
of your accepting any such warranty or additional liability.
|
175 |
-
|
176 |
-
END OF TERMS AND CONDITIONS
|
177 |
-
|
178 |
-
APPENDIX: How to apply the Apache License to your work.
|
179 |
-
|
180 |
-
To apply the Apache License to your work, attach the following
|
181 |
-
boilerplate notice, with the fields enclosed by brackets "[]"
|
182 |
-
replaced with your own identifying information. (Don't include
|
183 |
-
the brackets!) The text should be enclosed in the appropriate
|
184 |
-
comment syntax for the file format. We also recommend that a
|
185 |
-
file or class name and description of purpose be included on the
|
186 |
-
same "printed page" as the copyright notice for easier
|
187 |
-
identification within third-party archives.
|
188 |
-
|
189 |
-
Copyright [yyyy] [name of copyright owner]
|
190 |
-
|
191 |
-
Licensed under the Apache License, Version 2.0 (the "License");
|
192 |
-
you may not use this file except in compliance with the License.
|
193 |
-
You may obtain a copy of the License at
|
194 |
-
|
195 |
-
http://www.apache.org/licenses/LICENSE-2.0
|
196 |
-
|
197 |
-
Unless required by applicable law or agreed to in writing, software
|
198 |
-
distributed under the License is distributed on an "AS IS" BASIS,
|
199 |
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
200 |
-
See the License for the specific language governing permissions and
|
201 |
-
limitations under the License.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: 🔥
|
|
4 |
colorFrom: green
|
5 |
colorTo: indigo
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 4.
|
8 |
app_file: app.py
|
9 |
pinned: true
|
10 |
license: apache-2.0
|
|
|
4 |
colorFrom: green
|
5 |
colorTo: indigo
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 4.9.0
|
8 |
app_file: app.py
|
9 |
pinned: true
|
10 |
license: apache-2.0
|
app.py
CHANGED
@@ -11,7 +11,6 @@ import time
|
|
11 |
from apscheduler.schedulers.background import BackgroundScheduler
|
12 |
|
13 |
from huggingface_hub import snapshot_download
|
14 |
-
from pytz import utc
|
15 |
|
16 |
from src.display.about import (
|
17 |
CITATION_BUTTON_LABEL,
|
@@ -76,7 +75,7 @@ def restart_space():
|
|
76 |
|
77 |
|
78 |
def init_space():
|
79 |
-
|
80 |
|
81 |
if socket.gethostname() not in {"neuromancer"}:
|
82 |
# sync model_type with open-llm-leaderboard
|
@@ -91,8 +90,7 @@ def init_space():
|
|
91 |
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
|
92 |
EVAL_REQUESTS_PATH, EVAL_COLS
|
93 |
)
|
94 |
-
|
95 |
-
return None, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
|
96 |
|
97 |
|
98 |
def add_benchmark_columns(shown_columns):
|
@@ -160,7 +158,6 @@ def filter_models(df: pd.DataFrame, type_query: list, size_query: list, precisio
|
|
160 |
type_emoji = [t[0] for t in type_query]
|
161 |
filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
|
162 |
filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
|
163 |
-
filtered_df = filtered_df.loc[df[AutoEvalColumn.inference_framework.name].isin(size_query)]
|
164 |
|
165 |
# numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
|
166 |
# params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
|
@@ -259,7 +256,7 @@ with demo:
|
|
259 |
for c in fields(AutoEvalColumn)
|
260 |
if c.displayed_by_default and not c.hidden and not c.never_hidden
|
261 |
],
|
262 |
-
label="
|
263 |
elem_id="column-select",
|
264 |
interactive=True,
|
265 |
)
|
@@ -356,21 +353,21 @@ with demo:
|
|
356 |
queue=True,
|
357 |
)
|
358 |
|
359 |
-
|
360 |
-
|
361 |
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
|
372 |
-
|
373 |
-
|
374 |
|
375 |
with gr.TabItem("Submit a model ", elem_id="llm-benchmark-tab-table", id=3):
|
376 |
with gr.Column():
|
@@ -479,7 +476,7 @@ with demo:
|
|
479 |
show_copy_button=True,
|
480 |
)
|
481 |
|
482 |
-
scheduler = BackgroundScheduler(
|
483 |
|
484 |
scheduler.add_job(restart_space, "interval", hours=6)
|
485 |
|
|
|
11 |
from apscheduler.schedulers.background import BackgroundScheduler
|
12 |
|
13 |
from huggingface_hub import snapshot_download
|
|
|
14 |
|
15 |
from src.display.about import (
|
16 |
CITATION_BUTTON_LABEL,
|
|
|
75 |
|
76 |
|
77 |
def init_space():
|
78 |
+
dataset_df = get_dataset_summary_table(file_path="blog/Hallucination-Leaderboard-Summary.csv")
|
79 |
|
80 |
if socket.gethostname() not in {"neuromancer"}:
|
81 |
# sync model_type with open-llm-leaderboard
|
|
|
90 |
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
|
91 |
EVAL_REQUESTS_PATH, EVAL_COLS
|
92 |
)
|
93 |
+
return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
|
|
|
94 |
|
95 |
|
96 |
def add_benchmark_columns(shown_columns):
|
|
|
158 |
type_emoji = [t[0] for t in type_query]
|
159 |
filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
|
160 |
filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
|
|
|
161 |
|
162 |
# numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
|
163 |
# params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
|
|
|
256 |
for c in fields(AutoEvalColumn)
|
257 |
if c.displayed_by_default and not c.hidden and not c.never_hidden
|
258 |
],
|
259 |
+
label="Select columns to show",
|
260 |
elem_id="column-select",
|
261 |
interactive=True,
|
262 |
)
|
|
|
353 |
queue=True,
|
354 |
)
|
355 |
|
356 |
+
with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
|
357 |
+
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
358 |
|
359 |
+
dataset_table = gr.components.Dataframe(
|
360 |
+
value=dataset_df,
|
361 |
+
headers=list(dataset_df.columns),
|
362 |
+
datatype=["str", "markdown", "str", "str", "str"],
|
363 |
+
elem_id="dataset-table",
|
364 |
+
interactive=False,
|
365 |
+
visible=True,
|
366 |
+
column_widths=["15%", "20%"],
|
367 |
+
)
|
368 |
|
369 |
+
gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
|
370 |
+
gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
|
371 |
|
372 |
with gr.TabItem("Submit a model ", elem_id="llm-benchmark-tab-table", id=3):
|
373 |
with gr.Column():
|
|
|
476 |
show_copy_button=True,
|
477 |
)
|
478 |
|
479 |
+
scheduler = BackgroundScheduler()
|
480 |
|
481 |
scheduler.add_job(restart_space, "interval", hours=6)
|
482 |
|
backend-cli.py
CHANGED
@@ -17,7 +17,7 @@ from src.backend.manage_requests import EvalRequest
|
|
17 |
from src.leaderboard.read_evals import EvalResult
|
18 |
|
19 |
from src.envs import QUEUE_REPO, RESULTS_REPO, API, DEBUG_QUEUE_REPO, DEBUG_RESULTS_REPO
|
20 |
-
from src.utils import my_snapshot_download, analyze_gpu_stats, parse_nvidia_smi, monitor_gpus
|
21 |
|
22 |
from src.leaderboard.read_evals import get_raw_eval_results
|
23 |
|
@@ -28,8 +28,6 @@ import time
|
|
28 |
import pprint
|
29 |
import logging
|
30 |
|
31 |
-
from lm_eval.filters.extraction import RegexFilter
|
32 |
-
|
33 |
|
34 |
# Configure the root logger
|
35 |
logging.basicConfig(
|
@@ -44,20 +42,6 @@ eval_logger = logging.getLogger("lm-eval")
|
|
44 |
# Explicitly set the level for 'lm-eval' logger to WARNING
|
45 |
eval_logger.setLevel(logging.WARNING)
|
46 |
|
47 |
-
def tuple_input_decorator(func):
|
48 |
-
def wrapper(self, resps, docs):
|
49 |
-
stripped_resps = [[resp_data[0] for resp_data in group] for group in resps]
|
50 |
-
|
51 |
-
filtered_resps = func(self, stripped_resps, docs)
|
52 |
-
|
53 |
-
combined_resps = []
|
54 |
-
for original_group, new_group in zip(resps, filtered_resps):
|
55 |
-
combined_group = [(new_resp,) + rest_of_data[1:] for new_resp, rest_of_data in zip(new_group, original_group)]
|
56 |
-
combined_resps.append(combined_group)
|
57 |
-
|
58 |
-
return combined_resps
|
59 |
-
return wrapper
|
60 |
-
|
61 |
|
62 |
def my_set_eval_request(api, eval_request, set_to_status, hf_repo, local_dir):
|
63 |
for i in range(10):
|
@@ -142,6 +126,9 @@ def request_to_result_name(request: EvalRequest) -> str:
|
|
142 |
def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[int] = None) -> dict:
|
143 |
batch_size = 1
|
144 |
batch_size = eval_request.batch_size
|
|
|
|
|
|
|
145 |
|
146 |
init_gpu_info = analyze_gpu_stats(parse_nvidia_smi())
|
147 |
# if init_gpu_info['Mem(M)'] > 500:
|
@@ -150,12 +137,6 @@ def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[in
|
|
150 |
stop_event = threading.Event()
|
151 |
monitor_thread = threading.Thread(target=monitor_gpus, args=(stop_event, 5, gpu_stats_list))
|
152 |
monitor_thread.start()
|
153 |
-
|
154 |
-
original_apply = RegexFilter.apply
|
155 |
-
if task.benchmark in ["gsm8k", "gsm8k_cot", "gsm8k_cot_self_consistency", "gsm8k_custom"]:
|
156 |
-
RegexFilter.apply = tuple_input_decorator(RegexFilter.apply)
|
157 |
-
else:
|
158 |
-
RegexFilter.apply = original_apply
|
159 |
|
160 |
try:
|
161 |
results = run_evaluation(
|
@@ -217,8 +198,6 @@ def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[in
|
|
217 |
repo_id=RESULTS_REPO,
|
218 |
repo_type="dataset",
|
219 |
)
|
220 |
-
|
221 |
-
RegexFilter.apply = original_apply
|
222 |
return results
|
223 |
|
224 |
|
@@ -387,7 +366,21 @@ def maybe_refresh_results(thr: int, hard_task_lst: Optional[list[str]] = None) -
|
|
387 |
|
388 |
return False
|
389 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
390 |
def process_pending_requests() -> bool:
|
|
|
|
|
|
|
391 |
sanity_checks()
|
392 |
print("Processing pending requests")
|
393 |
current_pending_status = [PENDING_STATUS]
|
@@ -450,15 +443,13 @@ def get_args():
|
|
450 |
parser = argparse.ArgumentParser(description="Run the backend")
|
451 |
parser.add_argument("--debug", action="store_true", help="Run in debug mode")
|
452 |
# debug parameters
|
453 |
-
parser.add_argument("--task", type=str, default="selfcheckgpt,mmlu
|
454 |
parser.add_argument("--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1,mistralai/Mixtral-8x7B-v0.1", help="Model to debug")
|
455 |
parser.add_argument("--precision", type=str, default="float32,float16,8bit,4bit", help="Precision to debug")
|
456 |
parser.add_argument("--inference-framework", type=str, default="hf-chat", help="Inference framework to debug")
|
457 |
parser.add_argument("--limit", type=int, default=None, help="Limit for the number of samples")
|
458 |
parser.add_argument("--gpu-type", type=str, default="NVIDIA-A100-PCIe-80GB",
|
459 |
help="GPU type. NVIDIA-A100-PCIe-80GB; NVIDIA-RTX-A5000-24GB; NVIDIA-H100-PCIe-80GB")
|
460 |
-
parser.add_argument("--debug_repo", action="store_true", help="Use debug repo")
|
461 |
-
parser.add_argument("--model_type", type=str, default="chat", help="Model type")
|
462 |
return parser.parse_args()
|
463 |
|
464 |
|
@@ -466,7 +457,7 @@ if __name__ == "__main__":
|
|
466 |
args = get_args()
|
467 |
local_debug = args.debug
|
468 |
# debug specific task by ping
|
469 |
-
if local_debug
|
470 |
# debug_model_names = [args.model] # Use model from arguments
|
471 |
# debug_task_name = [args.task] # Use task from arguments
|
472 |
debug_model_names = args.model.split(",")
|
@@ -474,68 +465,48 @@ if __name__ == "__main__":
|
|
474 |
precisions = args.precision.split(",")
|
475 |
print(f"debug_model_names: {debug_model_names}, debug_task_name: {debug_task_name}, precisions: {precisions}")
|
476 |
task_lst = TASKS_HARNESS.copy()
|
477 |
-
RESULTS_REPO = DEBUG_RESULTS_REPO
|
478 |
for precision in precisions:
|
479 |
for debug_model_name in debug_model_names:
|
480 |
for task in task_lst:
|
481 |
task_name = task.benchmark
|
482 |
if task_name not in debug_task_name:
|
483 |
continue
|
484 |
-
|
485 |
-
|
486 |
-
|
487 |
-
|
488 |
-
|
489 |
-
|
490 |
-
|
491 |
-
|
492 |
-
|
493 |
-
|
494 |
-
|
495 |
-
|
496 |
-
|
497 |
-
|
498 |
-
|
499 |
-
|
500 |
-
|
501 |
-
|
502 |
-
elif local_debug and args.debug_repo:
|
503 |
-
QUEUE_REPO = DEBUG_QUEUE_REPO
|
504 |
-
RESULTS_REPO = DEBUG_RESULTS_REPO
|
505 |
while True:
|
506 |
res = False
|
|
|
507 |
# if random.randint(0, 10) == 0:
|
508 |
res = process_pending_requests()
|
509 |
print(f"waiting for 60 seconds")
|
510 |
time.sleep(60)
|
|
|
511 |
# if res is False:
|
512 |
# if random.randint(0, 5) == 0:
|
513 |
# res = maybe_refresh_results(100)
|
514 |
# else:
|
515 |
# res = process_finished_requests(100)
|
|
|
516 |
# time.sleep(60)
|
|
|
517 |
# if res is False:
|
518 |
# if random.randint(0, 5) == 0:
|
519 |
# res = maybe_refresh_results(0)
|
520 |
# else:
|
521 |
# res = process_finished_requests(0)
|
522 |
-
elif not local_debug and not args.debug_repo:
|
523 |
-
while True:
|
524 |
-
res = False
|
525 |
-
# if random.randint(0, 10) == 0:
|
526 |
-
res = process_pending_requests()
|
527 |
-
print(f"waiting for 60 seconds")
|
528 |
-
time.sleep(60)
|
529 |
-
# if res is False:
|
530 |
-
# if random.randint(0, 5) == 0:
|
531 |
-
# res = maybe_refresh_results(100)
|
532 |
-
# else:
|
533 |
-
# res = process_finished_requests(100)
|
534 |
-
# time.sleep(60)
|
535 |
-
# if res is False:
|
536 |
-
# if random.randint(0, 5) == 0:
|
537 |
-
# res = maybe_refresh_results(0)
|
538 |
-
# else:
|
539 |
-
# res = process_finished_requests(0)
|
540 |
-
else:
|
541 |
-
raise Exception("Cannot use debug_repo without local debug flag")
|
|
|
17 |
from src.leaderboard.read_evals import EvalResult
|
18 |
|
19 |
from src.envs import QUEUE_REPO, RESULTS_REPO, API, DEBUG_QUEUE_REPO, DEBUG_RESULTS_REPO
|
20 |
+
from src.utils import my_snapshot_download, analyze_gpu_stats, parse_nvidia_smi, monitor_gpus
|
21 |
|
22 |
from src.leaderboard.read_evals import get_raw_eval_results
|
23 |
|
|
|
28 |
import pprint
|
29 |
import logging
|
30 |
|
|
|
|
|
31 |
|
32 |
# Configure the root logger
|
33 |
logging.basicConfig(
|
|
|
42 |
# Explicitly set the level for 'lm-eval' logger to WARNING
|
43 |
eval_logger.setLevel(logging.WARNING)
|
44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
def my_set_eval_request(api, eval_request, set_to_status, hf_repo, local_dir):
|
47 |
for i in range(10):
|
|
|
126 |
def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[int] = None) -> dict:
|
127 |
batch_size = 1
|
128 |
batch_size = eval_request.batch_size
|
129 |
+
|
130 |
+
if args.debug:
|
131 |
+
RESULTS_REPO = DEBUG_RESULTS_REPO
|
132 |
|
133 |
init_gpu_info = analyze_gpu_stats(parse_nvidia_smi())
|
134 |
# if init_gpu_info['Mem(M)'] > 500:
|
|
|
137 |
stop_event = threading.Event()
|
138 |
monitor_thread = threading.Thread(target=monitor_gpus, args=(stop_event, 5, gpu_stats_list))
|
139 |
monitor_thread.start()
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
|
141 |
try:
|
142 |
results = run_evaluation(
|
|
|
198 |
repo_id=RESULTS_REPO,
|
199 |
repo_type="dataset",
|
200 |
)
|
|
|
|
|
201 |
return results
|
202 |
|
203 |
|
|
|
366 |
|
367 |
return False
|
368 |
|
369 |
+
|
370 |
+
def get_gpu_details():
|
371 |
+
gpus = GPUtil.getGPUs()
|
372 |
+
gpu = gpus[0]
|
373 |
+
name = gpu.name.replace(" ", "-")
|
374 |
+
# Convert memory from MB to GB and round to nearest whole number
|
375 |
+
memory_gb = round(gpu.memoryTotal / 1024)
|
376 |
+
memory = f"{memory_gb}GB"
|
377 |
+
formatted_name = f"{name}-{memory}"
|
378 |
+
return formatted_name
|
379 |
+
|
380 |
def process_pending_requests() -> bool:
|
381 |
+
if args.debug:
|
382 |
+
QUEUE_REPO = DEBUG_QUEUE_REPO
|
383 |
+
|
384 |
sanity_checks()
|
385 |
print("Processing pending requests")
|
386 |
current_pending_status = [PENDING_STATUS]
|
|
|
443 |
parser = argparse.ArgumentParser(description="Run the backend")
|
444 |
parser.add_argument("--debug", action="store_true", help="Run in debug mode")
|
445 |
# debug parameters
|
446 |
+
parser.add_argument("--task", type=str, default="selfcheckgpt,mmlu", help="Task to debug")
|
447 |
parser.add_argument("--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1,mistralai/Mixtral-8x7B-v0.1", help="Model to debug")
|
448 |
parser.add_argument("--precision", type=str, default="float32,float16,8bit,4bit", help="Precision to debug")
|
449 |
parser.add_argument("--inference-framework", type=str, default="hf-chat", help="Inference framework to debug")
|
450 |
parser.add_argument("--limit", type=int, default=None, help="Limit for the number of samples")
|
451 |
parser.add_argument("--gpu-type", type=str, default="NVIDIA-A100-PCIe-80GB",
|
452 |
help="GPU type. NVIDIA-A100-PCIe-80GB; NVIDIA-RTX-A5000-24GB; NVIDIA-H100-PCIe-80GB")
|
|
|
|
|
453 |
return parser.parse_args()
|
454 |
|
455 |
|
|
|
457 |
args = get_args()
|
458 |
local_debug = args.debug
|
459 |
# debug specific task by ping
|
460 |
+
if local_debug:
|
461 |
# debug_model_names = [args.model] # Use model from arguments
|
462 |
# debug_task_name = [args.task] # Use task from arguments
|
463 |
debug_model_names = args.model.split(",")
|
|
|
465 |
precisions = args.precision.split(",")
|
466 |
print(f"debug_model_names: {debug_model_names}, debug_task_name: {debug_task_name}, precisions: {precisions}")
|
467 |
task_lst = TASKS_HARNESS.copy()
|
|
|
468 |
for precision in precisions:
|
469 |
for debug_model_name in debug_model_names:
|
470 |
for task in task_lst:
|
471 |
task_name = task.benchmark
|
472 |
if task_name not in debug_task_name:
|
473 |
continue
|
474 |
+
try:
|
475 |
+
eval_request = EvalRequest(
|
476 |
+
model=debug_model_name,
|
477 |
+
private=False,
|
478 |
+
status="",
|
479 |
+
json_filepath="",
|
480 |
+
precision=precision, # Use precision from arguments
|
481 |
+
inference_framework=args.inference_framework, # Use inference framework from arguments
|
482 |
+
gpu_type=args.gpu_type
|
483 |
+
)
|
484 |
+
curr_gpu_type = get_gpu_details()
|
485 |
+
if eval_request.gpu_type != curr_gpu_type:
|
486 |
+
print(f"GPU type mismatch: {eval_request.gpu_type} vs {curr_gpu_type}")
|
487 |
+
raise Exception("GPU type mismatch")
|
488 |
+
results = process_evaluation(task, eval_request, limit=args.limit)
|
489 |
+
except Exception as e:
|
490 |
+
print(f"debug running error: {e}")
|
491 |
+
else:
|
|
|
|
|
|
|
492 |
while True:
|
493 |
res = False
|
494 |
+
|
495 |
# if random.randint(0, 10) == 0:
|
496 |
res = process_pending_requests()
|
497 |
print(f"waiting for 60 seconds")
|
498 |
time.sleep(60)
|
499 |
+
|
500 |
# if res is False:
|
501 |
# if random.randint(0, 5) == 0:
|
502 |
# res = maybe_refresh_results(100)
|
503 |
# else:
|
504 |
# res = process_finished_requests(100)
|
505 |
+
|
506 |
# time.sleep(60)
|
507 |
+
|
508 |
# if res is False:
|
509 |
# if random.randint(0, 5) == 0:
|
510 |
# res = maybe_refresh_results(0)
|
511 |
# else:
|
512 |
# res = process_finished_requests(0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -4,7 +4,7 @@ APScheduler
|
|
4 |
black
|
5 |
click
|
6 |
datasets
|
7 |
-
gradio
|
8 |
gradio_client
|
9 |
huggingface-hub
|
10 |
matplotlib
|
@@ -16,7 +16,7 @@ requests
|
|
16 |
semantic-version
|
17 |
tqdm
|
18 |
wandb
|
19 |
-
transformers
|
20 |
tokenizers>=0.15.0
|
21 |
lm_eval[ifeval] @ git+https://github.com/EleutherAI/[email protected]
|
22 |
accelerate
|
@@ -30,7 +30,4 @@ evaluate
|
|
30 |
spacy==3.7.4
|
31 |
selfcheckgpt
|
32 |
immutabledict
|
33 |
-
gputil
|
34 |
-
bitsandbytes
|
35 |
-
openai
|
36 |
-
scikit-learn
|
|
|
4 |
black
|
5 |
click
|
6 |
datasets
|
7 |
+
gradio
|
8 |
gradio_client
|
9 |
huggingface-hub
|
10 |
matplotlib
|
|
|
16 |
semantic-version
|
17 |
tqdm
|
18 |
wandb
|
19 |
+
transformers>=4.36.0
|
20 |
tokenizers>=0.15.0
|
21 |
lm_eval[ifeval] @ git+https://github.com/EleutherAI/[email protected]
|
22 |
accelerate
|
|
|
30 |
spacy==3.7.4
|
31 |
selfcheckgpt
|
32 |
immutabledict
|
33 |
+
gputil
|
|
|
|
|
|
src/backend/envs.py
CHANGED
@@ -57,9 +57,6 @@ class Tasks(Enum):
|
|
57 |
|
58 |
# task20 = Task("race", "acc", "RACE", 0)
|
59 |
task21 = Task("mmlu", "acc", "MMLU", 5)
|
60 |
-
task22 = Task("gsm8k_custom", "em", "GSM8K", 5)
|
61 |
-
# task23 = Task("gsm8k_cot", "em", "GSM8K", 8)
|
62 |
-
task24 = Task("arena_hard", "score", "Arena Hard", 0)
|
63 |
|
64 |
|
65 |
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
|
|
|
57 |
|
58 |
# task20 = Task("race", "acc", "RACE", 0)
|
59 |
task21 = Task("mmlu", "acc", "MMLU", 5)
|
|
|
|
|
|
|
60 |
|
61 |
|
62 |
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
|
src/backend/hflm_with_measurement.py
CHANGED
@@ -24,7 +24,7 @@ from transformers.models.auto.modeling_auto import (
|
|
24 |
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
|
25 |
)
|
26 |
from transformers import TextStreamer
|
27 |
-
|
28 |
from lm_eval import utils
|
29 |
from lm_eval.api.instance import Instance
|
30 |
from lm_eval.api.model import TemplateLM
|
@@ -37,9 +37,6 @@ from lm_eval.models.utils import (
|
|
37 |
stop_sequences_criteria,
|
38 |
)
|
39 |
from lm_eval.models.huggingface import HFLM
|
40 |
-
from src.utils import get_gpu_details, get_peak_bw, transfer_precision2bytes, get_peak_flops
|
41 |
-
from src.submission.check_validity import get_model_size
|
42 |
-
from src.envs import API
|
43 |
|
44 |
|
45 |
class StopWatch(TextStreamer):
|
@@ -70,21 +67,6 @@ class StopWatch(TextStreamer):
|
|
70 |
class HFLMWithMeasurement(HFLM):
|
71 |
def __init__(self, **kwargs):
|
72 |
super().__init__(**kwargs)
|
73 |
-
self.pretrained = kwargs.get("pretrained", None)
|
74 |
-
self.revision = kwargs.get("revision", None)
|
75 |
-
self.precision = kwargs.get("dtype", None)
|
76 |
-
self.num_gpus = None
|
77 |
-
|
78 |
-
def _detect_num_gpus_used(self):
|
79 |
-
if self.num_gpus is not None:
|
80 |
-
return self.num_gpus
|
81 |
-
gpus = []
|
82 |
-
for p in self.model.parameters():
|
83 |
-
if p.device.type == "cuda":
|
84 |
-
gpus.append(p.device.index)
|
85 |
-
|
86 |
-
self.num_gpus = len(set(gpus))
|
87 |
-
return self.num_gpus
|
88 |
|
89 |
def _loglikelihood_tokens(
|
90 |
self,
|
@@ -297,7 +279,7 @@ class HFLMWithMeasurement(HFLM):
|
|
297 |
# Answer: (log prob, is-exact-match)
|
298 |
answer = (float(logits.sum()), bool(max_equal))
|
299 |
|
300 |
-
res.append((answer, per_sample_time, 0, 0
|
301 |
|
302 |
self.cache_hook.add_partial("loglikelihood", request_str, answer)
|
303 |
pbar.update(1)
|
@@ -306,15 +288,13 @@ class HFLMWithMeasurement(HFLM):
|
|
306 |
|
307 |
return re_ord.get_original(res)
|
308 |
|
309 |
-
def _model_generate(self, context,
|
310 |
# temperature = 0.0 if not set
|
311 |
# if do_sample is false and temp==0.0:
|
312 |
# remove temperature, as do_sample=False takes care of this
|
313 |
# and we don't want a warning from HF
|
314 |
generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
|
315 |
do_sample = generation_kwargs.get("do_sample", None)
|
316 |
-
|
317 |
-
# is_gsm8k = generation_kwargs.get("is_gsm8k", False)
|
318 |
|
319 |
# The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
|
320 |
if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
|
@@ -322,52 +302,7 @@ class HFLMWithMeasurement(HFLM):
|
|
322 |
|
323 |
if do_sample is False and generation_kwargs.get("temperature") == 0.0:
|
324 |
generation_kwargs.pop("temperature")
|
325 |
-
|
326 |
-
# if is_gsm8k:
|
327 |
-
# generation_kwargs.pop("is_gsm8k")
|
328 |
-
|
329 |
-
context_length = context.shape[1]
|
330 |
-
|
331 |
-
if self.model.__class__.__name__ == "MoE":
|
332 |
-
model_config = self.model.model.config
|
333 |
-
else:
|
334 |
-
model_config = self.model.config
|
335 |
-
|
336 |
-
if not self.precision:
|
337 |
-
if model_config.quantization_config._load_in_4bit:
|
338 |
-
self.precision = "4bit"
|
339 |
-
elif model_config.quantization_config._load_in_8bit:
|
340 |
-
self.precision = "8bit"
|
341 |
-
else:
|
342 |
-
raise ValueError("Unknown precision")
|
343 |
-
|
344 |
-
# print(self.model)
|
345 |
-
linear_count = 0
|
346 |
-
element_wise_mul = 0
|
347 |
-
for name, module in self.model.named_modules():
|
348 |
-
if ('layers.0.' in name or "transformer.blocks.0" in name) and ('attn' not in name):
|
349 |
-
if 'experts.0.' in name or "ffn.experts" in name:
|
350 |
-
if "linear_v" in name:
|
351 |
-
element_wise_mul = 1
|
352 |
-
if isinstance(module, torch.nn.Linear):
|
353 |
-
# print(name, module)
|
354 |
-
linear_count += 1
|
355 |
-
elif isinstance(module, DbrxExpertGLU):
|
356 |
-
linear_count = 3
|
357 |
-
element_wise_mul = 1
|
358 |
-
# elif 'experts' not in name:
|
359 |
-
# if ("gate" not in name and "router" not in name) or "gate_proj" in name:
|
360 |
-
# if "gate_proj" in name:
|
361 |
-
# element_wise_mul = 1
|
362 |
-
# if isinstance(module, torch.nn.Linear):
|
363 |
-
# # print(name, module)
|
364 |
-
# linear_count += 1
|
365 |
-
else:
|
366 |
-
continue
|
367 |
-
print(f"linear_count: {linear_count}")
|
368 |
-
print(f"element_wise_mul: {element_wise_mul}")
|
369 |
-
print(f"GPU usage: {self._detect_num_gpus_used()}")
|
370 |
-
|
371 |
stopping_criteria = stop_sequences_criteria(
|
372 |
self.tokenizer, stop, context.shape[1], context.shape[0]
|
373 |
)
|
@@ -375,7 +310,7 @@ class HFLMWithMeasurement(HFLM):
|
|
375 |
start = time()
|
376 |
res = self.model.generate(
|
377 |
input_ids=context,
|
378 |
-
|
379 |
stopping_criteria=stopping_criteria,
|
380 |
pad_token_id=self.tokenizer.pad_token_id,
|
381 |
use_cache=True,
|
@@ -386,83 +321,12 @@ class HFLMWithMeasurement(HFLM):
|
|
386 |
|
387 |
batch_size = context.shape[0]
|
388 |
output_length = stop_watch.decoding_iterations
|
389 |
-
|
390 |
-
precision_bytes = transfer_precision2bytes(self.precision)
|
391 |
-
|
392 |
-
model_size_param = sum(p.numel() for p in self.model.parameters())
|
393 |
-
|
394 |
-
n_layers = model_config.num_hidden_layers if hasattr(model_config, "num_hidden_layers") else \
|
395 |
-
(model_config.num_layers if hasattr(model_config, "num_layers") else model_config.n_layers)
|
396 |
-
|
397 |
-
d_model = model_config.hidden_size if hasattr(model_config, "hidden_size") else model_config.d_model
|
398 |
-
|
399 |
-
if hasattr(model_config, "num_experts_per_tok"):
|
400 |
-
n_experts_per_tok = model_config.num_experts_per_tok
|
401 |
-
elif hasattr(model_config, "num_selected_experts"):
|
402 |
-
n_experts_per_tok = model_config.num_selected_experts
|
403 |
-
elif hasattr(model_config, "ffn_config"):
|
404 |
-
n_experts_per_tok = model_config.ffn_config.moe_top_k
|
405 |
-
else:
|
406 |
-
n_experts_per_tok = 1
|
407 |
-
|
408 |
-
if hasattr(model_config, "ffn_dim"):
|
409 |
-
d_ff = model_config.ffn_dim
|
410 |
-
elif hasattr(model_config, "intermediate_size"):
|
411 |
-
d_ff = model_config.intermediate_size
|
412 |
-
elif hasattr(model_config, "d_ff"):
|
413 |
-
d_ff = model_config.d_ff
|
414 |
-
elif hasattr(model_config, "ff_ratio"):
|
415 |
-
d_ff = d_model * model_config.ff_ratio
|
416 |
-
elif hasattr(model_config, "ffn_config"):
|
417 |
-
d_ff = model_config.ffn_config.ffn_hidden_size
|
418 |
-
else:
|
419 |
-
raise ValueError("Unknown FFN dimension")
|
420 |
-
|
421 |
-
if hasattr(model_config, "num_local_experts"):
|
422 |
-
num_experts = model_config.num_local_experts
|
423 |
-
elif hasattr(model_config, "num_experts"):
|
424 |
-
num_experts = model_config.num_experts
|
425 |
-
elif hasattr(model_config, "ffn_config"):
|
426 |
-
num_experts = model_config.ffn_config.moe_num_experts
|
427 |
-
else:
|
428 |
-
num_experts = 1
|
429 |
-
|
430 |
-
ffn_params = n_layers * d_ff * linear_count * d_model
|
431 |
-
|
432 |
-
shared_params = model_size_param - num_experts * ffn_params
|
433 |
-
|
434 |
-
model_size = shared_params + n_experts_per_tok * ffn_params
|
435 |
-
|
436 |
-
per_token_kv_size = 2 * n_layers * d_model * precision_bytes
|
437 |
-
|
438 |
-
peak_bw_single = get_peak_bw(get_gpu_details())
|
439 |
-
peak_bw = peak_bw_single * self._detect_num_gpus_used()
|
440 |
-
|
441 |
-
context_prefill_size = context_length
|
442 |
-
kv_size = context_prefill_size * per_token_kv_size + (output_length - 1) * per_token_kv_size / 2
|
443 |
-
|
444 |
-
kv_size = kv_size / 1e9
|
445 |
-
|
446 |
-
n_vocab = model_config.vocab_size
|
447 |
|
448 |
end_to_end_time = (end - start) / batch_size
|
449 |
prefilling_time = stop_watch.prefilling_time / batch_size
|
450 |
decoding_time = stop_watch.decoding_time / batch_size
|
451 |
token_per_sec = output_length / decoding_time
|
452 |
-
|
453 |
-
|
454 |
-
avg_context_length = context_length + (output_length - 1) / 2
|
455 |
-
flops_per_token = 2 * model_size + ((linear_count + element_wise_mul) * n_layers * avg_context_length * d_model) + 4 * d_model + 2 * d_model * n_vocab
|
456 |
-
peak_flops_single = get_peak_flops(get_gpu_details(), self.precision)
|
457 |
-
peak_flops = peak_flops_single * self._detect_num_gpus_used()
|
458 |
-
|
459 |
-
## TODO only support llama-type decoder only models and moe models of switch transformer and mixtrial
|
460 |
-
mfu = token_per_sec * flops_per_token / peak_flops
|
461 |
-
mbu = achieve_mem_bw / peak_bw
|
462 |
-
|
463 |
-
print(f"mfu: {mfu}, mbu: {mbu}")
|
464 |
-
|
465 |
-
return res, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu
|
466 |
|
467 |
def generate_until(
|
468 |
self, requests: List[Instance], disable_tqdm: bool = False
|
@@ -539,19 +403,11 @@ class HFLMWithMeasurement(HFLM):
|
|
539 |
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
|
540 |
)
|
541 |
# add EOS token to stop sequences
|
542 |
-
eos =
|
543 |
if not until:
|
544 |
until = [eos]
|
545 |
else:
|
546 |
until.append(eos)
|
547 |
-
|
548 |
-
# is_gsm8k = kwargs.get("is_gsm8k", False)
|
549 |
-
# if is_gsm8k:
|
550 |
-
# until = ["Question:", "Question", "</s>"]
|
551 |
-
# eos_ids = [self.tokenizer.eos_token_id,
|
552 |
-
# self.tokenizer.convert_tokens_to_ids("<|eot_id|>")]
|
553 |
-
|
554 |
-
|
555 |
if "max_gen_toks" in kwargs.keys():
|
556 |
max_gen_toks = kwargs.pop("max_gen_toks")
|
557 |
else:
|
@@ -571,16 +427,14 @@ class HFLMWithMeasurement(HFLM):
|
|
571 |
left_truncate_len=max_ctx_len,
|
572 |
truncation=self.truncation,
|
573 |
)
|
574 |
-
|
575 |
-
# print("context: ", self.tok_decode(context_enc[0]))
|
576 |
context_enc = context_enc.to(self.device)
|
577 |
attn_masks = attn_masks.to(self.device)
|
578 |
|
579 |
-
if "
|
580 |
-
kwargs["
|
581 |
|
582 |
# perform batched generation
|
583 |
-
cont, end_to_end_time, prefilling_time, token_per_sec
|
584 |
context=context_enc,
|
585 |
attention_mask=attn_masks,
|
586 |
stop=until,
|
@@ -591,21 +445,18 @@ class HFLMWithMeasurement(HFLM):
|
|
591 |
for cont_toks, context in zip(cont_toks_list, contexts):
|
592 |
# discard context + left-padding toks if using causal decoder-only LM
|
593 |
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
|
594 |
-
# print("After Generation: ", self.tok_decode(cont_toks))
|
595 |
cont_toks = cont_toks[context_enc.shape[1] :]
|
596 |
-
|
597 |
s = self.tok_decode(cont_toks)
|
598 |
|
599 |
-
#
|
600 |
-
# if not is_gsm8k:
|
601 |
for term in until:
|
602 |
if len(term) > 0:
|
603 |
# ignore '' separator,
|
604 |
# for seq2seq case where self.tok_decode(self.eot_token_id) = ''
|
605 |
s = s.split(term)[0]
|
606 |
-
|
607 |
-
|
608 |
-
res.append((s, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu))
|
609 |
|
610 |
self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
|
611 |
pbar.update(1)
|
|
|
24 |
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
|
25 |
)
|
26 |
from transformers import TextStreamer
|
27 |
+
|
28 |
from lm_eval import utils
|
29 |
from lm_eval.api.instance import Instance
|
30 |
from lm_eval.api.model import TemplateLM
|
|
|
37 |
stop_sequences_criteria,
|
38 |
)
|
39 |
from lm_eval.models.huggingface import HFLM
|
|
|
|
|
|
|
40 |
|
41 |
|
42 |
class StopWatch(TextStreamer):
|
|
|
67 |
class HFLMWithMeasurement(HFLM):
|
68 |
def __init__(self, **kwargs):
|
69 |
super().__init__(**kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
|
71 |
def _loglikelihood_tokens(
|
72 |
self,
|
|
|
279 |
# Answer: (log prob, is-exact-match)
|
280 |
answer = (float(logits.sum()), bool(max_equal))
|
281 |
|
282 |
+
res.append((answer, per_sample_time, 0, 0))
|
283 |
|
284 |
self.cache_hook.add_partial("loglikelihood", request_str, answer)
|
285 |
pbar.update(1)
|
|
|
288 |
|
289 |
return re_ord.get_original(res)
|
290 |
|
291 |
+
def _model_generate(self, context, max_length, stop, **generation_kwargs):
|
292 |
# temperature = 0.0 if not set
|
293 |
# if do_sample is false and temp==0.0:
|
294 |
# remove temperature, as do_sample=False takes care of this
|
295 |
# and we don't want a warning from HF
|
296 |
generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
|
297 |
do_sample = generation_kwargs.get("do_sample", None)
|
|
|
|
|
298 |
|
299 |
# The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
|
300 |
if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
|
|
|
302 |
|
303 |
if do_sample is False and generation_kwargs.get("temperature") == 0.0:
|
304 |
generation_kwargs.pop("temperature")
|
305 |
+
# build stopping criteria
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
306 |
stopping_criteria = stop_sequences_criteria(
|
307 |
self.tokenizer, stop, context.shape[1], context.shape[0]
|
308 |
)
|
|
|
310 |
start = time()
|
311 |
res = self.model.generate(
|
312 |
input_ids=context,
|
313 |
+
max_length=max_length,
|
314 |
stopping_criteria=stopping_criteria,
|
315 |
pad_token_id=self.tokenizer.pad_token_id,
|
316 |
use_cache=True,
|
|
|
321 |
|
322 |
batch_size = context.shape[0]
|
323 |
output_length = stop_watch.decoding_iterations
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
324 |
|
325 |
end_to_end_time = (end - start) / batch_size
|
326 |
prefilling_time = stop_watch.prefilling_time / batch_size
|
327 |
decoding_time = stop_watch.decoding_time / batch_size
|
328 |
token_per_sec = output_length / decoding_time
|
329 |
+
return res, end_to_end_time, prefilling_time, token_per_sec
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
330 |
|
331 |
def generate_until(
|
332 |
self, requests: List[Instance], disable_tqdm: bool = False
|
|
|
403 |
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
|
404 |
)
|
405 |
# add EOS token to stop sequences
|
406 |
+
eos = self.tok_decode(self.eot_token_id)
|
407 |
if not until:
|
408 |
until = [eos]
|
409 |
else:
|
410 |
until.append(eos)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
411 |
if "max_gen_toks" in kwargs.keys():
|
412 |
max_gen_toks = kwargs.pop("max_gen_toks")
|
413 |
else:
|
|
|
427 |
left_truncate_len=max_ctx_len,
|
428 |
truncation=self.truncation,
|
429 |
)
|
|
|
|
|
430 |
context_enc = context_enc.to(self.device)
|
431 |
attn_masks = attn_masks.to(self.device)
|
432 |
|
433 |
+
if "max_length" not in kwargs:
|
434 |
+
kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
|
435 |
|
436 |
# perform batched generation
|
437 |
+
cont, end_to_end_time, prefilling_time, token_per_sec = self._model_generate(
|
438 |
context=context_enc,
|
439 |
attention_mask=attn_masks,
|
440 |
stop=until,
|
|
|
445 |
for cont_toks, context in zip(cont_toks_list, contexts):
|
446 |
# discard context + left-padding toks if using causal decoder-only LM
|
447 |
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
|
|
|
448 |
cont_toks = cont_toks[context_enc.shape[1] :]
|
449 |
+
|
450 |
s = self.tok_decode(cont_toks)
|
451 |
|
452 |
+
# use secondary stop seqs to cut off should-have-been-stopped content post-hoc
|
|
|
453 |
for term in until:
|
454 |
if len(term) > 0:
|
455 |
# ignore '' separator,
|
456 |
# for seq2seq case where self.tok_decode(self.eot_token_id) = ''
|
457 |
s = s.split(term)[0]
|
458 |
+
|
459 |
+
res.append((s, end_to_end_time, prefilling_time, token_per_sec))
|
|
|
460 |
|
461 |
self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
|
462 |
pbar.update(1)
|
src/backend/moe_infinity.py
CHANGED
@@ -31,20 +31,15 @@ class MoEHFLM(HFLMWithMeasurement):
|
|
31 |
self.use_chat_template = use_chat_template
|
32 |
if "device" in kwargs:
|
33 |
kwargs.pop("device")
|
34 |
-
if os.path.exists(os.path.join(self.offload_path, "moe-infinity-offloads")):
|
35 |
-
shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads"))
|
36 |
-
kwargs["device_map"] = "cuda:0"
|
37 |
super().__init__(
|
38 |
-
*args, **kwargs, pretrained=pretrained
|
39 |
) # Assuming HFLM accepts a 'pretrained' arg and handles it
|
40 |
# self._create_model()
|
|
|
41 |
|
42 |
def __del__(self):
|
43 |
-
|
44 |
-
|
45 |
-
if os.path.exists(os.path.join(self.offload_path, "moe-infinity-offloads")):
|
46 |
-
shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads")) # clean up offload model
|
47 |
-
|
48 |
|
49 |
def _create_model(self, *args, **kwargs):
|
50 |
"""
|
|
|
31 |
self.use_chat_template = use_chat_template
|
32 |
if "device" in kwargs:
|
33 |
kwargs.pop("device")
|
|
|
|
|
|
|
34 |
super().__init__(
|
35 |
+
*args, **kwargs, pretrained=pretrained, device_map="cuda:0"
|
36 |
) # Assuming HFLM accepts a 'pretrained' arg and handles it
|
37 |
# self._create_model()
|
38 |
+
shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads"))
|
39 |
|
40 |
def __del__(self):
|
41 |
+
# Clean up offloaded models from self.offload_path
|
42 |
+
shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads"))
|
|
|
|
|
|
|
43 |
|
44 |
def _create_model(self, *args, **kwargs):
|
45 |
"""
|
src/backend/run_eval_suite.py
CHANGED
@@ -17,16 +17,12 @@ def process_results_decorator(func):
|
|
17 |
end_to_end_time = sum([r[1] for r in results]) / len(results)
|
18 |
prefilling_time = sum([r[2] for r in results]) / len(results)
|
19 |
decoding_throughput = sum([r[3] for r in results]) / len(results)
|
20 |
-
mfu = sum([r[4] for r in results]) / len(results)
|
21 |
-
mbu = sum([r[5] for r in results]) / len(results)
|
22 |
# print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
|
23 |
|
24 |
result_dict = func(self, doc, processed_results, *args, **kwargs)
|
25 |
result_dict["end_to_end_time"] = end_to_end_time
|
26 |
result_dict["prefilling_time"] = prefilling_time
|
27 |
result_dict["decoding_throughput"] = decoding_throughput
|
28 |
-
result_dict["mfu"] = mfu
|
29 |
-
result_dict["mbu"] = mbu
|
30 |
return result_dict
|
31 |
return wrapper
|
32 |
ConfigurableTask.process_results = process_results_decorator(orig_process_results)
|
@@ -37,8 +33,6 @@ def aggregation_decorator(func):
|
|
37 |
aggregation_list["end_to_end_time"] = mean
|
38 |
aggregation_list["prefilling_time"] = mean
|
39 |
aggregation_list["decoding_throughput"] = mean
|
40 |
-
aggregation_list["mfu"] = mean
|
41 |
-
aggregation_list["mbu"] = mean
|
42 |
return aggregation_list
|
43 |
return wrapper
|
44 |
ConfigurableTask.aggregation = aggregation_decorator(orig_aggregation)
|
@@ -49,8 +43,6 @@ def higher_is_better_decorator(func):
|
|
49 |
higher_is_better_dict["end_to_end_time"] = False
|
50 |
higher_is_better_dict["prefilling_time"] = False
|
51 |
higher_is_better_dict["decoding_throughput"] = True
|
52 |
-
higher_is_better_dict["mfu"] = True
|
53 |
-
higher_is_better_dict["mbu"] = True
|
54 |
return higher_is_better_dict
|
55 |
return wrapper
|
56 |
ConfigurableTask.higher_is_better = higher_is_better_decorator(orig_higher_is_better)
|
|
|
17 |
end_to_end_time = sum([r[1] for r in results]) / len(results)
|
18 |
prefilling_time = sum([r[2] for r in results]) / len(results)
|
19 |
decoding_throughput = sum([r[3] for r in results]) / len(results)
|
|
|
|
|
20 |
# print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
|
21 |
|
22 |
result_dict = func(self, doc, processed_results, *args, **kwargs)
|
23 |
result_dict["end_to_end_time"] = end_to_end_time
|
24 |
result_dict["prefilling_time"] = prefilling_time
|
25 |
result_dict["decoding_throughput"] = decoding_throughput
|
|
|
|
|
26 |
return result_dict
|
27 |
return wrapper
|
28 |
ConfigurableTask.process_results = process_results_decorator(orig_process_results)
|
|
|
33 |
aggregation_list["end_to_end_time"] = mean
|
34 |
aggregation_list["prefilling_time"] = mean
|
35 |
aggregation_list["decoding_throughput"] = mean
|
|
|
|
|
36 |
return aggregation_list
|
37 |
return wrapper
|
38 |
ConfigurableTask.aggregation = aggregation_decorator(orig_aggregation)
|
|
|
43 |
higher_is_better_dict["end_to_end_time"] = False
|
44 |
higher_is_better_dict["prefilling_time"] = False
|
45 |
higher_is_better_dict["decoding_throughput"] = True
|
|
|
|
|
46 |
return higher_is_better_dict
|
47 |
return wrapper
|
48 |
ConfigurableTask.higher_is_better = higher_is_better_decorator(orig_higher_is_better)
|
src/backend/tasks/arena_hard/__init__.py
DELETED
File without changes
|
src/backend/tasks/arena_hard/arena_hard.yaml
DELETED
@@ -1,2 +0,0 @@
|
|
1 |
-
task: arena_hard
|
2 |
-
class: !function task.ArenaHard
|
|
|
|
|
|
src/backend/tasks/arena_hard/arena_judgment.py
DELETED
@@ -1,256 +0,0 @@
|
|
1 |
-
'''
|
2 |
-
This file is part of Open-MoE-LLM-Leaderboard and is modified based on work
|
3 |
-
under the Apache 2.0 License from the arena-hard project.
|
4 |
-
(https://github.com/lm-sys/arena-hard)
|
5 |
-
Original Copyright (c) 2024 Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica
|
6 |
-
See the NOTICE file distributed with this work for additional
|
7 |
-
information regarding copyright ownership.
|
8 |
-
'''
|
9 |
-
|
10 |
-
import pandas as pd
|
11 |
-
from tqdm import tqdm
|
12 |
-
import numpy as np
|
13 |
-
from sklearn.linear_model import LogisticRegression
|
14 |
-
import math
|
15 |
-
from collections import defaultdict
|
16 |
-
from tqdm import tqdm
|
17 |
-
|
18 |
-
from src.backend.tasks.arena_hard.arena_utils import (
|
19 |
-
chat_completion_openai,
|
20 |
-
load_questions,
|
21 |
-
load_model_answers,
|
22 |
-
get_endpoint,
|
23 |
-
make_config,
|
24 |
-
)
|
25 |
-
|
26 |
-
|
27 |
-
def get_score(judgment, pattern, pairwise=True):
|
28 |
-
matches = pattern.findall(judgment)
|
29 |
-
matches = [m for m in matches if m != ""]
|
30 |
-
if len(set(matches)) == 0:
|
31 |
-
return None, True
|
32 |
-
elif len(set(matches)) == 1:
|
33 |
-
if pairwise:
|
34 |
-
return matches[0].strip("\n"), False
|
35 |
-
return int(matches[0])
|
36 |
-
else:
|
37 |
-
return None, False
|
38 |
-
|
39 |
-
|
40 |
-
# get answer from model
|
41 |
-
def get_answer(model, conv, temperature, max_tokens, endpoint_dict=None):
|
42 |
-
api_dict = get_endpoint(endpoint_dict["endpoints"])
|
43 |
-
|
44 |
-
# if endpoint_dict["api_type"] == "anthropic":
|
45 |
-
# output = chat_completion_anthropic(model, conv, temperature, max_tokens)
|
46 |
-
# elif endpoint_dict["api_type"] == "azure":
|
47 |
-
# output = chat_completion_openai_azure(model, conv, temperature, max_tokens, api_dict)
|
48 |
-
|
49 |
-
output = chat_completion_openai(model, conv, temperature, max_tokens, api_dict)
|
50 |
-
return output
|
51 |
-
|
52 |
-
|
53 |
-
def judgment(**args):
|
54 |
-
question = args["question"]
|
55 |
-
answer = args["answer"]
|
56 |
-
reference = args["reference"]
|
57 |
-
baseline = args["baseline_answer"]
|
58 |
-
configs = args["configs"]
|
59 |
-
# output_file = args["output_file"]
|
60 |
-
model = configs["judge_model"]
|
61 |
-
|
62 |
-
num_games = 2 if configs["pairwise"] else 1
|
63 |
-
|
64 |
-
# output = {
|
65 |
-
# "question_id":question["question_id"],
|
66 |
-
# "judge": model,
|
67 |
-
# "model": "custom_model",
|
68 |
-
# "games":[]
|
69 |
-
# }
|
70 |
-
output = [question["question_id"]]
|
71 |
-
|
72 |
-
for game in range(num_games):
|
73 |
-
conv = [{"role": "system", "content": configs["system_prompt"]}]
|
74 |
-
|
75 |
-
for template in configs["prompt_template"]:
|
76 |
-
prompt_args = {}
|
77 |
-
|
78 |
-
prompt_args[f"question_{1}"] = question["content"]
|
79 |
-
base = 1
|
80 |
-
|
81 |
-
if baseline:
|
82 |
-
if game % 2 == 1: # swap position
|
83 |
-
temp = baseline
|
84 |
-
baseline = answer
|
85 |
-
answer = temp
|
86 |
-
|
87 |
-
if game == 0:
|
88 |
-
for i, turn in enumerate(baseline["choices"][0]["turns"]):
|
89 |
-
prompt_args[f"answer_{i+1}"] = turn["content"]
|
90 |
-
base += 1
|
91 |
-
|
92 |
-
if game == 1:
|
93 |
-
prompt_args[f"answer_{1}"] = baseline
|
94 |
-
base += 1
|
95 |
-
|
96 |
-
if answer:
|
97 |
-
prompt_args[f"answer_{base}"] = answer
|
98 |
-
|
99 |
-
if reference:
|
100 |
-
for j, ref_answer in enumerate(reference):
|
101 |
-
for i, turn in enumerate(ref_answer["choices"][0]["turns"]):
|
102 |
-
prompt_args[f"ref_answer_{i+j+1}"] = turn["content"]
|
103 |
-
|
104 |
-
user_prompt = template.format(**prompt_args)
|
105 |
-
conv.append({"role": "user", "content": user_prompt})
|
106 |
-
|
107 |
-
judgment = ""
|
108 |
-
for _ in range(2):
|
109 |
-
new_judgment = get_answer(
|
110 |
-
model,
|
111 |
-
conv,
|
112 |
-
configs["temperature"],
|
113 |
-
configs["max_tokens"],
|
114 |
-
args["endpoint_dict"],
|
115 |
-
)
|
116 |
-
|
117 |
-
judgment += ("\n" + new_judgment)
|
118 |
-
|
119 |
-
score, try_again = get_score(judgment, args["regex_pattern"])
|
120 |
-
|
121 |
-
conv.append({"role": "assistant", "content": new_judgment})
|
122 |
-
|
123 |
-
if not try_again:
|
124 |
-
break
|
125 |
-
|
126 |
-
conv.append({"role": "user", "content": "continue your judgment and finish by outputting a final verdict label"})
|
127 |
-
print("Finish judgment!!!")
|
128 |
-
# result = {
|
129 |
-
# "user_prompt": conv[1]["content"],
|
130 |
-
# "judgment": judgment,
|
131 |
-
# "score":score
|
132 |
-
# }
|
133 |
-
output.append(score)
|
134 |
-
|
135 |
-
return output
|
136 |
-
|
137 |
-
def get_battles_from_scores(score_list, first_game_only=False, WEIGHT=3):
|
138 |
-
arena_hard_battles = pd.DataFrame()
|
139 |
-
|
140 |
-
print("Turning score list into battles...")
|
141 |
-
|
142 |
-
for scores in tqdm(score_list):
|
143 |
-
question_id, score1, score2 = scores
|
144 |
-
|
145 |
-
# Process game 1
|
146 |
-
output = {"question_id": question_id,
|
147 |
-
"model_a": "gpt-4-0314",
|
148 |
-
"model_b": f"custom_model"} # Unique identifier for model
|
149 |
-
weight = 1
|
150 |
-
if score1 == "A=B":
|
151 |
-
output["winner"] = "tie"
|
152 |
-
elif score1 == "A>B":
|
153 |
-
output["winner"] = "model_a"
|
154 |
-
elif score1 == "A>>B":
|
155 |
-
output["winner"] = "model_a"
|
156 |
-
weight = WEIGHT
|
157 |
-
elif score1 == "B>A":
|
158 |
-
output["winner"] = "model_b"
|
159 |
-
elif score1 == "B>>A":
|
160 |
-
output["winner"] = "model_b"
|
161 |
-
weight = WEIGHT
|
162 |
-
else:
|
163 |
-
weight = 0
|
164 |
-
|
165 |
-
if weight:
|
166 |
-
arena_hard_battles = pd.concat([arena_hard_battles, pd.DataFrame([output] * weight)])
|
167 |
-
|
168 |
-
if not first_game_only:
|
169 |
-
# Process game 2
|
170 |
-
output = {"question_id": question_id,
|
171 |
-
"model_a": "gpt-4-0314",
|
172 |
-
"model_b": f"custom_model"} # Unique identifier for model
|
173 |
-
weight = 1
|
174 |
-
if score2 == "A=B":
|
175 |
-
output["winner"] = "tie"
|
176 |
-
elif score2 == "A>B":
|
177 |
-
output["winner"] = "model_b"
|
178 |
-
elif score2 == "A>>B":
|
179 |
-
output["winner"] = "model_b"
|
180 |
-
weight = WEIGHT
|
181 |
-
elif score2 == "B>A":
|
182 |
-
output["winner"] = "model_a"
|
183 |
-
elif score2 == "B>>A":
|
184 |
-
output["winner"] = "model_a"
|
185 |
-
weight = WEIGHT
|
186 |
-
else:
|
187 |
-
weight = 0
|
188 |
-
|
189 |
-
if weight:
|
190 |
-
arena_hard_battles = pd.concat([arena_hard_battles, pd.DataFrame([output] * weight)])
|
191 |
-
|
192 |
-
arena_hard_battles.to_json("./arena_hard_battles.jsonl", lines=True, orient="records")
|
193 |
-
return arena_hard_battles
|
194 |
-
|
195 |
-
def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
|
196 |
-
models = pd.concat([df["model_a"], df["model_b"]]).unique()
|
197 |
-
models = pd.Series(np.arange(len(models)), index=models)
|
198 |
-
|
199 |
-
LOW_RATING = 100
|
200 |
-
# duplicate battles
|
201 |
-
df = pd.concat([df, df], ignore_index=True)
|
202 |
-
p = len(models.index)
|
203 |
-
n = df.shape[0]
|
204 |
-
|
205 |
-
X = np.zeros([n, p])
|
206 |
-
X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
|
207 |
-
X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)
|
208 |
-
|
209 |
-
# one A win => two A win
|
210 |
-
Y = np.zeros(n)
|
211 |
-
Y[df["winner"] == "model_a"] = 1.0
|
212 |
-
|
213 |
-
# one tie => one A win + one B win
|
214 |
-
# find tie + tie (both bad) index
|
215 |
-
tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
|
216 |
-
tie_idx[len(tie_idx)//2:] = False
|
217 |
-
Y[tie_idx] = 1.0
|
218 |
-
|
219 |
-
if len(np.unique(Y)) == 1:
|
220 |
-
# If there's only one class in the data, assign default ratings
|
221 |
-
elo_scores = np.full(p, LOW_RATING)
|
222 |
-
elo_scores[models["gpt-4-0314"]] = INIT_RATING
|
223 |
-
else:
|
224 |
-
lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
|
225 |
-
lr.fit(X,Y)
|
226 |
-
|
227 |
-
elo_scores = SCALE * lr.coef_[0] + INIT_RATING
|
228 |
-
|
229 |
-
# set anchor as gpt-4-0314 = 1000
|
230 |
-
if "gpt-4-0314" in models.index:
|
231 |
-
elo_scores += 1000 - elo_scores[models["gpt-4-0314"]]
|
232 |
-
return pd.Series(elo_scores, index = models.index).sort_values(ascending=False)
|
233 |
-
|
234 |
-
def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
|
235 |
-
names = sorted(list(elo_ratings.keys()))
|
236 |
-
wins = defaultdict(lambda: defaultdict(lambda: 0))
|
237 |
-
for a in names:
|
238 |
-
for b in names:
|
239 |
-
ea = 1 / (1 + BASE ** ((elo_ratings[b] - elo_ratings[a]) / SCALE))
|
240 |
-
wins[a][b] = ea
|
241 |
-
wins[b][a] = 1 - ea
|
242 |
-
|
243 |
-
data = {
|
244 |
-
a: [wins[a][b] if a != b else np.NAN for b in names]
|
245 |
-
for a in names
|
246 |
-
}
|
247 |
-
|
248 |
-
df = pd.DataFrame(data, index=names)
|
249 |
-
df.index.name = "model_a"
|
250 |
-
df.columns.name = "model_b"
|
251 |
-
return df.T
|
252 |
-
|
253 |
-
def get_win_rate_column(df, column, baseline="gpt-4-0314"):
|
254 |
-
to_dict = df[["model", column]].set_index("model").to_dict()[column]
|
255 |
-
win_rate_table = predict_win_rate(to_dict)
|
256 |
-
return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x * 100, 2))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/backend/tasks/arena_hard/arena_utils.py
DELETED
@@ -1,349 +0,0 @@
|
|
1 |
-
'''
|
2 |
-
This file is part of Open-MoE-LLM-Leaderboard and is modified based on work
|
3 |
-
under the Apache 2.0 License from the arena-hard project.
|
4 |
-
(https://github.com/lm-sys/arena-hard)
|
5 |
-
Original Copyright (c) 2024 Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica
|
6 |
-
See the NOTICE file distributed with this work for additional
|
7 |
-
information regarding copyright ownership.
|
8 |
-
'''
|
9 |
-
|
10 |
-
|
11 |
-
import os
|
12 |
-
import json
|
13 |
-
import time
|
14 |
-
import yaml
|
15 |
-
import random
|
16 |
-
|
17 |
-
from typing import Optional
|
18 |
-
from glob import glob
|
19 |
-
|
20 |
-
# API setting constants
|
21 |
-
API_MAX_RETRY = 16
|
22 |
-
API_RETRY_SLEEP = 10
|
23 |
-
API_ERROR_OUTPUT = "$ERROR$"
|
24 |
-
|
25 |
-
|
26 |
-
OPENAI_MODEL_LIST = (
|
27 |
-
"gpt-3.5-turbo",
|
28 |
-
"gpt-3.5-turbo-0301",
|
29 |
-
"gpt-3.5-turbo-0613",
|
30 |
-
"gpt-3.5-turbo-0613-verbose",
|
31 |
-
"gpt-3.5-turbo-1106",
|
32 |
-
"gpt-3.5-turbo-0125",
|
33 |
-
"gpt-4",
|
34 |
-
"gpt-4-0314",
|
35 |
-
"gpt-4-0613",
|
36 |
-
"gpt-4-turbo",
|
37 |
-
"gpt-4-1106-preview",
|
38 |
-
"gpt-4-0125-preview",
|
39 |
-
)
|
40 |
-
|
41 |
-
|
42 |
-
temperature_config = {
|
43 |
-
"writing": 0.7,
|
44 |
-
"roleplay": 0.7,
|
45 |
-
"extraction": 0.0,
|
46 |
-
"math": 0.0,
|
47 |
-
"coding": 0.0,
|
48 |
-
"reasoning": 0.0,
|
49 |
-
"stem": 0.1,
|
50 |
-
"humanities": 0.1,
|
51 |
-
}
|
52 |
-
|
53 |
-
|
54 |
-
def load_questions(question_file: str):
    """Load questions from a JSON Lines file.

    Args:
        question_file: Path to a file holding one JSON document per line.

    Returns:
        A list with the decoded JSON value of every non-blank line, in file
        order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    questions = []
    with open(question_file, "r", encoding="utf-8") as ques_file:
        for line in ques_file:
            # Lines read from a file keep their newline, so a bare ``if line``
            # never filters anything; strip first so blank lines are skipped
            # instead of crashing ``json.loads``.
            if line.strip():
                questions.append(json.loads(line))
    return questions
|
62 |
-
|
63 |
-
|
64 |
-
def load_model_answers(answer_dir: str):
    """Load model answers.

    Every ``*.jsonl`` file directly inside ``answer_dir`` is treated as one
    model's answers; the file name without its extension is the model name.

    The return value is a python dict of type:
    Dict[model_name: str -> Dict[question_id -> answer: dict]]

    When a question id occurs more than once in a file, the last record wins.
    """
    filenames = sorted(glob(os.path.join(answer_dir, "*.jsonl")))
    model_answers = {}

    for filename in filenames:
        model_name = os.path.splitext(os.path.basename(filename))[0]
        answer = {}
        with open(filename, encoding="utf-8") as fin:
            for line in fin:
                # Tolerate blank lines (e.g. a trailing empty line) rather
                # than failing inside ``json.loads``.
                if not line.strip():
                    continue
                record = json.loads(line)
                answer[record["question_id"]] = record
        model_answers[model_name] = answer

    return model_answers
|
84 |
-
|
85 |
-
|
86 |
-
def get_endpoint(endpoint_list):
    """Pick one endpoint configuration at random.

    Args:
        endpoint_list: Sequence of endpoint entries, or ``None`` when no
            explicit endpoints are configured.

    Returns:
        ``None`` if ``endpoint_list`` is ``None``; otherwise one element of
        the sequence chosen uniformly at random.

    Raises:
        IndexError: If ``endpoint_list`` is an empty sequence.
    """
    if endpoint_list is None:
        return None
    # ``random.choice`` returns the element directly instead of building a
    # one-element list as ``random.choices(...)[0]`` does.
    return random.choice(endpoint_list)
|
95 |
-
|
96 |
-
|
97 |
-
# load config args from config yaml files
|
98 |
-
def make_config(config_file: str) -> dict:
    """Load configuration values from a YAML file.

    Args:
        config_file: Path to the YAML file.

    Returns:
        The parsed document; an empty dict when the file contains no
        document at all.
    """
    with open(config_file, "r") as f:
        config_kwargs = yaml.load(f, Loader=yaml.SafeLoader)

    # An empty YAML stream parses to None; return a dict as the annotation
    # promises so callers can index the result.
    return {} if config_kwargs is None else config_kwargs
|
104 |
-
|
105 |
-
|
106 |
-
def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None):
    """Request a chat completion through the OpenAI client, with retries.

    Args:
        model: Model name passed to the API.
        messages: Chat messages passed to the API.
        temperature: Sampling temperature passed to the API.
        max_tokens: Completion token limit passed to the API.
        api_dict: Optional mapping with ``"api_base"`` and ``"api_key"``;
            when falsy the client is built with its default configuration.

    Returns:
        The content of the first choice, or ``API_ERROR_OUTPUT`` if no
        attempt succeeded.
    """
    import openai

    if api_dict:
        client = openai.OpenAI(
            base_url=api_dict["api_base"],
            api_key=api_dict["api_key"],
        )
    else:
        client = openai.OpenAI()

    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            output = completion.choices[0].message.content
            break
        except openai.RateLimitError as e:
            # Rate limits are transient: wait, then use another attempt.
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)
        except openai.BadRequestError as e:
            # The request itself was rejected; resending the identical
            # payload cannot succeed, so stop instead of burning the
            # remaining attempts.
            print(messages)
            print(type(e), e)
            break
        except KeyError as e:
            # The exception must be bound here: names bound by an earlier
            # ``except ... as e`` clause do not survive that clause.
            print(type(e), e)
            break

    return output
|
139 |
-
|
140 |
-
|
141 |
-
# def chat_completion_openai_azure(model, messages, temperature, max_tokens, api_dict=None):
|
142 |
-
# import openai
|
143 |
-
# from openai import AzureOpenAI
|
144 |
-
|
145 |
-
# api_base = api_dict["api_base"]
|
146 |
-
# client = AzureOpenAI(
|
147 |
-
# azure_endpoint = api_base,
|
148 |
-
# api_key= api_dict["api_key"],
|
149 |
-
# api_version=api_dict["api_version"],
|
150 |
-
# timeout=240,
|
151 |
-
# max_retries=2
|
152 |
-
# )
|
153 |
-
|
154 |
-
# output = API_ERROR_OUTPUT
|
155 |
-
# for _ in range(API_MAX_RETRY):
|
156 |
-
# try:
|
157 |
-
# response = client.chat.completions.create(
|
158 |
-
# model=model,
|
159 |
-
# messages=messages,
|
160 |
-
# n=1,
|
161 |
-
# temperature=temperature,
|
162 |
-
# max_tokens=max_tokens,
|
163 |
-
# seed=42,
|
164 |
-
# )
|
165 |
-
# output = response.choices[0].message.content
|
166 |
-
# break
|
167 |
-
# except openai.RateLimitError as e:
|
168 |
-
# print(type(e), e)
|
169 |
-
# time.sleep(API_RETRY_SLEEP)
|
170 |
-
# except openai.BadRequestError as e:
|
171 |
-
# print(type(e), e)
|
172 |
-
# break
|
173 |
-
# except KeyError:
|
174 |
-
# print(type(e), e)
|
175 |
-
# break
|
176 |
-
|
177 |
-
# return output
|
178 |
-
|
179 |
-
|
180 |
-
# def chat_completion_anthropic(model, messages, temperature, max_tokens, api_dict=None):
|
181 |
-
# import anthropic
|
182 |
-
|
183 |
-
# if api_dict:
|
184 |
-
# api_key = api_dict["api_key"]
|
185 |
-
# else:
|
186 |
-
# api_key = os.environ["ANTHROPIC_API_KEY"]
|
187 |
-
|
188 |
-
# sys_msg = ""
|
189 |
-
# if messages[0]["role"] == "system":
|
190 |
-
# sys_msg = messages[0]["content"]
|
191 |
-
# messages = messages[1:]
|
192 |
-
|
193 |
-
# output = API_ERROR_OUTPUT
|
194 |
-
# for _ in range(API_MAX_RETRY):
|
195 |
-
# try:
|
196 |
-
# # print(sys_msg)
|
197 |
-
# c = anthropic.Anthropic(api_key=api_key)
|
198 |
-
# response = c.messages.create(
|
199 |
-
# model=model,
|
200 |
-
# messages=messages,
|
201 |
-
# stop_sequences=[anthropic.HUMAN_PROMPT],
|
202 |
-
# max_tokens=max_tokens,
|
203 |
-
# temperature=temperature,
|
204 |
-
# system=sys_msg
|
205 |
-
# )
|
206 |
-
# output = response.content[0].text
|
207 |
-
# break
|
208 |
-
# except anthropic.APIError as e:
|
209 |
-
# print(type(e), e)
|
210 |
-
# time.sleep(API_RETRY_SLEEP)
|
211 |
-
# return output
|
212 |
-
|
213 |
-
|
214 |
-
# def chat_completion_mistral(model, messages, temperature, max_tokens):
|
215 |
-
# from mistralai.client import MistralClient
|
216 |
-
# from mistralai.models.chat_completion import ChatMessage
|
217 |
-
# from mistralai.exceptions import MistralException
|
218 |
-
|
219 |
-
# api_key = os.environ["MISTRAL_API_KEY"]
|
220 |
-
# client = MistralClient(api_key=api_key)
|
221 |
-
|
222 |
-
# prompts = [ChatMessage(role=message["role"], content=message["content"]) for message in messages]
|
223 |
-
|
224 |
-
# output = API_ERROR_OUTPUT
|
225 |
-
# for _ in range(API_MAX_RETRY):
|
226 |
-
# try:
|
227 |
-
# chat_response = client.chat(
|
228 |
-
# model=model,
|
229 |
-
# messages=prompts,
|
230 |
-
# temperature=temperature,
|
231 |
-
# max_tokens=max_tokens,
|
232 |
-
# )
|
233 |
-
# output = chat_response.choices[0].message.content
|
234 |
-
# break
|
235 |
-
# except MistralException as e:
|
236 |
-
# print(type(e), e)
|
237 |
-
# break
|
238 |
-
|
239 |
-
# return output
|
240 |
-
|
241 |
-
|
242 |
-
# def chat_completion_gemini(model, messages, temperature, max_tokens):
|
243 |
-
# import google.generativeai as genai
|
244 |
-
# genai.configure(api_key=os.environ["GEMINI_API_KEY"])
|
245 |
-
|
246 |
-
# safety_settings = [
|
247 |
-
# {
|
248 |
-
# "category": "HARM_CATEGORY_HARASSMENT",
|
249 |
-
# "threshold": "BLOCK_NONE"
|
250 |
-
# },
|
251 |
-
# {
|
252 |
-
# "category": "HARM_CATEGORY_HATE_SPEECH",
|
253 |
-
# "threshold": "BLOCK_NONE"
|
254 |
-
# },
|
255 |
-
# {
|
256 |
-
# "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
|
257 |
-
# "threshold": "BLOCK_NONE"
|
258 |
-
# },
|
259 |
-
# {
|
260 |
-
# "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
|
261 |
-
# "threshold": "BLOCK_NONE"
|
262 |
-
# },
|
263 |
-
# ]
|
264 |
-
|
265 |
-
# # Set up the model
|
266 |
-
# generation_config = {
|
267 |
-
# "temperature": temperature,
|
268 |
-
# "top_p": 1,
|
269 |
-
# "top_k": 1,
|
270 |
-
# "max_output_tokens": max_tokens,
|
271 |
-
# }
|
272 |
-
|
273 |
-
# output = API_ERROR_OUTPUT
|
274 |
-
# for _ in range(API_MAX_RETRY):
|
275 |
-
# try:
|
276 |
-
# gemini = genai.GenerativeModel(
|
277 |
-
# model_name=model,
|
278 |
-
# generation_config=generation_config,
|
279 |
-
# safety_settings=safety_settings)
|
280 |
-
|
281 |
-
# convo = gemini.start_chat(history=[])
|
282 |
-
|
283 |
-
# convo.send_message(messages)
|
284 |
-
# output = convo.last.text
|
285 |
-
# break
|
286 |
-
# except genai.types.generation_types.StopCandidateException as e:
|
287 |
-
# print(type(e), e)
|
288 |
-
# break
|
289 |
-
# except Exception as e:
|
290 |
-
# print(type(e), e)
|
291 |
-
# time.sleep(API_RETRY_SLEEP)
|
292 |
-
|
293 |
-
# return output
|
294 |
-
|
295 |
-
|
296 |
-
# def chat_completion_cohere(model, messages, temperature, max_tokens):
|
297 |
-
# import cohere
|
298 |
-
|
299 |
-
# co = cohere.Client(os.environ["COHERE_API_KEY"])
|
300 |
-
# assert len(messages) > 0
|
301 |
-
|
302 |
-
# template_map = {"system":"SYSTEM",
|
303 |
-
# "assistant":"CHATBOT",
|
304 |
-
# "user":"USER"}
|
305 |
-
|
306 |
-
# assert messages[-1]["role"] == "user"
|
307 |
-
# prompt = messages[-1]["content"]
|
308 |
-
|
309 |
-
# if len(messages) > 1:
|
310 |
-
# history = []
|
311 |
-
# for message in messages[:-1]:
|
312 |
-
# history.append({"role":template_map[message["role"]], "message":message["content"]})
|
313 |
-
# else:
|
314 |
-
# history = None
|
315 |
-
|
316 |
-
# output = API_ERROR_OUTPUT
|
317 |
-
# for _ in range(API_MAX_RETRY):
|
318 |
-
# try:
|
319 |
-
# response = co.chat(
|
320 |
-
# message=prompt,
|
321 |
-
# model=model,
|
322 |
-
# temperature=temperature,
|
323 |
-
# max_tokens=max_tokens,
|
324 |
-
# chat_history=history,
|
325 |
-
# )
|
326 |
-
# output = response.text
|
327 |
-
# break
|
328 |
-
# except cohere.core.api_error.ApiError as e:
|
329 |
-
# print(type(e), e)
|
330 |
-
# raise
|
331 |
-
# except Exception as e:
|
332 |
-
# print(type(e), e)
|
333 |
-
# break
|
334 |
-
|
335 |
-
# return output
|
336 |
-
|
337 |
-
|
338 |
-
def reorg_answer_file(answer_file):
    """Sort the records of a JSON Lines answer file by question id and de-duplicate.

    The file is rewritten in place. When a question id occurs more than once,
    the last record in the file is kept. Blank lines are dropped.

    Args:
        answer_file: Path to a JSON Lines file whose records carry a
            ``"question_id"`` key.
    """
    answers = {}
    with open(answer_file, "r") as fin:
        for line in fin:
            if not line.strip():
                continue
            qid = json.loads(line)["question_id"]
            # A final line without a newline would otherwise be fused with
            # whichever record ends up after it once the lines are reordered.
            answers[qid] = line if line.endswith("\n") else line + "\n"

    with open(answer_file, "w") as fout:
        for qid in sorted(answers):
            fout.write(answers[qid])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/backend/tasks/arena_hard/configs/api_config.yaml
DELETED
@@ -1,17 +0,0 @@
|
|
1 |
-
# gpt-3.5-turbo:
|
2 |
-
# model_name: gpt-3.5-turbo
|
3 |
-
# endpoints: null
|
4 |
-
# api_type: openai
|
5 |
-
# parallel: 8
|
6 |
-
|
7 |
-
gpt-4-1106-preview:
|
8 |
-
model_name: gpt-4-1106-preview
|
9 |
-
endpoints: null
|
10 |
-
api_type: openai
|
11 |
-
parallel: 8
|
12 |
-
|
13 |
-
# llama3-7b:
|
14 |
-
# model_name: llama3-7b
|
15 |
-
# endpoints: null
|
16 |
-
# api_type: openai
|
17 |
-
# parallel: 8
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/backend/tasks/arena_hard/configs/judge_config.yaml
DELETED
@@ -1,26 +0,0 @@
|
|
1 |
-
name: judgment config file for Arena Hard
|
2 |
-
|
3 |
-
bench_name: arena-hard-v0.1
|
4 |
-
|
5 |
-
# Arena Hard default
|
6 |
-
judge_model: gpt-4-1106-preview
|
7 |
-
# judge_model: gpt-3.5-turbo
|
8 |
-
reference: False # Optional
|
9 |
-
ref_model: null
|
10 |
-
|
11 |
-
baseline: True
|
12 |
-
baseline_model: gpt-4-0314
|
13 |
-
|
14 |
-
pairwise: True
|
15 |
-
temperature: 0
|
16 |
-
max_tokens: 4096
|
17 |
-
|
18 |
-
regex_pattern: \[\[([AB<>=]+)\]\]
|
19 |
-
|
20 |
-
system_prompt: "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."
|
21 |
-
|
22 |
-
prompt_template: ["<|User Prompt|>\n{question_1}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>"]
|
23 |
-
|
24 |
-
# Add your model below for evaluation
|
25 |
-
# model_list:
|
26 |
-
# - gpt-3.5-turbo-0125
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/backend/tasks/arena_hard/model_answer/gpt-4-0314.jsonl
DELETED
The diff for this file is too large to render.
See raw diff
|
|
src/backend/tasks/arena_hard/question.jsonl
DELETED
The diff for this file is too large to render.
See raw diff
|
|
src/backend/tasks/arena_hard/task.py
DELETED
@@ -1,220 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
from typing import Union, List
|
3 |
-
|
4 |
-
from lm_eval.api.task import ConfigurableTask
|
5 |
-
from lm_eval.api.instance import Instance
|
6 |
-
|
7 |
-
# from lm_eval.api.registry import register_task
|
8 |
-
from lm_eval.api.metrics import mean
|
9 |
-
|
10 |
-
from src.backend.envs import DEVICE
|
11 |
-
|
12 |
-
import pandas as pd
|
13 |
-
|
14 |
-
from src.backend.tasks.measurement_task_utils import measure_system_metrics
|
15 |
-
import json
|
16 |
-
|
17 |
-
from typing import (
|
18 |
-
Any,
|
19 |
-
Dict,
|
20 |
-
List,
|
21 |
-
Optional,
|
22 |
-
Union,
|
23 |
-
)
|
24 |
-
|
25 |
-
from datasets import Dataset
|
26 |
-
import re
|
27 |
-
|
28 |
-
from src.backend.tasks.arena_hard.arena_utils import (
|
29 |
-
load_questions,
|
30 |
-
load_questions,
|
31 |
-
load_model_answers,
|
32 |
-
make_config,
|
33 |
-
)
|
34 |
-
|
35 |
-
from src.backend.tasks.arena_hard.arena_judgment import (
|
36 |
-
judgment,
|
37 |
-
get_battles_from_scores,
|
38 |
-
compute_mle_elo,
|
39 |
-
predict_win_rate,
|
40 |
-
get_win_rate_column
|
41 |
-
)
|
42 |
-
|
43 |
-
def load_questions(question_file: str):
    """Load questions from a JSON Lines file.

    Args:
        question_file: Path to a file holding one JSON document per line.

    Returns:
        A list with the decoded JSON value of every non-blank line, in file
        order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    questions = []
    with open(question_file, "r", encoding="utf-8") as ques_file:
        for line in ques_file:
            # Lines read from a file keep their newline, so a bare ``if line``
            # never filters anything; strip first so blank lines are skipped
            # instead of crashing ``json.loads``.
            if line.strip():
                questions.append(json.loads(line))
    return questions
|
51 |
-
|
52 |
-
def download_wrapper(func):
    """Build a replacement ``download`` method that only prints a notice.

    Args:
        func: The method being replaced. It is accepted but never called.

    Returns:
        A function taking ``self`` plus arbitrary arguments that prints a
        message and returns ``None``.
    """
    notice = "Using Arena Hard, No need to download"

    def download(self, *args, **kwargs):
        # ``func`` is deliberately not invoked here.
        print(notice)

    return download
|
56 |
-
|
57 |
-
original_download = ConfigurableTask.download
|
58 |
-
ConfigurableTask.download = download_wrapper(original_download)
|
59 |
-
# @register_task("selfcheckgpt")
|
60 |
-
@measure_system_metrics
class ArenaHard(ConfigurableTask):
    """Arena-Hard generation task scored by a judge model.

    Each question's first turn is used as the prompt. The generated answer is
    handed to ``judgment`` together with the baseline model's stored answer,
    and the per-question scores are aggregated into a single win-rate value.
    """

    VERSION = 0.0
    OUTPUT_TYPE = "generate_until"
    # Questions, judge configuration and stored answers live next to this
    # module and are loaded once, when the class body is executed.
    data_path = os.path.join(os.path.dirname(__file__), 'question.jsonl')
    judge_config_path = os.path.join(os.path.dirname(__file__), "configs/judge_config.yaml")
    configs = make_config(judge_config_path)
    model_ans_dir = os.path.join(os.path.dirname(__file__), "model_answer")
    model_answers = load_model_answers(model_ans_dir)
    data = load_questions(data_path)

    def __init__(self):
        super().__init__(config={"metadata": {"version": self.VERSION}})
        # These end tokens are hard coded because of the current limitation of the llm-eval.
        self.generation_kwargs = {"until": ["</s>", "<|im_end|>"], "max_gen_toks": 4096}

    def transform_data(self, data):
        """Flatten raw question records into the rows used as task documents.

        Each row holds the question id, the content of the first turn, and
        the baseline model's stored answer (``None`` when the judge config
        disables the baseline).
        """
        transformed_data = []
        for item in data:
            if self.configs["baseline"]:
                baseline_answer = self.model_answers[self.configs["baseline_model"]][item["question_id"]]
            else:
                baseline_answer = None
            transformed_data.append(
                {
                    "question_id": item["question_id"],
                    # Only the first turn of each question is used.
                    "content": item["turns"][0]["content"],
                    "model_answer": baseline_answer,
                }
            )
        return transformed_data

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def validation_docs(self):
        """Build the validation ``Dataset`` and cache it on ``self.dataset``."""
        rows = self.transform_data(self.data)
        self.dataset = Dataset.from_dict(
            {
                "question_id": [row["question_id"] for row in rows],
                "content": [row["content"] for row in rows],
                "model_answer": [row["model_answer"] for row in rows],
            }
        )
        return self.dataset

    def doc_to_text(self, doc):
        """Return the prompt: the question content followed by a newline."""
        sentence = doc["content"]
        doc_text = f"{sentence}\n"
        return doc_text

    def doc_to_target(self, doc):
        """Return the question id, which stands in for a target."""
        q_id = doc["question_id"]
        return q_id

    def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
        """Create the single ``generate_until`` request for ``doc``."""
        arguments = (ctx, self.generation_kwargs)
        request_list = [
            Instance(request_type="generate_until", doc=doc, arguments=arguments, idx=0, **kwargs),
        ]
        return request_list

    def process_results(self, doc, results):
        """Score the first generated answer for ``doc`` with the judge model.

        Returns:
            ``{"score": <value returned by judgment>}``.
        """
        response_temperature_0 = results[0]
        api_config_path = os.path.join(os.path.dirname(__file__), "configs/api_config.yaml")
        endpoint_list = make_config(api_config_path)

        # Bind ``pattern`` up front: previously a falsy ``regex_pattern`` left
        # the name unbound and the assignment into ``kwargs`` below raised.
        # NOTE(review): confirm that ``judgment`` accepts ``None`` here.
        pattern = None
        if self.configs["regex_pattern"]:
            pattern = re.compile(self.configs["regex_pattern"])

        ref_answer_dir = os.path.join(os.path.dirname(__file__), "reference_answer")

        ref_answers = None
        if self.configs["reference"]:
            ref_answers = load_model_answers(ref_answer_dir)
            ref_answers = [ref_answers[model] for model in self.configs["ref_model"]]

        endpoint_info = endpoint_list[self.configs["judge_model"]]

        question = doc
        kwargs = {}
        kwargs["question"] = question
        kwargs["answer"] = response_temperature_0
        if ref_answers:
            kwargs["reference"] = [ref_answer[doc["question_id"]] for ref_answer in ref_answers]
            assert len(kwargs["reference"]) == len(self.configs["ref_model"])
        else:
            kwargs["reference"] = None

        if self.configs["baseline"]:
            kwargs["baseline_answer"] = doc["model_answer"]
        else:
            kwargs["baseline_answer"] = None
        kwargs["configs"] = self.configs
        kwargs["endpoint_dict"] = endpoint_info
        kwargs["regex_pattern"] = pattern

        scores = judgment(**kwargs)
        return {"score": scores}

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        ##TODO implement the aggregation function to calculate elo for score
        def get_win_rate(score_list):
            battles = get_battles_from_scores(score_list)
            bootstrap_online_elo = compute_mle_elo(battles)
            stats = pd.DataFrame()
            stats["results"] = None
            stats["results"] = stats['results'].astype('object')
            for i, model in enumerate(bootstrap_online_elo.index):
                stats.at[i, "model"] = model
                stats.at[i, "score"] = bootstrap_online_elo[model]

            stats.sort_values(by="model", inplace=True)
            stats["score"] = get_win_rate_column(stats, "score", "gpt-4-0314").tolist()

            # NOTE(review): this picks the row labelled 1, i.e. the second
            # entry of ``bootstrap_online_elo.index`` as enumerated above, not
            # a model looked up by name. Verify against ``compute_mle_elo``
            # that this is always the evaluated model and never the baseline.
            return stats["score"][1]

        return {"score": get_win_rate}

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {"score": True}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/backend/tasks/gsm8k/gsm8k-custom.yaml
DELETED
@@ -1,47 +0,0 @@
|
|
1 |
-
group:
|
2 |
-
- math_word_problems
|
3 |
-
task: gsm8k_custom
|
4 |
-
dataset_path: gsm8k
|
5 |
-
dataset_name: main
|
6 |
-
output_type: generate_until
|
7 |
-
training_split: train
|
8 |
-
fewshot_split: train
|
9 |
-
test_split: test
|
10 |
-
doc_to_text: "Question: {{question}}\nAnswer:"
|
11 |
-
doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}"
|
12 |
-
metric_list:
|
13 |
-
- metric: exact_match
|
14 |
-
aggregation: mean
|
15 |
-
higher_is_better: true
|
16 |
-
ignore_case: true
|
17 |
-
ignore_punctuation: false
|
18 |
-
regexes_to_ignore:
|
19 |
-
- ","
|
20 |
-
- "\\$"
|
21 |
-
- "(?s).*#### "
|
22 |
-
- "\\.$"
|
23 |
-
generation_kwargs:
|
24 |
-
until:
|
25 |
-
- "Question:"
|
26 |
-
- "Question"
|
27 |
-
- "</s>"
|
28 |
-
- "<|im_end|>"
|
29 |
-
do_sample: false
|
30 |
-
temperature: 0.0
|
31 |
-
# is_gsm8k: true
|
32 |
-
repeats: 1
|
33 |
-
num_fewshot: 5
|
34 |
-
filter_list:
|
35 |
-
- name: "strict-match"
|
36 |
-
filter:
|
37 |
-
- function: "regex"
|
38 |
-
regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
|
39 |
-
- function: "take_first"
|
40 |
-
- name: "flexible-extract"
|
41 |
-
filter:
|
42 |
-
- function: "regex"
|
43 |
-
group_select: -1
|
44 |
-
regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"
|
45 |
-
- function: "take_first"
|
46 |
-
metadata:
|
47 |
-
version: 3.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/backend/tasks/measurement_task_utils.py
CHANGED
@@ -12,9 +12,6 @@ def process_results_decorator(func):
|
|
12 |
end_to_end_time = sum([r[1] for r in results]) / len(results)
|
13 |
prefilling_time = sum([r[2] for r in results]) / len(results)
|
14 |
decoding_throughput = sum([r[3] for r in results]) / len(results)
|
15 |
-
mfu = sum([r[4] for r in results]) / len(results)
|
16 |
-
mbu = sum([r[5] for r in results]) / len(results)
|
17 |
-
|
18 |
# print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
|
19 |
|
20 |
# Now call the original process_results with the processed results
|
@@ -22,8 +19,6 @@ def process_results_decorator(func):
|
|
22 |
result_dict["end_to_end_time"] = end_to_end_time
|
23 |
result_dict["prefilling_time"] = prefilling_time
|
24 |
result_dict["decoding_throughput"] = decoding_throughput
|
25 |
-
result_dict["mfu"] = mfu
|
26 |
-
result_dict["mbu"] = mbu
|
27 |
return result_dict
|
28 |
return wrapper
|
29 |
|
@@ -35,8 +30,6 @@ def aggregation_decorator(func):
|
|
35 |
aggregation_list["end_to_end_time"] = mean
|
36 |
aggregation_list["prefilling_time"] = mean
|
37 |
aggregation_list["decoding_throughput"] = mean
|
38 |
-
aggregation_list["mfu"] = mean
|
39 |
-
aggregation_list["mbu"] = mean
|
40 |
return aggregation_list
|
41 |
return wrapper
|
42 |
|
@@ -48,8 +41,6 @@ def higher_is_better_decorator(func):
|
|
48 |
higher_is_better_dict["end_to_end_time"] = False
|
49 |
higher_is_better_dict["prefilling_time"] = False
|
50 |
higher_is_better_dict["decoding_throughput"] = True
|
51 |
-
higher_is_better_dict["mfu"] = True
|
52 |
-
higher_is_better_dict["mbu"] = True
|
53 |
return higher_is_better_dict
|
54 |
return wrapper
|
55 |
|
|
|
12 |
end_to_end_time = sum([r[1] for r in results]) / len(results)
|
13 |
prefilling_time = sum([r[2] for r in results]) / len(results)
|
14 |
decoding_throughput = sum([r[3] for r in results]) / len(results)
|
|
|
|
|
|
|
15 |
# print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
|
16 |
|
17 |
# Now call the original process_results with the processed results
|
|
|
19 |
result_dict["end_to_end_time"] = end_to_end_time
|
20 |
result_dict["prefilling_time"] = prefilling_time
|
21 |
result_dict["decoding_throughput"] = decoding_throughput
|
|
|
|
|
22 |
return result_dict
|
23 |
return wrapper
|
24 |
|
|
|
30 |
aggregation_list["end_to_end_time"] = mean
|
31 |
aggregation_list["prefilling_time"] = mean
|
32 |
aggregation_list["decoding_throughput"] = mean
|
|
|
|
|
33 |
return aggregation_list
|
34 |
return wrapper
|
35 |
|
|
|
41 |
higher_is_better_dict["end_to_end_time"] = False
|
42 |
higher_is_better_dict["prefilling_time"] = False
|
43 |
higher_is_better_dict["decoding_throughput"] = True
|
|
|
|
|
44 |
return higher_is_better_dict
|
45 |
return wrapper
|
46 |
|
src/backend/tasks/selfcheckgpt/task.py
CHANGED
@@ -27,12 +27,12 @@ class SelfCheckGPT(ConfigurableTask):
|
|
27 |
super().__init__(config={"metadata": {"version": self.VERSION}})
|
28 |
# these end tokens are hard coded because of the current limitaion of the llm-eval.
|
29 |
# self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
|
30 |
-
self.generation_kwargs = {"until": ["
|
31 |
self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
|
32 |
self.generation_kwargs_sampling = {
|
33 |
"temperature": 0.99,
|
34 |
"do_sample": True,
|
35 |
-
"until": ["
|
36 |
"max_length": 1024,
|
37 |
}
|
38 |
|
|
|
27 |
super().__init__(config={"metadata": {"version": self.VERSION}})
|
28 |
# these end tokens are hard coded because of the current limitaion of the llm-eval.
|
29 |
# self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
|
30 |
+
self.generation_kwargs = {"until": ["<im_end>"], "max_length": 1024}
|
31 |
self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
|
32 |
self.generation_kwargs_sampling = {
|
33 |
"temperature": 0.99,
|
34 |
"do_sample": True,
|
35 |
+
"until": ["<im_end>", "</s>"],
|
36 |
"max_length": 1024,
|
37 |
}
|
38 |
|
src/display/about.py
CHANGED
@@ -10,17 +10,14 @@ The OPEN-MOE-LLM-LEADERBOARD includes generation and multiple choice tasks to me
|
|
10 |
|
11 |
|
12 |
Tasks:
|
|
|
13 |
- **Multiple Choice Performance** -- [MMLU](https://arxiv.org/abs/2009.03300)
|
14 |
-
- **Mathematics Problem-Solving Performance** -- [GSM8K](https://arxiv.org/abs/2110.14168)
|
15 |
-
- **AI Judgment Scores for Responses to Complex User Queries** -- [Arena_Hard](https://lmsys.org/blog/2024-04-19-arena-hard/)
|
16 |
|
17 |
Columns and Metrics:
|
18 |
- Method: The MOE LLMs inference framework.
|
19 |
- E2E(s): Average End to End generation time in seconds.
|
20 |
- PRE(s): Prefilling Time of input prompt in seconds.
|
21 |
- T/s: Token throughput per second.
|
22 |
-
- S-MBU(%): Sparse Model Bandwidth Utilization.
|
23 |
-
- S-MFU(%): Sparse Model FLOPs Utilization.
|
24 |
- Precision: The precision of the model used.
|
25 |
|
26 |
"""
|
|
|
10 |
|
11 |
|
12 |
Tasks:
|
13 |
+
- **Generation Self-consistency** -- [SelfCheckGPT](https://github.com/potsawee/selfcheckgpt)
|
14 |
- **Multiple Choice Performance** -- [MMLU](https://arxiv.org/abs/2009.03300)
|
|
|
|
|
15 |
|
16 |
Columns and Metrics:
|
17 |
- Method: The MOE LLMs inference framework.
|
18 |
- E2E(s): Average End to End generation time in seconds.
|
19 |
- PRE(s): Prefilling Time of input prompt in seconds.
|
20 |
- T/s: Token throughput per second.
|
|
|
|
|
21 |
- Precision: The precision of the model used.
|
22 |
|
23 |
"""
|
src/display/utils.py
CHANGED
@@ -18,16 +18,12 @@ GPU_Power = 'Power(W)'
|
|
18 |
GPU_Mem = 'Mem(G)'
|
19 |
GPU_Name = "GPU"
|
20 |
GPU_Util = 'Util(%)'
|
21 |
-
MFU = 'S-MFU(%)'
|
22 |
-
MBU = 'S-MBU(%)'
|
23 |
BATCH_SIZE = 'bs'
|
24 |
PRECISION = "Precision"
|
25 |
system_metrics_to_name_map = {
|
26 |
"end_to_end_time": f"{E2Es}",
|
27 |
"prefilling_time": f"{PREs}",
|
28 |
"decoding_throughput": f"{TS}",
|
29 |
-
"mfu": f"{MFU}",
|
30 |
-
"mbu": f"{MBU}"
|
31 |
}
|
32 |
|
33 |
gpu_metrics_to_name_map = {
|
@@ -37,7 +33,7 @@ gpu_metrics_to_name_map = {
|
|
37 |
GPU_Mem: GPU_Mem,
|
38 |
"batch_size": BATCH_SIZE,
|
39 |
"precision": PRECISION,
|
40 |
-
GPU_Name: GPU_Name
|
41 |
}
|
42 |
|
43 |
@dataclass
|
@@ -77,11 +73,8 @@ class Tasks(Enum):
|
|
77 |
# halueval_dial = Task("halueval_dialogue", "acc", "HaluDial/Acc")
|
78 |
|
79 |
# # XXX include me back at some point
|
80 |
-
|
81 |
mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)
|
82 |
-
gsm8k = Task("gsm8k_custom", "em", "GSM8K") #GSM8K/EM (5-shot)
|
83 |
-
# gsm8k_cot = Task("gsm8k_cot", "em", "GSM8K COT") #GSM8K COT/EM (5-shot)
|
84 |
-
arena_hard = Task("arena_hard", "score", "Arena Hard") #Arena Hard/Score
|
85 |
|
86 |
|
87 |
# These classes are for user facing column names,
|
@@ -106,7 +99,7 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
|
|
106 |
# # auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
|
107 |
|
108 |
# Inference framework
|
109 |
-
auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnContent(f"{InFrame}", "str", True
|
110 |
|
111 |
for task in Tasks:
|
112 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
@@ -114,27 +107,25 @@ for task in Tasks:
|
|
114 |
auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} {E2Es}", "number", True, hidden=True)])
|
115 |
auto_eval_column_dict.append([f"{task.name}_batch_size", ColumnContent, ColumnContent(f"{task.value.col_name} {BATCH_SIZE}", "number", True, hidden=True)])
|
116 |
# auto_eval_column_dict.append([f"{task.name}_precision", ColumnContent, ColumnContent(f"{task.value.col_name} {PRECISION}", "str", True, hidden=True)])
|
117 |
-
|
118 |
auto_eval_column_dict.append([f"{task.name}_gpu", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Name}", "str", True, hidden=True)])
|
119 |
-
|
120 |
if task.value.benchmark in MULTIPLE_CHOICEs:
|
121 |
continue
|
122 |
# auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} {PREs}", "number", False, hidden=True)])
|
123 |
auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name} {TS}", "number", True, hidden=True)])
|
124 |
-
auto_eval_column_dict.append([f"{task.name}_mbu", ColumnContent, ColumnContent(f"{task.value.col_name} {MBU}", "number", True, hidden=True)])
|
125 |
-
auto_eval_column_dict.append([f"{task.name}_mfu", ColumnContent, ColumnContent(f"{task.value.col_name} {MFU}", "number", True, hidden=True)])
|
126 |
|
127 |
|
128 |
# Model information
|
129 |
-
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False
|
130 |
-
|
131 |
-
|
132 |
-
auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
# Dummy column for the search bar (hidden by the custom CSS)
|
139 |
auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
|
140 |
|
@@ -160,10 +151,10 @@ class ModelDetails:
|
|
160 |
|
161 |
|
162 |
class ModelType(Enum):
|
163 |
-
|
164 |
-
|
165 |
chat = ModelDetails(name="chat models (RLHF, DPO, IFT, ...)", symbol="💬")
|
166 |
-
|
167 |
Unknown = ModelDetails(name="", symbol="?")
|
168 |
|
169 |
def to_str(self, separator=" "):
|
@@ -171,24 +162,21 @@ class ModelType(Enum):
|
|
171 |
|
172 |
@staticmethod
|
173 |
def from_str(type):
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
if any([k in type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]):
|
179 |
return ModelType.chat
|
180 |
-
|
181 |
-
|
182 |
return ModelType.Unknown
|
183 |
|
184 |
|
185 |
class InferenceFramework(Enum):
|
186 |
# "moe-infinity", hf-chat
|
187 |
-
|
188 |
HF_Chat = ModelDetails("hf-chat")
|
189 |
-
VLLM = ModelDetails("vllm_moe")
|
190 |
-
TRTLLM = ModelDetails("tensorrt_llm")
|
191 |
-
VLLM_FIX = ModelDetails("vllm_moe_fixbs")
|
192 |
Unknown = ModelDetails("?")
|
193 |
|
194 |
def to_str(self):
|
@@ -196,21 +184,16 @@ class InferenceFramework(Enum):
|
|
196 |
|
197 |
@staticmethod
|
198 |
def from_str(inference_framework: str):
|
199 |
-
|
200 |
-
|
201 |
-
if inference_framework in ["tensorrt_llm"]:
|
202 |
-
return InferenceFramework.TRTLLM
|
203 |
if inference_framework in ["hf-chat"]:
|
204 |
return InferenceFramework.HF_Chat
|
205 |
-
if inference_framework in ["vllm_moe"]:
|
206 |
-
return InferenceFramework.VLLM
|
207 |
-
if inference_framework in ["vllm_moe_fixbs"]:
|
208 |
-
return InferenceFramework.VLLM_FIX
|
209 |
return InferenceFramework.Unknown
|
210 |
|
211 |
class GPUType(Enum):
|
212 |
-
|
213 |
A100_pcie = ModelDetails("NVIDIA-A100-PCIe-80GB")
|
|
|
214 |
Unknown = ModelDetails("?")
|
215 |
|
216 |
def to_str(self):
|
@@ -218,10 +201,12 @@ class GPUType(Enum):
|
|
218 |
|
219 |
@staticmethod
|
220 |
def from_str(gpu_type: str):
|
221 |
-
if gpu_type in ["NVIDIA-
|
222 |
return GPUType.A100_pcie
|
223 |
-
if gpu_type in ["NVIDIA-A100-
|
224 |
-
return GPUType.
|
|
|
|
|
225 |
return GPUType.Unknown
|
226 |
|
227 |
class WeightType(Enum):
|
@@ -231,28 +216,28 @@ class WeightType(Enum):
|
|
231 |
|
232 |
|
233 |
class Precision(Enum):
|
234 |
-
|
235 |
-
|
236 |
bfloat16 = ModelDetails("bfloat16")
|
237 |
qt_8bit = ModelDetails("8bit")
|
238 |
qt_4bit = ModelDetails("4bit")
|
239 |
-
|
240 |
Unknown = ModelDetails("?")
|
241 |
|
242 |
@staticmethod
|
243 |
def from_str(precision: str):
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
if precision in ["torch.bfloat16", "bfloat16"]:
|
249 |
return Precision.bfloat16
|
250 |
if precision in ["8bit"]:
|
251 |
return Precision.qt_8bit
|
252 |
if precision in ["4bit"]:
|
253 |
return Precision.qt_4bit
|
254 |
-
|
255 |
-
|
256 |
return Precision.Unknown
|
257 |
|
258 |
|
|
|
18 |
GPU_Mem = 'Mem(G)'
|
19 |
GPU_Name = "GPU"
|
20 |
GPU_Util = 'Util(%)'
|
|
|
|
|
21 |
BATCH_SIZE = 'bs'
|
22 |
PRECISION = "Precision"
|
23 |
system_metrics_to_name_map = {
|
24 |
"end_to_end_time": f"{E2Es}",
|
25 |
"prefilling_time": f"{PREs}",
|
26 |
"decoding_throughput": f"{TS}",
|
|
|
|
|
27 |
}
|
28 |
|
29 |
gpu_metrics_to_name_map = {
|
|
|
33 |
GPU_Mem: GPU_Mem,
|
34 |
"batch_size": BATCH_SIZE,
|
35 |
"precision": PRECISION,
|
36 |
+
GPU_Name: GPU_Name,
|
37 |
}
|
38 |
|
39 |
@dataclass
|
|
|
73 |
# halueval_dial = Task("halueval_dialogue", "acc", "HaluDial/Acc")
|
74 |
|
75 |
# # XXX include me back at some point
|
76 |
+
selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
|
77 |
mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)
|
|
|
|
|
|
|
78 |
|
79 |
|
80 |
# These classes are for user facing column names,
|
|
|
99 |
# # auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
|
100 |
|
101 |
# Inference framework
|
102 |
+
auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnContent(f"{InFrame}", "str", True)])
|
103 |
|
104 |
for task in Tasks:
|
105 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
|
|
107 |
auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} {E2Es}", "number", True, hidden=True)])
|
108 |
auto_eval_column_dict.append([f"{task.name}_batch_size", ColumnContent, ColumnContent(f"{task.value.col_name} {BATCH_SIZE}", "number", True, hidden=True)])
|
109 |
# auto_eval_column_dict.append([f"{task.name}_precision", ColumnContent, ColumnContent(f"{task.value.col_name} {PRECISION}", "str", True, hidden=True)])
|
110 |
+
auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True, hidden=True)])
|
111 |
auto_eval_column_dict.append([f"{task.name}_gpu", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Name}", "str", True, hidden=True)])
|
112 |
+
auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True, hidden=True)])
|
113 |
if task.value.benchmark in MULTIPLE_CHOICEs:
|
114 |
continue
|
115 |
# auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} {PREs}", "number", False, hidden=True)])
|
116 |
auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name} {TS}", "number", True, hidden=True)])
|
|
|
|
|
117 |
|
118 |
|
119 |
# Model information
|
120 |
+
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
121 |
+
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
122 |
+
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
|
123 |
+
auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True)])
|
124 |
+
auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
|
125 |
+
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
|
126 |
+
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
|
127 |
+
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
|
128 |
+
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
129 |
# Dummy column for the search bar (hidden by the custom CSS)
|
130 |
auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
|
131 |
|
|
|
151 |
|
152 |
|
153 |
class ModelType(Enum):
|
154 |
+
PT = ModelDetails(name="pretrained", symbol="🟢")
|
155 |
+
FT = ModelDetails(name="fine-tuned on domain-specific datasets", symbol="🔶")
|
156 |
chat = ModelDetails(name="chat models (RLHF, DPO, IFT, ...)", symbol="💬")
|
157 |
+
merges = ModelDetails(name="base merges and moerges", symbol="🤝")
|
158 |
Unknown = ModelDetails(name="", symbol="?")
|
159 |
|
160 |
def to_str(self, separator=" "):
|
|
|
162 |
|
163 |
@staticmethod
|
164 |
def from_str(type):
|
165 |
+
if "fine-tuned" in type or "🔶" in type:
|
166 |
+
return ModelType.FT
|
167 |
+
if "pretrained" in type or "🟢" in type:
|
168 |
+
return ModelType.PT
|
169 |
if any([k in type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]):
|
170 |
return ModelType.chat
|
171 |
+
if "merge" in type or "🤝" in type:
|
172 |
+
return ModelType.merges
|
173 |
return ModelType.Unknown
|
174 |
|
175 |
|
176 |
class InferenceFramework(Enum):
|
177 |
# "moe-infinity", hf-chat
|
178 |
+
MoE_Infinity = ModelDetails("moe-infinity")
|
179 |
HF_Chat = ModelDetails("hf-chat")
|
|
|
|
|
|
|
180 |
Unknown = ModelDetails("?")
|
181 |
|
182 |
def to_str(self):
|
|
|
184 |
|
185 |
@staticmethod
|
186 |
def from_str(inference_framework: str):
|
187 |
+
if inference_framework in ["moe-infinity"]:
|
188 |
+
return InferenceFramework.MoE_Infinity
|
|
|
|
|
189 |
if inference_framework in ["hf-chat"]:
|
190 |
return InferenceFramework.HF_Chat
|
|
|
|
|
|
|
|
|
191 |
return InferenceFramework.Unknown
|
192 |
|
193 |
class GPUType(Enum):
|
194 |
+
H100_pcie = ModelDetails("NVIDIA-H100-PCIe-80GB")
|
195 |
A100_pcie = ModelDetails("NVIDIA-A100-PCIe-80GB")
|
196 |
+
A5000 = ModelDetails("NVIDIA-RTX-A5000-24GB")
|
197 |
Unknown = ModelDetails("?")
|
198 |
|
199 |
def to_str(self):
|
|
|
201 |
|
202 |
@staticmethod
|
203 |
def from_str(gpu_type: str):
|
204 |
+
if gpu_type in ["NVIDIA-H100-PCIe-80GB"]:
|
205 |
return GPUType.A100_pcie
|
206 |
+
if gpu_type in ["NVIDIA-A100-PCIe-80GB"]:
|
207 |
+
return GPUType.H100_pcie
|
208 |
+
if gpu_type in ["NVIDIA-A5000-24GB"]:
|
209 |
+
return GPUType.A5000
|
210 |
return GPUType.Unknown
|
211 |
|
212 |
class WeightType(Enum):
|
|
|
216 |
|
217 |
|
218 |
class Precision(Enum):
|
219 |
+
float32 = ModelDetails("float32")
|
220 |
+
float16 = ModelDetails("float16")
|
221 |
bfloat16 = ModelDetails("bfloat16")
|
222 |
qt_8bit = ModelDetails("8bit")
|
223 |
qt_4bit = ModelDetails("4bit")
|
224 |
+
qt_GPTQ = ModelDetails("GPTQ")
|
225 |
Unknown = ModelDetails("?")
|
226 |
|
227 |
@staticmethod
|
228 |
def from_str(precision: str):
|
229 |
+
if precision in ["torch.float32", "float32"]:
|
230 |
+
return Precision.float32
|
231 |
+
if precision in ["torch.float16", "float16"]:
|
232 |
+
return Precision.float16
|
233 |
if precision in ["torch.bfloat16", "bfloat16"]:
|
234 |
return Precision.bfloat16
|
235 |
if precision in ["8bit"]:
|
236 |
return Precision.qt_8bit
|
237 |
if precision in ["4bit"]:
|
238 |
return Precision.qt_4bit
|
239 |
+
if precision in ["GPTQ", "None"]:
|
240 |
+
return Precision.qt_GPTQ
|
241 |
return Precision.Unknown
|
242 |
|
243 |
|
src/leaderboard/read_evals.py
CHANGED
@@ -65,11 +65,11 @@ class EvalResult:
|
|
65 |
if len(org_and_model) == 1:
|
66 |
org = None
|
67 |
model = org_and_model[0]
|
68 |
-
result_key = f"{model}_{precision.value.name}
|
69 |
else:
|
70 |
org = org_and_model[0]
|
71 |
model = org_and_model[1]
|
72 |
-
result_key = f"{org}_{model}_{precision.value.name}
|
73 |
full_model = "/".join(org_and_model)
|
74 |
|
75 |
still_on_hub, error, model_config = is_model_on_hub(
|
@@ -120,15 +120,11 @@ class EvalResult:
|
|
120 |
multiplier = 1.0
|
121 |
if "batch_" in metric or "Mem" in metric or "Util" in metric:
|
122 |
multiplier = 1
|
123 |
-
|
|
|
124 |
# print('RESULTS', data['results'])
|
125 |
# print('XXX', benchmark, metric, value, multiplier)
|
126 |
-
|
127 |
-
results[benchmark][metric] = "-"
|
128 |
-
elif value == "auto":
|
129 |
-
results[benchmark][metric] = "auto"
|
130 |
-
else:
|
131 |
-
results[benchmark][metric] = value * multiplier
|
132 |
|
133 |
res = EvalResult(
|
134 |
eval_name=result_key,
|
@@ -140,7 +136,6 @@ class EvalResult:
|
|
140 |
revision=config.get("model_sha", ""),
|
141 |
still_on_hub=still_on_hub,
|
142 |
architecture=architecture,
|
143 |
-
model_type=ModelType.from_str(config.get("model_type", "")),
|
144 |
inference_framework=inference_framework,
|
145 |
)
|
146 |
|
@@ -175,22 +170,22 @@ class EvalResult:
|
|
175 |
|
176 |
# breakpoint()
|
177 |
# average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
|
178 |
-
|
179 |
data_dict = {
|
180 |
"eval_name": self.eval_name, # not a column, just a save name,
|
181 |
AutoEvalColumn.precision.name: self.precision.value.name,
|
182 |
-
|
183 |
AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
|
184 |
-
|
185 |
-
|
186 |
AutoEvalColumn.model.name: make_clickable_model(self.full_model),
|
187 |
AutoEvalColumn.dummy.name: self.full_model,
|
188 |
-
|
189 |
-
#
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
AutoEvalColumn.inference_framework.name: self.inference_framework,
|
195 |
}
|
196 |
|
@@ -278,22 +273,15 @@ def get_raw_eval_results(results_path: str, requests_path: str, is_backend: bool
|
|
278 |
|
279 |
eval_results = {}
|
280 |
for model_result_filepath in tqdm(model_result_filepaths, desc="reading model_result_filepaths"):
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
eval_name
|
288 |
-
|
289 |
-
|
290 |
-
else:
|
291 |
-
eval_results[eval_name] = eval_result
|
292 |
-
|
293 |
-
except (FileNotFoundError, ValueError, KeyError, json.JSONDecodeError) as e:
|
294 |
-
# Log the error and continue with the next file
|
295 |
-
print(f"Error processing file {model_result_filepath}: {e}")
|
296 |
-
continue
|
297 |
|
298 |
results = []
|
299 |
for v in eval_results.values():
|
|
|
65 |
if len(org_and_model) == 1:
|
66 |
org = None
|
67 |
model = org_and_model[0]
|
68 |
+
result_key = f"{model}_{precision.value.name}"
|
69 |
else:
|
70 |
org = org_and_model[0]
|
71 |
model = org_and_model[1]
|
72 |
+
result_key = f"{org}_{model}_{precision.value.name}"
|
73 |
full_model = "/".join(org_and_model)
|
74 |
|
75 |
still_on_hub, error, model_config = is_model_on_hub(
|
|
|
120 |
multiplier = 1.0
|
121 |
if "batch_" in metric or "Mem" in metric or "Util" in metric:
|
122 |
multiplier = 1
|
123 |
+
|
124 |
+
|
125 |
# print('RESULTS', data['results'])
|
126 |
# print('XXX', benchmark, metric, value, multiplier)
|
127 |
+
results[benchmark][metric] = value * multiplier
|
|
|
|
|
|
|
|
|
|
|
128 |
|
129 |
res = EvalResult(
|
130 |
eval_name=result_key,
|
|
|
136 |
revision=config.get("model_sha", ""),
|
137 |
still_on_hub=still_on_hub,
|
138 |
architecture=architecture,
|
|
|
139 |
inference_framework=inference_framework,
|
140 |
)
|
141 |
|
|
|
170 |
|
171 |
# breakpoint()
|
172 |
# average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
|
173 |
+
|
174 |
data_dict = {
|
175 |
"eval_name": self.eval_name, # not a column, just a save name,
|
176 |
AutoEvalColumn.precision.name: self.precision.value.name,
|
177 |
+
AutoEvalColumn.model_type.name: self.model_type.value.name,
|
178 |
AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
|
179 |
+
AutoEvalColumn.weight_type.name: self.weight_type.value.name,
|
180 |
+
AutoEvalColumn.architecture.name: self.architecture,
|
181 |
AutoEvalColumn.model.name: make_clickable_model(self.full_model),
|
182 |
AutoEvalColumn.dummy.name: self.full_model,
|
183 |
+
AutoEvalColumn.revision.name: self.revision,
|
184 |
+
# AutoEvalColumn.average.name: average,
|
185 |
+
AutoEvalColumn.license.name: self.license,
|
186 |
+
AutoEvalColumn.likes.name: self.likes,
|
187 |
+
AutoEvalColumn.params.name: self.num_params,
|
188 |
+
AutoEvalColumn.still_on_hub.name: self.still_on_hub,
|
189 |
AutoEvalColumn.inference_framework.name: self.inference_framework,
|
190 |
}
|
191 |
|
|
|
273 |
|
274 |
eval_results = {}
|
275 |
for model_result_filepath in tqdm(model_result_filepaths, desc="reading model_result_filepaths"):
|
276 |
+
# Creation of result
|
277 |
+
eval_result = EvalResult.init_from_json_file(model_result_filepath, is_backend=is_backend)
|
278 |
+
eval_result.update_with_request_file(requests_path)
|
279 |
+
# Store results of same eval together
|
280 |
+
eval_name = eval_result.eval_name
|
281 |
+
if eval_name in eval_results.keys():
|
282 |
+
eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
|
283 |
+
else:
|
284 |
+
eval_results[eval_name] = eval_result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
285 |
|
286 |
results = []
|
287 |
for v in eval_results.values():
|
src/submission/check_validity.py
CHANGED
@@ -74,7 +74,7 @@ def is_model_on_hub(
|
|
74 |
|
75 |
|
76 |
def get_model_size(model_info: ModelInfo, precision: str):
|
77 |
-
size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
|
78 |
try:
|
79 |
model_size = round(model_info.safetensors["total"] / 1e9, 3)
|
80 |
except (AttributeError, TypeError):
|
@@ -130,8 +130,7 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
|
|
130 |
continue
|
131 |
with open(os.path.join(root, file), "r") as f:
|
132 |
info = json.load(f)
|
133 |
-
|
134 |
-
file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}_{info['gpu_type']}")
|
135 |
|
136 |
# Select organisation
|
137 |
if info["model"].count("/") == 0 or "submitted_time" not in info:
|
|
|
74 |
|
75 |
|
76 |
def get_model_size(model_info: ModelInfo, precision: str):
|
77 |
+
size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
|
78 |
try:
|
79 |
model_size = round(model_info.safetensors["total"] / 1e9, 3)
|
80 |
except (AttributeError, TypeError):
|
|
|
130 |
continue
|
131 |
with open(os.path.join(root, file), "r") as f:
|
132 |
info = json.load(f)
|
133 |
+
file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}_{info['gpu_type']}")
|
|
|
134 |
|
135 |
# Select organisation
|
136 |
if info["model"].count("/") == 0 or "submitted_time" not in info:
|
src/utils.py
CHANGED
@@ -3,54 +3,12 @@ from huggingface_hub import snapshot_download
|
|
3 |
import subprocess
|
4 |
import re
|
5 |
import os
|
6 |
-
import GPUtil
|
7 |
|
8 |
try:
|
9 |
from src.display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
|
10 |
except:
|
11 |
print("local debug: from display.utils")
|
12 |
from display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
|
13 |
-
|
14 |
-
MEM_BW_DICT ={
|
15 |
-
"NVIDIA-A100-PCIe-80GB": 1935,
|
16 |
-
"NVIDIA-A100-SXM-80GB": 2039,
|
17 |
-
"NVIDIA-H100-PCIe-80GB": 2039,
|
18 |
-
"NVIDIA-RTX-A5000-24GB": 768
|
19 |
-
}
|
20 |
-
|
21 |
-
PEAK_FLOPS_DICT = {
|
22 |
-
"float32":{
|
23 |
-
"NVIDIA-A100-PCIe-80GB": 312e12,
|
24 |
-
"NVIDIA-A100-SXM-80GB": 312e12,
|
25 |
-
"NVIDIA-H100-PCIe-80GB": 756e12,
|
26 |
-
"NVIDIA-RTX-A5000-24GB": 222.2e12
|
27 |
-
},
|
28 |
-
"float16":{
|
29 |
-
"NVIDIA-A100-PCIe-80GB": 624e12,
|
30 |
-
"NVIDIA-A100-SXM-80GB": 624e12,
|
31 |
-
"NVIDIA-H100-PCIe-80GB": 1513e12,
|
32 |
-
"NVIDIA-RTX-A5000-24GB": 444.4e12
|
33 |
-
},
|
34 |
-
"bfloat16":{
|
35 |
-
"NVIDIA-A100-PCIe-80GB": 624e12,
|
36 |
-
"NVIDIA-A100-SXM-80GB": 624e12,
|
37 |
-
"NVIDIA-H100-PCIe-80GB": 1513e12,
|
38 |
-
"NVIDIA-RTX-A5000-24GB": 444.4e12
|
39 |
-
},
|
40 |
-
"8bit":{
|
41 |
-
"NVIDIA-A100-PCIe-80GB": 1248e12,
|
42 |
-
"NVIDIA-A100-SXM-80GB": 1248e12,
|
43 |
-
"NVIDIA-H100-PCIe-80GB": 3026e12,
|
44 |
-
"NVIDIA-RTX-A5000-24GB": 889e12
|
45 |
-
},
|
46 |
-
"4bit": {
|
47 |
-
"NVIDIA-A100-PCIe-80GB": 2496e12,
|
48 |
-
"NVIDIA-A100-SXM-80GB": 2496e12,
|
49 |
-
"NVIDIA-H100-PCIe-80GB": 6052e12,
|
50 |
-
"NVIDIA-RTX-A5000-24GB": 1778e12
|
51 |
-
}
|
52 |
-
|
53 |
-
}
|
54 |
|
55 |
def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
|
56 |
for i in range(10):
|
@@ -94,12 +52,11 @@ def parse_nvidia_smi():
|
|
94 |
print("Failed to query GPU indices.")
|
95 |
return []
|
96 |
gpu_indices = result.stdout.strip().split('\n')
|
97 |
-
|
98 |
gpu_stats = []
|
99 |
|
100 |
gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
|
101 |
-
|
102 |
-
gpu_name_pattern = re.compile(r'NVIDIA\s+(RTX\s+)?([A-Z0-9]+)')
|
103 |
|
104 |
gpu_name = ""
|
105 |
for index in gpu_indices:
|
@@ -111,7 +68,7 @@ def parse_nvidia_smi():
|
|
111 |
name_match = gpu_name_pattern.search(line)
|
112 |
gpu_info = {}
|
113 |
if name_match:
|
114 |
-
gpu_name =
|
115 |
if match:
|
116 |
temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
|
117 |
gpu_info.update({
|
@@ -123,7 +80,7 @@ def parse_nvidia_smi():
|
|
123 |
|
124 |
if len(gpu_info) >= 4:
|
125 |
gpu_stats.append(gpu_info)
|
126 |
-
|
127 |
gpu_name = f"{len(gpu_stats)}x{gpu_name}"
|
128 |
gpu_stats_total = {
|
129 |
GPU_TEMP: 0,
|
@@ -174,38 +131,5 @@ def analyze_gpu_stats(stats_list):
|
|
174 |
|
175 |
return avg_stats
|
176 |
|
177 |
-
def get_gpu_details():
|
178 |
-
gpus = GPUtil.getGPUs()
|
179 |
-
gpu = gpus[0]
|
180 |
-
name = gpu.name.replace(" ", "-")
|
181 |
-
memory_gb = round(gpu.memoryTotal / 1024)
|
182 |
-
memory = f"{memory_gb}GB"
|
183 |
-
|
184 |
-
for part in name.split('-'):
|
185 |
-
if part.endswith("GB") and part[:-2].isdigit():
|
186 |
-
name = name.replace(f"-{part}", "").replace(part, "")
|
187 |
-
|
188 |
-
formatted_name = f"{name}-{memory}"
|
189 |
-
|
190 |
-
return formatted_name
|
191 |
-
|
192 |
-
def get_peak_bw(gpu_name):
|
193 |
-
return MEM_BW_DICT[gpu_name]
|
194 |
-
|
195 |
-
def get_peak_flops(gpu_name, precision):
|
196 |
-
return PEAK_FLOPS_DICT[precision][gpu_name]
|
197 |
-
|
198 |
-
def transfer_precision2bytes(precision):
|
199 |
-
if precision == "float32":
|
200 |
-
return 4
|
201 |
-
elif precision in ["float16", "bfloat16"]:
|
202 |
-
return 2
|
203 |
-
elif precision == "8bit":
|
204 |
-
return 1
|
205 |
-
elif precision == "4bit":
|
206 |
-
return 0.5
|
207 |
-
else:
|
208 |
-
raise ValueError(f"Unsupported precision: {precision}")
|
209 |
-
|
210 |
if __name__ == "__main__":
|
211 |
print(analyze_gpu_stats(parse_nvidia_smi()))
|
|
|
3 |
import subprocess
|
4 |
import re
|
5 |
import os
|
|
|
6 |
|
7 |
try:
|
8 |
from src.display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
|
9 |
except:
|
10 |
print("local debug: from display.utils")
|
11 |
from display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
|
14 |
for i in range(10):
|
|
|
52 |
print("Failed to query GPU indices.")
|
53 |
return []
|
54 |
gpu_indices = result.stdout.strip().split('\n')
|
55 |
+
print(f"gpu_indices: {gpu_indices}")
|
56 |
gpu_stats = []
|
57 |
|
58 |
gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
|
59 |
+
gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+?\d+GB)')
|
|
|
60 |
|
61 |
gpu_name = ""
|
62 |
for index in gpu_indices:
|
|
|
68 |
name_match = gpu_name_pattern.search(line)
|
69 |
gpu_info = {}
|
70 |
if name_match:
|
71 |
+
gpu_name = name_match.group(1).strip()
|
72 |
if match:
|
73 |
temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
|
74 |
gpu_info.update({
|
|
|
80 |
|
81 |
if len(gpu_info) >= 4:
|
82 |
gpu_stats.append(gpu_info)
|
83 |
+
print(f"gpu_stats: {gpu_stats}")
|
84 |
gpu_name = f"{len(gpu_stats)}x{gpu_name}"
|
85 |
gpu_stats_total = {
|
86 |
GPU_TEMP: 0,
|
|
|
131 |
|
132 |
return avg_stats
|
133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
if __name__ == "__main__":
|
135 |
print(analyze_gpu_stats(parse_nvidia_smi()))
|