Initial Commit
This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.
- README.md +9 -5
- app.py +400 -0
- eval-queue/.gitattributes +55 -0
- eval-queue/01-ai/Yi-1.5-9B-32K_eval_request_False_float16_Original.json +14 -0
- eval-queue/BioMistral/BioMistral-7B_eval_request_False_float16_Original.json +15 -0
- eval-queue/EleutherAI/polyglot-ko-1.3b_eval_request_False_float16_Original.json +14 -0
- eval-queue/HuggingFaceH4/zephyr-7b-beta_eval_request_False_float16_Original.json +15 -0
- eval-queue/README.md +3 -0
- eval-queue/nlpai-lab/KULLM3_eval_request_False_float16_Original.json +15 -0
- eval-queue/x2bee/POLAR-14B-DPO-v1.3_eval_request_False_float16_Original.json +15 -0
- eval-queue/x2bee/POLAR-14B-DPO-v1.4_eval_request_False_float16_Original.json +15 -0
- eval-queue/x2bee/POLAR-14B-HES-DPO-v1.5_eval_request_False_float16_Original.json +15 -0
- eval-queue/x2bee/POLAR-14B-SON-SFT-v0.1_eval_request_False_float16_Original.json +15 -0
- eval-queue/x2bee/POLAR-14B-v0.2_eval_request_False_float16_Original.json +15 -0
- eval-queue/x2bee/POLAR-14B-v0.5_eval_request_False_float16_Original.json +15 -0
- eval-results/.gitattributes +55 -0
- eval-results/01-ai/Yi-1.5-9B-32K/result_2024_07_30 20:36:30.json +450 -0
- eval-results/BioMistral/BioMistral-7B/BioMistral_BioMistral-7B_result_2024-05-30 01_33_58.json +450 -0
- eval-results/EleutherAI/polyglot-ko-1.3b/EleutherAI_polyglot-ko-1.3b_result_2023-09-24 15_21_38.json +450 -0
- eval-results/HuggingFaceH4/.DS_Store +0 -0
- eval-results/HuggingFaceH4/zephyr-7b-beta/result.json +450 -0
- eval-results/README.md +3 -0
- eval-results/nlpai-lab/KULLM3/result.json +450 -0
- eval-results/x2bee/POLAR-14B-DPO-v1.3/result.json +450 -0
- eval-results/x2bee/POLAR-14B-DPO-v1.4/result.json +450 -0
- eval-results/x2bee/POLAR-14B-HES-DPO-v1.5/result.json +450 -0
- eval-results/x2bee/POLAR-14B-SON-SFT-v0.1/result.json +450 -0
- eval-results/x2bee/POLAR-14B-v0.2/result.json +450 -0
- eval-results/x2bee/POLAR-14B-v0.5/result.json +450 -0
- requirements.txt +19 -0
- scripts/create_request_file.py +107 -0
- scripts/update_request_files.py +82 -0
- src/__pycache__/envs.cpython-310.pyc +0 -0
- src/__pycache__/populate.cpython-310.pyc +0 -0
- src/display/__pycache__/about.cpython-310.pyc +0 -0
- src/display/__pycache__/css_html_js.cpython-310.pyc +0 -0
- src/display/__pycache__/formatting.cpython-310.pyc +0 -0
- src/display/__pycache__/utils.cpython-310.pyc +0 -0
- src/display/about.py +84 -0
- src/display/css_html_js.py +84 -0
- src/display/formatting.py +40 -0
- src/display/utils.py +164 -0
- src/envs.py +32 -0
- src/leaderboard/__pycache__/filter_models.cpython-310.pyc +0 -0
- src/leaderboard/__pycache__/read_evals.cpython-310.pyc +0 -0
- src/leaderboard/filter_models.py +51 -0
- src/leaderboard/read_evals.py +272 -0
- src/populate.py +70 -0
- src/submission/__pycache__/check_validity.cpython-310.pyc +0 -0
- src/submission/__pycache__/submit.cpython-310.pyc +0 -0
README.md
CHANGED
@@ -1,13 +1,17 @@
 ---
 title: Self Improving Leaderboard
-emoji:
-colorFrom:
-colorTo:
+emoji: 🔄
+colorFrom: green
+colorTo: indigo
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.36.0
 app_file: app.py
-pinned:
+pinned: true
 license: apache-2.0
+duplicated_from: upstage/open-ko-llm-leaderboard
+fullWidth: true
+tags:
+- leaderboard
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,400 @@
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from gradio_space_ci import configure_space_ci  # FOR CI

from src.display.about import (
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    EVAL_COLS,
    EVAL_TYPES,
    NUMERIC_INTERVALS,
    TYPES,
    AutoEvalColumn,
    ModelType,
    fields,
    WeightType,
    Precision
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval
from src.tools.collections import update_collections
from src.tools.plots import (
    create_metric_plot_obj,
    create_plot_df,
    create_scores_df,
)


def restart_space():
    API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)

try:
    print(EVAL_REQUESTS_PATH)
    snapshot_download(
        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
    )
except Exception:
    restart_space()
try:
    print(EVAL_RESULTS_PATH)
    snapshot_download(
        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
    )
except Exception:
    restart_space()


_, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
leaderboard_df = original_df.copy()

(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
    failed_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)


# Searching and filtering
def update_table(
    hidden_df: pd.DataFrame,
    columns: list,
    type_query: list,
    precision_query: str,
    size_query: list,
    query: str,
    show_deleted: bool = False,
    show_merges: bool = False,
    show_flagged: bool = False,
):
    # The event handlers below pass six positional inputs, so `query` must come
    # before the unused display flags, which keep their defaults.
    filtered_df = filter_models(hidden_df, type_query, size_query, precision_query)
    filtered_df = filter_queries(query, filtered_df)
    df = select_columns(filtered_df, columns)
    return df

def quarter_update_table(
    hidden_df: pd.DataFrame,
    columns: list,
    type_query: list,
    precision_query: str,
    size_query: list,
    query: str,
    show_deleted: bool = False,
    show_merges: bool = False,
    show_flagged: bool = False,
):
    filtered_df = filter_models(hidden_df, type_query, size_query, precision_query)
    filtered_df = filter_queries(query, filtered_df)
    # Expects a quarter_select_columns helper that is not defined in this file.
    df = quarter_select_columns(filtered_df, columns)
    return df


def load_query(request: gr.Request):  # triggered only once at startup => read query parameter if it exists
    query = request.query_params.get("query") or ""
    return query, query  # return one for the "search_bar", one for a hidden component that triggers a reload only if value has changed


def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
    return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]


def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    always_here_cols = [
        AutoEvalColumn.model_type_symbol.name,
        AutoEvalColumn.model.name,
    ]
    # We use COLS to maintain sorting
    filtered_df = df[
        always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
    ]
    return filtered_df


def filter_queries(query: str, filtered_df: pd.DataFrame):
    """Added by Abishek"""
    final_df = []
    if query != "":
        queries = [q.strip() for q in query.split(";")]
        for _q in queries:
            _q = _q.strip()
            if _q != "":
                temp_filtered_df = search_table(filtered_df, _q)
                if len(temp_filtered_df) > 0:
                    final_df.append(temp_filtered_df)
        if len(final_df) > 0:
            filtered_df = pd.concat(final_df)
            filtered_df = filtered_df.drop_duplicates(
                subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
            )

    return filtered_df


def filter_models(
    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list
) -> pd.DataFrame:

    type_emoji = [t[0] for t in type_query]
    df = df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
    df = df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
    numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
    params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
    mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
    df = df.loc[mask]

    return df


leaderboard_df = filter_models(leaderboard_df, [t.to_str(" : ") for t in ModelType], list(NUMERIC_INTERVALS.keys()), [i.value.name for i in Precision])

print(leaderboard_df)

demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🔄 Self-Improving Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        search_bar = gr.Textbox(
                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
                            show_label=False,
                            elem_id="search-bar",
                        )
                    with gr.Row():
                        shown_columns = gr.CheckboxGroup(
                            choices=[
                                c.name
                                for c in fields(AutoEvalColumn)
                                if not c.hidden and not c.never_hidden and not c.dummy
                            ],
                            value=[
                                c.name
                                for c in fields(AutoEvalColumn)
                                if c.displayed_by_default and not c.hidden and not c.never_hidden
                            ],
                            label="Select columns to show",
                            elem_id="column-select",
                            interactive=True,
                        )

                with gr.Column(min_width=320):
                    #with gr.Box(elem_id="box-filter"):
                    filter_columns_type = gr.CheckboxGroup(
                        label="Model types",
                        choices=[t.to_str() for t in ModelType],
                        value=[t.to_str() for t in ModelType],
                        interactive=True,
                        elem_id="filter-columns-type",
                    )
                    filter_columns_precision = gr.CheckboxGroup(
                        label="Precision",
                        choices=[i.value.name for i in Precision],
                        value=[i.value.name for i in Precision],
                        interactive=True,
                        elem_id="filter-columns-precision",
                    )
                    filter_columns_size = gr.CheckboxGroup(
                        label="Model sizes (in billions of parameters)",
                        choices=list(NUMERIC_INTERVALS.keys()),
                        value=list(NUMERIC_INTERVALS.keys()),
                        interactive=True,
                        elem_id="filter-columns-size",
                    )

            leaderboard_table = gr.components.Dataframe(
                value=leaderboard_df[
                    [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
                    + shown_columns.value
                    + [AutoEvalColumn.dummy.name]
                ],
                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
                datatype=TYPES,
                elem_id="leaderboard-table",
                interactive=False,
                visible=True,
                #column_widths=["2%", "33%"]
            )

            # Dummy leaderboard for handling the case when the user uses backspace key
            hidden_leaderboard_table_for_search = gr.components.Dataframe(
                value=original_df[COLS],
                headers=COLS,
                datatype=TYPES,
                visible=False,
            )
            search_bar.submit(
                update_table,
                [
                    hidden_leaderboard_table_for_search,
                    shown_columns,
                    filter_columns_type,
                    filter_columns_precision,
                    filter_columns_size,
                    search_bar,
                ],
                leaderboard_table,
            )

            # Define a hidden component that will trigger a reload only if a query parameter has been set
            hidden_search_bar = gr.Textbox(value="", visible=False)
            hidden_search_bar.change(
                update_table,
                [
                    hidden_leaderboard_table_for_search,
                    shown_columns,
                    filter_columns_type,
                    filter_columns_precision,
                    filter_columns_size,
                    search_bar,
                ],
                leaderboard_table,
            )
            # Check query parameter once at startup and update search bar + hidden component
            demo.load(load_query, inputs=[], outputs=[search_bar, hidden_search_bar])

            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size]:
                selector.change(
                    update_table,
                    [
                        hidden_leaderboard_table_for_search,
                        shown_columns,
                        filter_columns_type,
                        filter_columns_precision,
                        filter_columns_size,
                        search_bar,
                    ],
                    leaderboard_table,
                    queue=True,
                )

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=2):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

                with gr.Column():
                    with gr.Accordion(
                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            finished_eval_table = gr.components.Dataframe(
                                value=finished_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            running_eval_table = gr.components.Dataframe(
                                value=running_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )

                    with gr.Accordion(
                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            pending_eval_table = gr.components.Dataframe(
                                value=pending_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"❌ Failed Evaluations ({len(failed_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            failed_eval_table = gr.components.Dataframe(
                                value=failed_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                with gr.Row():
                    gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

                with gr.Row():
                    with gr.Column():
                        model_name_textbox = gr.Textbox(label="Model name")
                        revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                        private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
                        model_type = gr.Dropdown(
                            choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                            label="Model type",
                            multiselect=False,
                            value=ModelType.IFT.to_str(" : "),
                            interactive=True,
                        )

                    with gr.Column():
                        precision = gr.Dropdown(
                            choices=[i.value.name for i in Precision if i != Precision.Unknown],
                            label="Precision",
                            multiselect=False,
                            value="float16",
                            interactive=True,
                        )
                        weight_type = gr.Dropdown(
                            choices=[i.value.name for i in WeightType],
                            label="Weights type",
                            multiselect=False,
                            value="Original",
                            interactive=True,
                        )
                        base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

                submit_button = gr.Button("Submit Evaluation!")
                submission_result = gr.Markdown()
                submit_button.click(
                    add_new_eval,
                    [
                        model_name_textbox,
                        base_model_name_textbox,
                        revision_name_textbox,
                        precision,
                        private,
                        weight_type,
                        model_type,
                    ],
                    submission_result,
                )

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()

# Both launches the space and its CI
configure_space_ci(
    demo.queue(default_concurrency_limit=40),
    trusted_authors=[],  # add manually trusted authors
    private="True",  # ephemeral spaces will have same visibility as the main space. Otherwise, set to `True` or `False` explicitly.
    variables={},  # We overwrite HF_HOME as tmp CI spaces will have no cache
    secrets=["HF_TOKEN", "H4_TOKEN"],  # which secret do I want to copy from the main space? Can be a `List[str]`.
    hardware=None,  # "cpu-basic" by default. Otherwise set to "auto" to have same hardware as the main space or any valid string value.
    storage=None,  # no storage by default. Otherwise set to "auto" to have same storage as the main space or any valid string value.
).launch()
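The search box wired to update_table above accepts several substring queries separated by `;`: filter_queries runs each one through search_table and concatenates the matches, de-duplicating on model/precision/revision. A minimal standalone sketch of that behavior, using a toy DataFrame and an illustrative `model` column rather than the leaderboard's real AutoEvalColumn.dummy.name field:

import pandas as pd

def search(df: pd.DataFrame, query: str, col: str = "model") -> pd.DataFrame:
    # Keep rows whose `col` contains any ';'-separated sub-query, case-insensitively.
    parts = [q.strip() for q in query.split(";") if q.strip()]
    if not parts:
        return df
    hits = [df[df[col].str.contains(q, case=False)] for q in parts]
    return pd.concat(hits).drop_duplicates()

df = pd.DataFrame({"model": ["x2bee/POLAR-14B-v0.2", "nlpai-lab/KULLM3", "01-ai/Yi-1.5-9B-32K"]})
print(search(df, "polar; yi"))  # keeps the POLAR and Yi rows, drops KULLM3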
eval-queue/.gitattributes
ADDED
@@ -0,0 +1,55 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.lz4 filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
# Audio files - uncompressed
*.pcm filter=lfs diff=lfs merge=lfs -text
*.sam filter=lfs diff=lfs merge=lfs -text
*.raw filter=lfs diff=lfs merge=lfs -text
# Audio files - compressed
*.aac filter=lfs diff=lfs merge=lfs -text
*.flac filter=lfs diff=lfs merge=lfs -text
*.mp3 filter=lfs diff=lfs merge=lfs -text
*.ogg filter=lfs diff=lfs merge=lfs -text
*.wav filter=lfs diff=lfs merge=lfs -text
# Image files - uncompressed
*.bmp filter=lfs diff=lfs merge=lfs -text
*.gif filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
*.tiff filter=lfs diff=lfs merge=lfs -text
# Image files - compressed
*.jpg filter=lfs diff=lfs merge=lfs -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
*.webp filter=lfs diff=lfs merge=lfs -text
eval-queue/01-ai/Yi-1.5-9B-32K_eval_request_False_float16_Original.json
ADDED
@@ -0,0 +1,14 @@
{
    "model": "01-ai/Yi-1.5-9B-32K",
    "base_model": "",
    "revision": "c0239dbc923b8a2b5ca849763bdd592d39c60850",
    "private": false,
    "precision": "float16",
    "weight_type": "Original",
    "status": "FINISHED",
    "submitted_time": "2024-07-29T13:10:13Z",
    "model_type": "\ud83d\udfe2 : pretrained",
    "likes": 18,
    "params": 8.829,
    "license": "apache-2.0"
}
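Each file under eval-queue/ is a flat JSON record like the one above, so downstream tooling can load it directly. A minimal sketch of reading one into a typed record; the EvalRequest dataclass and load_request helper here are illustrative, not part of this repo:

import json
from dataclasses import dataclass

@dataclass
class EvalRequest:
    model: str
    revision: str
    precision: str
    weight_type: str
    status: str

def load_request(path: str) -> EvalRequest:
    # Pull out the fields an evaluation runner needs; extra keys (likes, params, ...) are ignored.
    with open(path) as f:
        data = json.load(f)
    return EvalRequest(
        model=data["model"],
        revision=data.get("revision", "main"),
        precision=data["precision"],
        weight_type=data["weight_type"],
        status=data["status"],
    )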
eval-queue/BioMistral/BioMistral-7B_eval_request_False_float16_Original.json
ADDED
@@ -0,0 +1,15 @@
{
    "model": "BioMistral/BioMistral-7B",
    "base_model": "",
    "revision": "main",
    "private": false,
    "precision": "float16",
    "weight_type": "Original",
    "status": "FINISHED",
    "submitted_time": "2024-05-30 01:33:58",
    "model_type": "\u2b55 : instruction-tuned",
    "job_id": "2031",
    "params": 7.0,
    "likes": 354,
    "license": "apache-2.0"
}
eval-queue/EleutherAI/polyglot-ko-1.3b_eval_request_False_float16_Original.json
ADDED
@@ -0,0 +1,14 @@
{
    "model": "EleutherAI/polyglot-ko-1.3b",
    "base_model": "",
    "revision": "main",
    "private": false,
    "precision": "float16",
    "weight_type": "Original",
    "status": "FINISHED",
    "submitted_time": "2024-07-25T11:04:40Z",
    "model_type": "\ud83d\udfe2 : pretrained",
    "likes": 71,
    "params": 1.432,
    "license": "apache-2.0"
}
eval-queue/HuggingFaceH4/zephyr-7b-beta_eval_request_False_float16_Original.json
ADDED
@@ -0,0 +1,15 @@
{
    "model": "HuggingFaceH4/zephyr-7b-beta",
    "base_model": "",
    "revision": "main",
    "private": false,
    "precision": "float16",
    "weight_type": "Original",
    "status": "FINISHED",
    "submitted_time": "2023-11-01 04:21:47",
    "model_type": "\u2b55 : instruction-tuned",
    "job_id": "401",
    "params": 7.242,
    "likes": 1162,
    "license": "mit"
}
eval-queue/README.md
ADDED
@@ -0,0 +1,3 @@
---
license: apache-2.0
---
eval-queue/nlpai-lab/KULLM3_eval_request_False_float16_Original.json
ADDED
@@ -0,0 +1,15 @@
{
    "model": "nlpai-lab/KULLM3",
    "base_model": "",
    "revision": "main",
    "private": false,
    "precision": "float16",
    "weight_type": "Original",
    "status": "FINISHED",
    "submitted_time": "2024-04-08 05:16:47",
    "model_type": "\u2b55 : instruction-tuned",
    "job_id": "1751",
    "params": 10.732000350952148,
    "likes": 13,
    "license": "cc-by-nc-4.0"
}
eval-queue/x2bee/POLAR-14B-DPO-v1.3_eval_request_False_float16_Original.json
ADDED
@@ -0,0 +1,15 @@
{
    "model": "x2bee/POLAR-14B-DPO-v1.3",
    "base_model": "",
    "revision": "main",
    "private": false,
    "precision": "float16",
    "weight_type": "Original",
    "status": "FINISHED",
    "submitted_time": "2024-05-23 11:59:50",
    "model_type": "\u2b55 : instruction-tuned",
    "job_id": "1987",
    "params": 14.220999717712402,
    "likes": 0,
    "license": "apache-2.0"
}
eval-queue/x2bee/POLAR-14B-DPO-v1.4_eval_request_False_float16_Original.json
ADDED
@@ -0,0 +1,15 @@
{
    "model": "x2bee/POLAR-14B-DPO-v1.4",
    "base_model": "",
    "revision": "main",
    "private": false,
    "precision": "float16",
    "weight_type": "Original",
    "status": "FINISHED",
    "submitted_time": "2024-05-27 15:02:47",
    "model_type": "\u2b55 : instruction-tuned",
    "job_id": "2004",
    "params": 14.220999717712402,
    "likes": 0,
    "license": "apache-2.0"
}
eval-queue/x2bee/POLAR-14B-HES-DPO-v1.5_eval_request_False_float16_Original.json
ADDED
@@ -0,0 +1,15 @@
{
    "model": "x2bee/POLAR-14B-HES-DPO-v1.5",
    "base_model": "",
    "revision": "main",
    "private": false,
    "precision": "float16",
    "weight_type": "Original",
    "status": "FINISHED",
    "submitted_time": "2024-05-29 23:53:33",
    "model_type": "\u2b55 : instruction-tuned",
    "job_id": "2029",
    "params": 14.220999717712402,
    "likes": 0,
    "license": "apache-2.0"
}
eval-queue/x2bee/POLAR-14B-SON-SFT-v0.1_eval_request_False_float16_Original.json
ADDED
@@ -0,0 +1,15 @@
{
    "model": "x2bee/POLAR-14B-SON-SFT-v0.1",
    "base_model": "x2bee/POLAR-14B-v0.2",
    "revision": "main",
    "private": false,
    "precision": "float16",
    "weight_type": "Original",
    "status": "FINISHED",
    "submitted_time": "2024-05-27 13:52:58",
    "model_type": "\u2b55 : instruction-tuned",
    "job_id": "2003",
    "params": 14.220999717712402,
    "likes": 0,
    "license": "apache-2.0"
}
eval-queue/x2bee/POLAR-14B-v0.2_eval_request_False_float16_Original.json
ADDED
@@ -0,0 +1,15 @@
{
    "model": "x2bee/POLAR-14B-v0.2",
    "base_model": "",
    "revision": "main",
    "private": false,
    "precision": "float16",
    "weight_type": "Original",
    "status": "FINISHED",
    "submitted_time": "2024-05-02 00:34:33",
    "model_type": "\ud83d\udfe2 : pretrained",
    "job_id": "1874",
    "params": 14.220999717712402,
    "likes": 0,
    "license": "apache-2.0"
}
eval-queue/x2bee/POLAR-14B-v0.5_eval_request_False_float16_Original.json
ADDED
@@ -0,0 +1,15 @@
{
    "model": "x2bee/POLAR-14B-v0.5",
    "base_model": "",
    "revision": "main",
    "private": false,
    "precision": "float16",
    "weight_type": "Original",
    "status": "FINISHED",
    "submitted_time": "2024-06-05 00:49:59",
    "model_type": "\ud83d\udfe2 : pretrained",
    "job_id": "2041",
    "params": 14.220999717712402,
    "likes": 0,
    "license": "apache-2.0"
}
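All of the request files above share one naming pattern: <org>/<model>_eval_request_<private>_<precision>_<weight_type>.json. A hypothetical helper that reproduces that pattern; the repo's actual logic lives in scripts/create_request_file.py, whose contents are not shown in this diff:

import os

def request_file_path(root: str, model: str, private: bool, precision: str, weight_type: str) -> str:
    # e.g. eval-queue/x2bee/POLAR-14B-v0.5_eval_request_False_float16_Original.json
    org, name = model.split("/", 1)
    return os.path.join(root, org, f"{name}_eval_request_{private}_{precision}_{weight_type}.json")

print(request_file_path("eval-queue", "x2bee/POLAR-14B-v0.5", False, "float16", "Original"))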
eval-results/.gitattributes
ADDED
@@ -0,0 +1,55 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.lz4 filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
# Audio files - uncompressed
*.pcm filter=lfs diff=lfs merge=lfs -text
*.sam filter=lfs diff=lfs merge=lfs -text
*.raw filter=lfs diff=lfs merge=lfs -text
# Audio files - compressed
*.aac filter=lfs diff=lfs merge=lfs -text
*.flac filter=lfs diff=lfs merge=lfs -text
*.mp3 filter=lfs diff=lfs merge=lfs -text
*.ogg filter=lfs diff=lfs merge=lfs -text
*.wav filter=lfs diff=lfs merge=lfs -text
# Image files - uncompressed
*.bmp filter=lfs diff=lfs merge=lfs -text
*.gif filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
*.tiff filter=lfs diff=lfs merge=lfs -text
# Image files - compressed
*.jpg filter=lfs diff=lfs merge=lfs -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
*.webp filter=lfs diff=lfs merge=lfs -text
eval-results/01-ai/Yi-1.5-9B-32K/result_2024_07_30 20:36:30.json
ADDED
@@ -0,0 +1,450 @@
{
  "results": {
    "daily": {"daily": 7},
    "quarterly": {"quarterly": 7},
    "harness|arc_challenge|25": {"acc": 0.29948805460750855, "acc_stderr": 0.013385021637313567, "acc_norm": 0.3506825938566553, "acc_norm_stderr": 0.013944635930726089},
    "harness|hellaswag|10": {"acc": 0.3333001394144593, "acc_stderr": 0.004704293898729902, "acc_norm": 0.4137621987651862, "acc_norm_stderr": 0.004915003499517831},
    "harness|mmlu_world_religions|5": {"acc": 0.47953216374269003, "acc_stderr": 0.0383161053282193, "acc_norm": 0.47953216374269003, "acc_norm_stderr": 0.0383161053282193},
    "harness|mmlu_management|5": {"acc": 0.5631067961165048, "acc_stderr": 0.049111471073657764, "acc_norm": 0.5631067961165048, "acc_norm_stderr": 0.049111471073657764},
    "harness|mmlu_miscellaneous|5": {"acc": 0.47509578544061304, "acc_stderr": 0.01785777070490102, "acc_norm": 0.47509578544061304, "acc_norm_stderr": 0.01785777070490102},
    "harness|mmlu_anatomy|5": {"acc": 0.28888888888888886, "acc_stderr": 0.0391545063041425, "acc_norm": 0.28888888888888886, "acc_norm_stderr": 0.0391545063041425},
    "harness|mmlu_abstract_algebra|5": {"acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316},
    "harness|mmlu_conceptual_physics|5": {"acc": 0.46808510638297873, "acc_stderr": 0.03261936918467382, "acc_norm": 0.46808510638297873, "acc_norm_stderr": 0.03261936918467382},
    "harness|mmlu_virology|5": {"acc": 0.45180722891566266, "acc_stderr": 0.03874371556587953, "acc_norm": 0.45180722891566266, "acc_norm_stderr": 0.03874371556587953},
    "harness|mmlu_philosophy|5": {"acc": 0.47266881028938906, "acc_stderr": 0.028355633568328188, "acc_norm": 0.47266881028938906, "acc_norm_stderr": 0.028355633568328188},
    "harness|mmlu_human_aging|5": {"acc": 0.45739910313901344, "acc_stderr": 0.033435777055830646, "acc_norm": 0.45739910313901344, "acc_norm_stderr": 0.033435777055830646},
    "harness|mmlu_human_sexuality|5": {"acc": 0.5267175572519084, "acc_stderr": 0.04379024936553894, "acc_norm": 0.5267175572519084, "acc_norm_stderr": 0.04379024936553894},
    "harness|mmlu_medical_genetics|5": {"acc": 0.39, "acc_stderr": 0.04902071300001975, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001975},
    "harness|mmlu_high_school_geography|5": {"acc": 0.5555555555555556, "acc_stderr": 0.035402943770953675, "acc_norm": 0.5555555555555556, "acc_norm_stderr": 0.035402943770953675},
    "harness|mmlu_electrical_engineering|5": {"acc": 0.5724137931034483, "acc_stderr": 0.04122737111370332, "acc_norm": 0.5724137931034483, "acc_norm_stderr": 0.04122737111370332},
    "harness|mmlu_college_physics|5": {"acc": 0.3137254901960784, "acc_stderr": 0.04617034827006716, "acc_norm": 0.3137254901960784, "acc_norm_stderr": 0.04617034827006716},
    "harness|mmlu_high_school_microeconomics|5": {"acc": 0.5, "acc_stderr": 0.032478490123081544, "acc_norm": 0.5, "acc_norm_stderr": 0.032478490123081544},
    "harness|mmlu_high_school_macroeconomics|5": {"acc": 0.47692307692307695, "acc_stderr": 0.025323990861736125, "acc_norm": 0.47692307692307695, "acc_norm_stderr": 0.025323990861736125},
    "harness|mmlu_computer_security|5": {"acc": 0.53, "acc_stderr": 0.05016135580465919, "acc_norm": 0.53, "acc_norm_stderr": 0.05016135580465919},
    "harness|mmlu_global_facts|5": {"acc": 0.37, "acc_stderr": 0.048523658709391, "acc_norm": 0.37, "acc_norm_stderr": 0.048523658709391},
    "harness|mmlu_jurisprudence|5": {"acc": 0.5740740740740741, "acc_stderr": 0.047803436269367894, "acc_norm": 0.5740740740740741, "acc_norm_stderr": 0.047803436269367894},
    "harness|mmlu_high_school_chemistry|5": {"acc": 0.4187192118226601, "acc_stderr": 0.03471192860518468, "acc_norm": 0.4187192118226601, "acc_norm_stderr": 0.03471192860518468},
    "harness|mmlu_high_school_biology|5": {"acc": 0.47419354838709676, "acc_stderr": 0.02840609505765332, "acc_norm": 0.47419354838709676, "acc_norm_stderr": 0.02840609505765332},
    "harness|mmlu_marketing|5": {"acc": 0.6752136752136753, "acc_stderr": 0.03067902276549883, "acc_norm": 0.6752136752136753, "acc_norm_stderr": 0.03067902276549883},
    "harness|mmlu_clinical_knowledge|5": {"acc": 0.44150943396226416, "acc_stderr": 0.030561590426731833, "acc_norm": 0.44150943396226416, "acc_norm_stderr": 0.030561590426731833},
    "harness|mmlu_public_relations|5": {"acc": 0.4727272727272727, "acc_stderr": 0.04782001791380063, "acc_norm": 0.4727272727272727, "acc_norm_stderr": 0.04782001791380063},
    "harness|mmlu_high_school_mathematics|5": {"acc": 0.4185185185185185, "acc_stderr": 0.030078013075022066, "acc_norm": 0.4185185185185185, "acc_norm_stderr": 0.030078013075022066},
    "harness|mmlu_high_school_physics|5": {"acc": 0.304635761589404, "acc_stderr": 0.03757949922943343, "acc_norm": 0.304635761589404, "acc_norm_stderr": 0.03757949922943343},
    "harness|mmlu_sociology|5": {"acc": 0.6069651741293532, "acc_stderr": 0.0345368246603156, "acc_norm": 0.6069651741293532, "acc_norm_stderr": 0.0345368246603156},
    "harness|mmlu_college_medicine|5": {"acc": 0.4046242774566474, "acc_stderr": 0.03742461193887248, "acc_norm": 0.4046242774566474, "acc_norm_stderr": 0.03742461193887248},
    "harness|mmlu_elementary_mathematics|5": {"acc": 0.5476190476190477, "acc_stderr": 0.02563425811555495, "acc_norm": 0.5476190476190477, "acc_norm_stderr": 0.02563425811555495},
    "harness|mmlu_college_biology|5": {"acc": 0.3472222222222222, "acc_stderr": 0.039812405437178615, "acc_norm": 0.3472222222222222, "acc_norm_stderr": 0.039812405437178615},
    "harness|mmlu_college_chemistry|5": {"acc": 0.33, "acc_stderr": 0.04725815626252605, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252605},
    "harness|mmlu_us_foreign_policy|5": {"acc": 0.57, "acc_stderr": 0.04975698519562426, "acc_norm": 0.57, "acc_norm_stderr": 0.04975698519562426},
    "harness|mmlu_moral_disputes|5": {"acc": 0.49710982658959535, "acc_stderr": 0.026918645383239015, "acc_norm": 0.49710982658959535, "acc_norm_stderr": 0.026918645383239015},
    "harness|mmlu_logical_fallacies|5": {"acc": 0.5276073619631901, "acc_stderr": 0.03922378290610991, "acc_norm": 0.5276073619631901, "acc_norm_stderr": 0.03922378290610991},
    "harness|mmlu_prehistory|5": {"acc": 0.49691358024691357, "acc_stderr": 0.027820214158594377, "acc_norm": 0.49691358024691357, "acc_norm_stderr": 0.027820214158594377},
    "harness|mmlu_college_mathematics|5": {"acc": 0.45, "acc_stderr": 0.05, "acc_norm": 0.45, "acc_norm_stderr": 0.05},
    "harness|mmlu_high_school_government_and_politics|5": {"acc": 0.49222797927461137, "acc_stderr": 0.03608003225569654, "acc_norm": 0.49222797927461137, "acc_norm_stderr": 0.03608003225569654},
    "harness|mmlu_econometrics|5": {"acc": 0.41228070175438597, "acc_stderr": 0.046306532033665956, "acc_norm": 0.41228070175438597, "acc_norm_stderr": 0.046306532033665956},
    "harness|mmlu_high_school_psychology|5": {"acc": 0.5027522935779817, "acc_stderr": 0.02143699835976532, "acc_norm": 0.5027522935779817, "acc_norm_stderr": 0.02143699835976532},
    "harness|mmlu_formal_logic|5": {"acc": 0.40476190476190477, "acc_stderr": 0.04390259265377561, "acc_norm": 0.40476190476190477, "acc_norm_stderr": 0.04390259265377561},
    "harness|mmlu_nutrition|5": {"acc": 0.49019607843137253, "acc_stderr": 0.028624412550167958, "acc_norm": 0.49019607843137253, "acc_norm_stderr": 0.028624412550167958},
    "harness|mmlu_business_ethics|5": {"acc": 0.5, "acc_stderr": 0.050251890762960605, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605},
    "harness|mmlu_international_law|5": {"acc": 0.7355371900826446, "acc_stderr": 0.04026187527591205, "acc_norm": 0.7355371900826446, "acc_norm_stderr": 0.04026187527591205},
    "harness|mmlu_astronomy|5": {"acc": 0.45394736842105265, "acc_stderr": 0.04051646342874142, "acc_norm": 0.45394736842105265, "acc_norm_stderr": 0.04051646342874142},
    "harness|mmlu_professional_psychology|5": {"acc": 0.39705882352941174, "acc_stderr": 0.019794488900024113, "acc_norm": 0.39705882352941174, "acc_norm_stderr": 0.019794488900024113},
    "harness|mmlu_professional_accounting|5": {"acc": 0.40070921985815605, "acc_stderr": 0.029233465745573086, "acc_norm": 0.40070921985815605, "acc_norm_stderr": 0.029233465745573086},
    "harness|mmlu_machine_learning|5": {"acc": 0.39285714285714285, "acc_stderr": 0.04635550135609976, "acc_norm": 0.39285714285714285, "acc_norm_stderr": 0.04635550135609976},
    "harness|mmlu_high_school_statistics|5": {"acc": 0.4675925925925926, "acc_stderr": 0.034028015813589656, "acc_norm": 0.4675925925925926, "acc_norm_stderr": 0.034028015813589656},
    "harness|mmlu_moral_scenarios|5": {"acc": 0.3329608938547486, "acc_stderr": 0.015761716178397552, "acc_norm": 0.3329608938547486, "acc_norm_stderr": 0.015761716178397552},
    "harness|mmlu_college_computer_science|5": {"acc": 0.43, "acc_stderr": 0.049756985195624284, "acc_norm": 0.43, "acc_norm_stderr": 0.049756985195624284},
    "harness|mmlu_high_school_computer_science|5": {"acc": 0.76, "acc_stderr": 0.042923469599092816, "acc_norm": 0.76, "acc_norm_stderr": 0.042923469599092816},
    "harness|mmlu_professional_medicine|5": {"acc": 0.35294117647058826, "acc_stderr": 0.029029422815681404, "acc_norm": 0.35294117647058826, "acc_norm_stderr": 0.029029422815681404},
    "harness|mmlu_security_studies|5": {"acc": 0.6163265306122448, "acc_stderr": 0.031130880396235943, "acc_norm": 0.6163265306122448, "acc_norm_stderr": 0.031130880396235943},
    "harness|mmlu_high_school_world_history|5": {"acc": 0.5654008438818565, "acc_stderr": 0.03226759995510145, "acc_norm": 0.5654008438818565, "acc_norm_stderr": 0.03226759995510145},
    "harness|mmlu_professional_law|5": {"acc": 0.36571056062581486, "acc_stderr": 0.012301028188840567, "acc_norm": 0.36571056062581486, "acc_norm_stderr": 0.012301028188840567},
    "harness|mmlu_high_school_us_history|5": {"acc": 0.4852941176470588, "acc_stderr": 0.03507793834791324, "acc_norm": 0.4852941176470588, "acc_norm_stderr": 0.03507793834791324},
    "harness|mmlu_high_school_european_history|5": {"acc": 0.5151515151515151, "acc_stderr": 0.03902551007374448, "acc_norm": 0.5151515151515151, "acc_norm_stderr": 0.03902551007374448},
    "harness|truthfulqa_mc|0": {"mc1": 0.2937576499388005, "mc1_stderr": 0.015945068581236614, "mc2": 0.4670848140389129, "mc2_stderr": 0.01585178282587417},
    "harness|commongen_v2|2": {"acc": 0.47107438016528924, "acc_stderr": 0.017161563949916348, "acc_norm": 0.5171192443919717, "acc_norm_stderr": 0.017180275246085626}
  },
  "versions": {
    "all": 0, "harness|arc_challenge|25": 0, "harness|hellaswag|10": 0,
    "harness|mmlu_world_religions|5": 1, "harness|mmlu_management|5": 1, "harness|mmlu_miscellaneous|5": 1,
    "harness|mmlu_anatomy|5": 1, "harness|mmlu_abstract_algebra|5": 1, "harness|mmlu_conceptual_physics|5": 1,
    "harness|mmlu_virology|5": 1, "harness|mmlu_philosophy|5": 1, "harness|mmlu_human_aging|5": 1,
    "harness|mmlu_human_sexuality|5": 1, "harness|mmlu_medical_genetics|5": 1, "harness|mmlu_high_school_geography|5": 1,
    "harness|mmlu_electrical_engineering|5": 1, "harness|mmlu_college_physics|5": 1, "harness|mmlu_high_school_microeconomics|5": 1,
    "harness|mmlu_high_school_macroeconomics|5": 1, "harness|mmlu_computer_security|5": 1, "harness|mmlu_global_facts|5": 1,
    "harness|mmlu_jurisprudence|5": 1, "harness|mmlu_high_school_chemistry|5": 1, "harness|mmlu_high_school_biology|5": 1,
    "harness|mmlu_marketing|5": 1, "harness|mmlu_clinical_knowledge|5": 1, "harness|mmlu_public_relations|5": 1,
    "harness|mmlu_high_school_mathematics|5": 1, "harness|mmlu_high_school_physics|5": 1, "harness|mmlu_sociology|5": 1,
    "harness|mmlu_college_medicine|5": 1, "harness|mmlu_elementary_mathematics|5": 1, "harness|mmlu_college_biology|5": 1,
    "harness|mmlu_college_chemistry|5": 1, "harness|mmlu_us_foreign_policy|5": 1, "harness|mmlu_moral_disputes|5": 1,
    "harness|mmlu_logical_fallacies|5": 1, "harness|mmlu_prehistory|5": 1, "harness|mmlu_college_mathematics|5": 1,
    "harness|mmlu_high_school_government_and_politics|5": 1, "harness|mmlu_econometrics|5": 1, "harness|mmlu_high_school_psychology|5": 1,
    "harness|mmlu_formal_logic|5": 1, "harness|mmlu_nutrition|5": 1, "harness|mmlu_business_ethics|5": 1,
    "harness|mmlu_international_law|5": 1, "harness|mmlu_astronomy|5": 1, "harness|mmlu_professional_psychology|5": 1,
    "harness|mmlu_professional_accounting|5": 1, "harness|mmlu_machine_learning|5": 1, "harness|mmlu_high_school_statistics|5": 1,
    "harness|mmlu_moral_scenarios|5": 1, "harness|mmlu_college_computer_science|5": 1, "harness|mmlu_high_school_computer_science|5": 1,
    "harness|mmlu_professional_medicine|5": 1, "harness|mmlu_security_studies|5": 1, "harness|mmlu_high_school_world_history|5": 1,
    "harness|mmlu_professional_law|5": 1, "harness|mmlu_high_school_us_history|5": 1, "harness|mmlu_high_school_european_history|5": 1,
    "harness|truthfulqa_mc|0": 0, "harness|commongen_v2|2": 1
  },
  "config_general": {
    "model_name": "01-ai/Yi-1.5-9B-32K",
    "model_sha": "c0239dbc923b8a2b5ca849763bdd592d39c60850",
    "model_dtype": "torch.float16",
    "lighteval_sha": "",
    "num_few_shot_default": 0,
    "num_fewshot_seeds": 1,
    "override_batch_size": 1,
    "max_samples": null
  }
}
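A result file stores one metrics object per harness task, plus the daily/quarterly counters, a versions map, and config_general metadata. A simple way to aggregate one is to average acc_norm (falling back to acc, then to mc2 for TruthfulQA) across the harness|... entries. This sketch is illustrative only; the leaderboard's real aggregation lives in src/leaderboard/read_evals.py, which is not shown in this diff:

import json

def mean_score(path: str) -> float:
    with open(path) as f:
        results = json.load(f)["results"]
    scores = []
    for task, metrics in results.items():
        if not task.startswith("harness|"):
            continue  # skip the daily/quarterly counters
        score = metrics.get("acc_norm", metrics.get("acc", metrics.get("mc2")))
        if score is not None:
            scores.append(score)
    return sum(scores) / len(scores)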
eval-results/BioMistral/BioMistral-7B/BioMistral_BioMistral-7B_result_2024-05-30 01_33_58.json
ADDED
@@ -0,0 +1,450 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+{
+  "results": {
+    "daily": {
+      "daily": 10
+    },
+    "quarterly": {
+      "quarterly": 10
+    },
+    "harness|arc_challenge|25": {
+      "acc": 0.257679180887372,
+      "acc_stderr": 0.012780770562768416,
+      "acc_norm": 0.3122866894197952,
+      "acc_norm_stderr": 0.013542598541688065
+    },
+    "harness|hellaswag|10": {
+      "acc": 0.3229436367257518,
+      "acc_stderr": 0.004666457279979418,
+      "acc_norm": 0.39255128460466043,
+      "acc_norm_stderr": 0.004873203269366306
+    },
+    "harness|mmlu_world_religions|5": {
+      "acc": 0.34502923976608185,
+      "acc_stderr": 0.036459813773888065,
+      "acc_norm": 0.34502923976608185,
+      "acc_norm_stderr": 0.036459813773888065
+    },
+    "harness|mmlu_management|5": {
+      "acc": 0.4368932038834951,
+      "acc_stderr": 0.04911147107365778,
+      "acc_norm": 0.4368932038834951,
+      "acc_norm_stderr": 0.04911147107365778
+    },
+    "harness|mmlu_miscellaneous|5": {
+      "acc": 0.3780332056194125,
+      "acc_stderr": 0.017339844462104625,
+      "acc_norm": 0.3780332056194125,
+      "acc_norm_stderr": 0.017339844462104625
+    },
+    "harness|mmlu_anatomy|5": {
+      "acc": 0.3037037037037037,
+      "acc_stderr": 0.039725528847851355,
+      "acc_norm": 0.3037037037037037,
+      "acc_norm_stderr": 0.039725528847851355
+    },
+    "harness|mmlu_abstract_algebra|5": {
+      "acc": 0.37,
+      "acc_stderr": 0.04852365870939099,
+      "acc_norm": 0.37,
+      "acc_norm_stderr": 0.04852365870939099
+    },
+    "harness|mmlu_conceptual_physics|5": {
+      "acc": 0.28085106382978725,
+      "acc_stderr": 0.02937917046412482,
+      "acc_norm": 0.28085106382978725,
+      "acc_norm_stderr": 0.02937917046412482
+    },
+    "harness|mmlu_virology|5": {
+      "acc": 0.3373493975903614,
+      "acc_stderr": 0.03680783690727581,
+      "acc_norm": 0.3373493975903614,
+      "acc_norm_stderr": 0.03680783690727581
+    },
+    "harness|mmlu_philosophy|5": {
+      "acc": 0.3954983922829582,
+      "acc_stderr": 0.027770918531427838,
+      "acc_norm": 0.3954983922829582,
+      "acc_norm_stderr": 0.027770918531427838
+    },
+    "harness|mmlu_human_aging|5": {
+      "acc": 0.34977578475336324,
+      "acc_stderr": 0.03200736719484503,
+      "acc_norm": 0.34977578475336324,
+      "acc_norm_stderr": 0.03200736719484503
+    },
+    "harness|mmlu_human_sexuality|5": {
+      "acc": 0.3969465648854962,
+      "acc_stderr": 0.04291135671009224,
+      "acc_norm": 0.3969465648854962,
+      "acc_norm_stderr": 0.04291135671009224
+    },
+    "harness|mmlu_medical_genetics|5": {
+      "acc": 0.42,
+      "acc_stderr": 0.049604496374885836,
+      "acc_norm": 0.42,
+      "acc_norm_stderr": 0.049604496374885836
+    },
+    "harness|mmlu_high_school_geography|5": {
+      "acc": 0.4292929292929293,
+      "acc_stderr": 0.03526552724601199,
+      "acc_norm": 0.4292929292929293,
+      "acc_norm_stderr": 0.03526552724601199
+    },
+    "harness|mmlu_electrical_engineering|5": {
+      "acc": 0.4,
+      "acc_stderr": 0.04082482904638628,
+      "acc_norm": 0.4,
+      "acc_norm_stderr": 0.04082482904638628
+    },
+    "harness|mmlu_college_physics|5": {
+      "acc": 0.30392156862745096,
+      "acc_stderr": 0.045766654032077636,
+      "acc_norm": 0.30392156862745096,
+      "acc_norm_stderr": 0.045766654032077636
+    },
+    "harness|mmlu_high_school_microeconomics|5": {
+      "acc": 0.40336134453781514,
+      "acc_stderr": 0.031866081214088314,
+      "acc_norm": 0.40336134453781514,
+      "acc_norm_stderr": 0.031866081214088314
+    },
+    "harness|mmlu_high_school_macroeconomics|5": {
+      "acc": 0.40512820512820513,
+      "acc_stderr": 0.024890471769938145,
+      "acc_norm": 0.40512820512820513,
+      "acc_norm_stderr": 0.024890471769938145
+    },
+    "harness|mmlu_computer_security|5": {
+      "acc": 0.48,
+      "acc_stderr": 0.050211673156867795,
+      "acc_norm": 0.48,
+      "acc_norm_stderr": 0.050211673156867795
+    },
+    "harness|mmlu_global_facts|5": {
+      "acc": 0.32,
+      "acc_stderr": 0.04688261722621505,
+      "acc_norm": 0.32,
+      "acc_norm_stderr": 0.04688261722621505
+    },
+    "harness|mmlu_jurisprudence|5": {
+      "acc": 0.49074074074074076,
+      "acc_stderr": 0.04832853553437055,
+      "acc_norm": 0.49074074074074076,
+      "acc_norm_stderr": 0.04832853553437055
+    },
+    "harness|mmlu_high_school_chemistry|5": {
+      "acc": 0.37438423645320196,
+      "acc_stderr": 0.03405155380561952,
+      "acc_norm": 0.37438423645320196,
+      "acc_norm_stderr": 0.03405155380561952
+    },
+    "harness|mmlu_high_school_biology|5": {
+      "acc": 0.36774193548387096,
+      "acc_stderr": 0.027430866579973474,
+      "acc_norm": 0.36774193548387096,
+      "acc_norm_stderr": 0.027430866579973474
+    },
+    "harness|mmlu_marketing|5": {
+      "acc": 0.5598290598290598,
+      "acc_stderr": 0.0325207417206305,
+      "acc_norm": 0.5598290598290598,
+      "acc_norm_stderr": 0.0325207417206305
+    },
+    "harness|mmlu_clinical_knowledge|5": {
+      "acc": 0.3886792452830189,
+      "acc_stderr": 0.030000485448675986,
+      "acc_norm": 0.3886792452830189,
+      "acc_norm_stderr": 0.030000485448675986
+    },
+    "harness|mmlu_public_relations|5": {
+      "acc": 0.44545454545454544,
+      "acc_stderr": 0.047605488214603246,
+      "acc_norm": 0.44545454545454544,
+      "acc_norm_stderr": 0.047605488214603246
+    },
+    "harness|mmlu_high_school_mathematics|5": {
+      "acc": 0.34444444444444444,
+      "acc_stderr": 0.028972648884844267,
+      "acc_norm": 0.34444444444444444,
+      "acc_norm_stderr": 0.028972648884844267
+    },
+    "harness|mmlu_high_school_physics|5": {
+      "acc": 0.3443708609271523,
+      "acc_stderr": 0.038796870240733264,
+      "acc_norm": 0.3443708609271523,
+      "acc_norm_stderr": 0.038796870240733264
+    },
+    "harness|mmlu_sociology|5": {
+      "acc": 0.4577114427860697,
+      "acc_stderr": 0.035228658640995975,
+      "acc_norm": 0.4577114427860697,
+      "acc_norm_stderr": 0.035228658640995975
+    },
+    "harness|mmlu_college_medicine|5": {
+      "acc": 0.3815028901734104,
+      "acc_stderr": 0.03703851193099521,
+      "acc_norm": 0.3815028901734104,
+      "acc_norm_stderr": 0.03703851193099521
+    },
+    "harness|mmlu_elementary_mathematics|5": {
+      "acc": 0.35714285714285715,
+      "acc_stderr": 0.02467786284133278,
+      "acc_norm": 0.35714285714285715,
+      "acc_norm_stderr": 0.02467786284133278
+    },
+    "harness|mmlu_college_biology|5": {
+      "acc": 0.3333333333333333,
+      "acc_stderr": 0.03942082639927213,
+      "acc_norm": 0.3333333333333333,
+      "acc_norm_stderr": 0.03942082639927213
+    },
+    "harness|mmlu_college_chemistry|5": {
+      "acc": 0.47,
+      "acc_stderr": 0.05016135580465919,
+      "acc_norm": 0.47,
+      "acc_norm_stderr": 0.05016135580465919
+    },
+    "harness|mmlu_us_foreign_policy|5": {
+      "acc": 0.54,
+      "acc_stderr": 0.05009082659620333,
+      "acc_norm": 0.54,
+      "acc_norm_stderr": 0.05009082659620333
+    },
+    "harness|mmlu_moral_disputes|5": {
+      "acc": 0.44508670520231214,
+      "acc_stderr": 0.02675625512966377,
+      "acc_norm": 0.44508670520231214,
+      "acc_norm_stderr": 0.02675625512966377
+    },
+    "harness|mmlu_logical_fallacies|5": {
+      "acc": 0.34355828220858897,
+      "acc_stderr": 0.03731133519673893,
+      "acc_norm": 0.34355828220858897,
+      "acc_norm_stderr": 0.03731133519673893
+    },
+    "harness|mmlu_prehistory|5": {
+      "acc": 0.37037037037037035,
+      "acc_stderr": 0.02686949074481525,
+      "acc_norm": 0.37037037037037035,
+      "acc_norm_stderr": 0.02686949074481525
+    },
+    "harness|mmlu_college_mathematics|5": {
+      "acc": 0.33,
+      "acc_stderr": 0.04725815626252605,
+      "acc_norm": 0.33,
+      "acc_norm_stderr": 0.04725815626252605
+    },
+    "harness|mmlu_high_school_government_and_politics|5": {
+      "acc": 0.44559585492227977,
+      "acc_stderr": 0.0358701498607566,
+      "acc_norm": 0.44559585492227977,
+      "acc_norm_stderr": 0.0358701498607566
+    },
+    "harness|mmlu_econometrics|5": {
+      "acc": 0.2719298245614035,
+      "acc_stderr": 0.041857744240220575,
+      "acc_norm": 0.2719298245614035,
+      "acc_norm_stderr": 0.041857744240220575
+    },
+    "harness|mmlu_high_school_psychology|5": {
+      "acc": 0.3798165137614679,
+      "acc_stderr": 0.020808825617866244,
+      "acc_norm": 0.3798165137614679,
+      "acc_norm_stderr": 0.020808825617866244
+    },
+    "harness|mmlu_formal_logic|5": {
+      "acc": 0.3492063492063492,
+      "acc_stderr": 0.04263906892795132,
+      "acc_norm": 0.3492063492063492,
+      "acc_norm_stderr": 0.04263906892795132
+    },
+    "harness|mmlu_nutrition|5": {
+      "acc": 0.4117647058823529,
+      "acc_stderr": 0.02818059632825929,
+      "acc_norm": 0.4117647058823529,
+      "acc_norm_stderr": 0.02818059632825929
+    },
+    "harness|mmlu_business_ethics|5": {
+      "acc": 0.42,
+      "acc_stderr": 0.049604496374885836,
+      "acc_norm": 0.42,
+      "acc_norm_stderr": 0.049604496374885836
+    },
+    "harness|mmlu_international_law|5": {
+      "acc": 0.5619834710743802,
+      "acc_stderr": 0.045291468044357915,
+      "acc_norm": 0.5619834710743802,
+      "acc_norm_stderr": 0.045291468044357915
+    },
+    "harness|mmlu_astronomy|5": {
+      "acc": 0.34868421052631576,
+      "acc_stderr": 0.038781398887976125,
+      "acc_norm": 0.34868421052631576,
+      "acc_norm_stderr": 0.038781398887976125
+    },
+    "harness|mmlu_professional_psychology|5": {
+      "acc": 0.3284313725490196,
+      "acc_stderr": 0.018999707383162666,
+      "acc_norm": 0.3284313725490196,
+      "acc_norm_stderr": 0.018999707383162666
+    },
+    "harness|mmlu_professional_accounting|5": {
+      "acc": 0.2730496453900709,
+      "acc_stderr": 0.026577860943307857,
+      "acc_norm": 0.2730496453900709,
+      "acc_norm_stderr": 0.026577860943307857
+    },
+    "harness|mmlu_machine_learning|5": {
+      "acc": 0.2767857142857143,
+      "acc_stderr": 0.04246624336697627,
+      "acc_norm": 0.2767857142857143,
+      "acc_norm_stderr": 0.04246624336697627
+    },
+    "harness|mmlu_high_school_statistics|5": {
+      "acc": 0.4074074074074074,
+      "acc_stderr": 0.03350991604696043,
+      "acc_norm": 0.4074074074074074,
+      "acc_norm_stderr": 0.03350991604696043
+    },
+    "harness|mmlu_moral_scenarios|5": {
+      "acc": 0.23910614525139665,
+      "acc_stderr": 0.014265554192331149,
+      "acc_norm": 0.23910614525139665,
+      "acc_norm_stderr": 0.014265554192331149
+    },
+    "harness|mmlu_college_computer_science|5": {
+      "acc": 0.33,
+      "acc_stderr": 0.047258156262526045,
+      "acc_norm": 0.33,
+      "acc_norm_stderr": 0.047258156262526045
+    },
+    "harness|mmlu_high_school_computer_science|5": {
+      "acc": 0.4,
+      "acc_stderr": 0.04923659639173309,
+      "acc_norm": 0.4,
+      "acc_norm_stderr": 0.04923659639173309
+    },
+    "harness|mmlu_professional_medicine|5": {
+      "acc": 0.4227941176470588,
+      "acc_stderr": 0.030008562845003483,
+      "acc_norm": 0.4227941176470588,
+      "acc_norm_stderr": 0.030008562845003483
+    },
+    "harness|mmlu_security_studies|5": {
+      "acc": 0.3469387755102041,
+      "acc_stderr": 0.030472526026726492,
+      "acc_norm": 0.3469387755102041,
+      "acc_norm_stderr": 0.030472526026726492
+    },
+    "harness|mmlu_high_school_world_history|5": {
+      "acc": 0.4177215189873418,
+      "acc_stderr": 0.032103530322412685,
+      "acc_norm": 0.4177215189873418,
+      "acc_norm_stderr": 0.032103530322412685
+    },
+    "harness|mmlu_professional_law|5": {
+      "acc": 0.3005215123859192,
+      "acc_stderr": 0.011709918883039124,
+      "acc_norm": 0.3005215123859192,
+      "acc_norm_stderr": 0.011709918883039124
+    },
+    "harness|mmlu_high_school_us_history|5": {
+      "acc": 0.3872549019607843,
+      "acc_stderr": 0.03418931233833344,
+      "acc_norm": 0.3872549019607843,
+      "acc_norm_stderr": 0.03418931233833344
+    },
+    "harness|mmlu_high_school_european_history|5": {
+      "acc": 0.43636363636363634,
+      "acc_stderr": 0.03872592983524753,
+      "acc_norm": 0.43636363636363634,
+      "acc_norm_stderr": 0.03872592983524753
+    },
+    "harness|truthfulqa_mc|0": {
+      "mc1": 0.3072215422276622,
+      "mc1_stderr": 0.016150201321323002,
+      "mc2": 0.4721418472000992,
+      "mc2_stderr": 0.01626625866283201
+    },
+    "harness|commongen_v2|2": {
+      "acc": 0.27863046044864226,
+      "acc_stderr": 0.01541373949434568,
+      "acc_norm": 0.3825265643447462,
+      "acc_norm_stderr": 0.016709165387228803
+    }
+  },
+  "versions": {
+    "all": 0,
+    "harness|arc_challenge|25": 0,
+    "harness|hellaswag|10": 0,
+    "harness|mmlu_world_religions|5": 1,
+    "harness|mmlu_management|5": 1,
+    "harness|mmlu_miscellaneous|5": 1,
+    "harness|mmlu_anatomy|5": 1,
+    "harness|mmlu_abstract_algebra|5": 1,
+    "harness|mmlu_conceptual_physics|5": 1,
+    "harness|mmlu_virology|5": 1,
+    "harness|mmlu_philosophy|5": 1,
+    "harness|mmlu_human_aging|5": 1,
+    "harness|mmlu_human_sexuality|5": 1,
+    "harness|mmlu_medical_genetics|5": 1,
+    "harness|mmlu_high_school_geography|5": 1,
+    "harness|mmlu_electrical_engineering|5": 1,
+    "harness|mmlu_college_physics|5": 1,
+    "harness|mmlu_high_school_microeconomics|5": 1,
+    "harness|mmlu_high_school_macroeconomics|5": 1,
+    "harness|mmlu_computer_security|5": 1,
+    "harness|mmlu_global_facts|5": 1,
+    "harness|mmlu_jurisprudence|5": 1,
+    "harness|mmlu_high_school_chemistry|5": 1,
+    "harness|mmlu_high_school_biology|5": 1,
+    "harness|mmlu_marketing|5": 1,
+    "harness|mmlu_clinical_knowledge|5": 1,
+    "harness|mmlu_public_relations|5": 1,
+    "harness|mmlu_high_school_mathematics|5": 1,
+    "harness|mmlu_high_school_physics|5": 1,
+    "harness|mmlu_sociology|5": 1,
+    "harness|mmlu_college_medicine|5": 1,
+    "harness|mmlu_elementary_mathematics|5": 1,
+    "harness|mmlu_college_biology|5": 1,
+    "harness|mmlu_college_chemistry|5": 1,
+    "harness|mmlu_us_foreign_policy|5": 1,
+    "harness|mmlu_moral_disputes|5": 1,
+    "harness|mmlu_logical_fallacies|5": 1,
+    "harness|mmlu_prehistory|5": 1,
+    "harness|mmlu_college_mathematics|5": 1,
+    "harness|mmlu_high_school_government_and_politics|5": 1,
+    "harness|mmlu_econometrics|5": 1,
+    "harness|mmlu_high_school_psychology|5": 1,
+    "harness|mmlu_formal_logic|5": 1,
+    "harness|mmlu_nutrition|5": 1,
+    "harness|mmlu_business_ethics|5": 1,
+    "harness|mmlu_international_law|5": 1,
+    "harness|mmlu_astronomy|5": 1,
+    "harness|mmlu_professional_psychology|5": 1,
+    "harness|mmlu_professional_accounting|5": 1,
+    "harness|mmlu_machine_learning|5": 1,
+    "harness|mmlu_high_school_statistics|5": 1,
+    "harness|mmlu_moral_scenarios|5": 1,
+    "harness|mmlu_college_computer_science|5": 1,
+    "harness|mmlu_high_school_computer_science|5": 1,
+    "harness|mmlu_professional_medicine|5": 1,
+    "harness|mmlu_security_studies|5": 1,
+    "harness|mmlu_high_school_world_history|5": 1,
+    "harness|mmlu_professional_law|5": 1,
+    "harness|mmlu_high_school_us_history|5": 1,
+    "harness|mmlu_high_school_european_history|5": 1,
+    "harness|truthfulqa_mc|0": 0,
+    "harness|commongen_v2|2": 1
+  },
+  "config_general": {
+    "model_name": "BioMistral/BioMistral-7B",
+    "model_sha": "9a11e1ffa817c211cbb52ee1fb312dc6b61b40a5",
+    "model_dtype": "torch.float16",
+    "lighteval_sha": "",
+    "num_few_shot_default": 0,
+    "num_fewshot_seeds": 1,
+    "override_batch_size": 1,
+    "max_samples": null
+  }
+}
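
Aside: a minimal sketch of how a result file like the one above can be read and aggregated. The file path and the unweighted mean over MMLU subtasks are illustrative assumptions, not the Space's own scoring logic (that presumably lives in src/leaderboard/read_evals.py).

import json

# Hypothetical path matching the eval-results layout added in this commit.
path = "eval-results/BioMistral/BioMistral-7B/BioMistral_BioMistral-7B_result_2024-05-30 01_33_58.json"

with open(path) as f:
    data = json.load(f)

# Collect the raw accuracy of every MMLU subtask and macro-average it.
# (Unweighted mean is an assumption for illustration only.)
mmlu = {
    task: metrics["acc"]
    for task, metrics in data["results"].items()
    if task.startswith("harness|mmlu_")
}
print(f"{data['config_general']['model_name']}: "
      f"{sum(mmlu.values()) / len(mmlu):.4f} mean acc over {len(mmlu)} MMLU tasks")
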
eval-results/EleutherAI/polyglot-ko-1.3b/EleutherAI_polyglot-ko-1.3b_result_2023-09-24 15_21_38.json
ADDED
@@ -0,0 +1,450 @@
+{
+  "results": {
+    "daily": {
+      "daily": 11
+    },
+    "quarterly": {
+      "quarterly": 11
+    },
+    "harness|arc_challenge|25": {
+      "acc": 0.2235494880546075,
+      "acc_stderr": 0.012174896631202605,
+      "acc_norm": 0.2815699658703072,
+      "acc_norm_stderr": 0.013143376735009015
+    },
+    "harness|hellaswag|10": {
+      "acc": 0.3345947022505477,
+      "acc_stderr": 0.004708842600177431,
+      "acc_norm": 0.4135630352519418,
+      "acc_norm_stderr": 0.0049146550633294974
+    },
+    "harness|mmlu_world_religions|5": {
+      "acc": 0.27485380116959063,
+      "acc_stderr": 0.03424042924691585,
+      "acc_norm": 0.27485380116959063,
+      "acc_norm_stderr": 0.03424042924691585
+    },
+    "harness|mmlu_management|5": {
+      "acc": 0.27184466019417475,
+      "acc_stderr": 0.044052680241409216,
+      "acc_norm": 0.27184466019417475,
+      "acc_norm_stderr": 0.044052680241409216
+    },
+    "harness|mmlu_miscellaneous|5": {
+      "acc": 0.26947637292464877,
+      "acc_stderr": 0.015866243073215065,
+      "acc_norm": 0.26947637292464877,
+      "acc_norm_stderr": 0.015866243073215065
+    },
+    "harness|mmlu_anatomy|5": {
+      "acc": 0.26666666666666666,
+      "acc_stderr": 0.038201699145179055,
+      "acc_norm": 0.26666666666666666,
+      "acc_norm_stderr": 0.038201699145179055
+    },
+    "harness|mmlu_abstract_algebra|5": {
+      "acc": 0.3,
+      "acc_stderr": 0.046056618647183814,
+      "acc_norm": 0.3,
+      "acc_norm_stderr": 0.046056618647183814
+    },
+    "harness|mmlu_conceptual_physics|5": {
+      "acc": 0.2127659574468085,
+      "acc_stderr": 0.026754391348039783,
+      "acc_norm": 0.2127659574468085,
+      "acc_norm_stderr": 0.026754391348039783
+    },
+    "harness|mmlu_virology|5": {
+      "acc": 0.24096385542168675,
+      "acc_stderr": 0.033293941190735296,
+      "acc_norm": 0.24096385542168675,
+      "acc_norm_stderr": 0.033293941190735296
+    },
+    "harness|mmlu_philosophy|5": {
+      "acc": 0.2379421221864952,
+      "acc_stderr": 0.024185150647818707,
+      "acc_norm": 0.2379421221864952,
+      "acc_norm_stderr": 0.024185150647818707
+    },
+    "harness|mmlu_human_aging|5": {
+      "acc": 0.2825112107623318,
+      "acc_stderr": 0.030216831011508766,
+      "acc_norm": 0.2825112107623318,
+      "acc_norm_stderr": 0.030216831011508766
+    },
+    "harness|mmlu_human_sexuality|5": {
+      "acc": 0.21374045801526717,
+      "acc_stderr": 0.0359546161177469,
+      "acc_norm": 0.21374045801526717,
+      "acc_norm_stderr": 0.0359546161177469
+    },
+    "harness|mmlu_medical_genetics|5": {
+      "acc": 0.24,
+      "acc_stderr": 0.042923469599092816,
+      "acc_norm": 0.24,
+      "acc_norm_stderr": 0.042923469599092816
+    },
+    "harness|mmlu_high_school_geography|5": {
+      "acc": 0.2474747474747475,
+      "acc_stderr": 0.03074630074212451,
+      "acc_norm": 0.2474747474747475,
+      "acc_norm_stderr": 0.03074630074212451
+    },
+    "harness|mmlu_electrical_engineering|5": {
+      "acc": 0.22758620689655173,
+      "acc_stderr": 0.03493950380131184,
+      "acc_norm": 0.22758620689655173,
+      "acc_norm_stderr": 0.03493950380131184
+    },
+    "harness|mmlu_college_physics|5": {
+      "acc": 0.22549019607843138,
+      "acc_stderr": 0.041583075330832865,
+      "acc_norm": 0.22549019607843138,
+      "acc_norm_stderr": 0.041583075330832865
+    },
+    "harness|mmlu_high_school_microeconomics|5": {
+      "acc": 0.31512605042016806,
+      "acc_stderr": 0.030176808288974337,
+      "acc_norm": 0.31512605042016806,
+      "acc_norm_stderr": 0.030176808288974337
+    },
+    "harness|mmlu_high_school_macroeconomics|5": {
+      "acc": 0.2205128205128205,
+      "acc_stderr": 0.02102067268082791,
+      "acc_norm": 0.2205128205128205,
+      "acc_norm_stderr": 0.02102067268082791
+    },
+    "harness|mmlu_computer_security|5": {
+      "acc": 0.18,
+      "acc_stderr": 0.038612291966536955,
+      "acc_norm": 0.18,
+      "acc_norm_stderr": 0.038612291966536955
+    },
+    "harness|mmlu_global_facts|5": {
+      "acc": 0.31,
+      "acc_stderr": 0.04648231987117316,
+      "acc_norm": 0.31,
+      "acc_norm_stderr": 0.04648231987117316
+    },
+    "harness|mmlu_jurisprudence|5": {
+      "acc": 0.25,
+      "acc_stderr": 0.04186091791394607,
+      "acc_norm": 0.25,
+      "acc_norm_stderr": 0.04186091791394607
+    },
+    "harness|mmlu_high_school_chemistry|5": {
+      "acc": 0.2660098522167488,
+      "acc_stderr": 0.03108982600293752,
+      "acc_norm": 0.2660098522167488,
+      "acc_norm_stderr": 0.03108982600293752
+    },
+    "harness|mmlu_high_school_biology|5": {
+      "acc": 0.3,
+      "acc_stderr": 0.02606936229533513,
+      "acc_norm": 0.3,
+      "acc_norm_stderr": 0.02606936229533513
+    },
+    "harness|mmlu_marketing|5": {
+      "acc": 0.23076923076923078,
+      "acc_stderr": 0.027601921381417607,
+      "acc_norm": 0.23076923076923078,
+      "acc_norm_stderr": 0.027601921381417607
+    },
+    "harness|mmlu_clinical_knowledge|5": {
+      "acc": 0.25660377358490566,
+      "acc_stderr": 0.026880647889051968,
+      "acc_norm": 0.25660377358490566,
+      "acc_norm_stderr": 0.026880647889051968
+    },
+    "harness|mmlu_public_relations|5": {
+      "acc": 0.2545454545454545,
+      "acc_stderr": 0.04172343038705383,
+      "acc_norm": 0.2545454545454545,
+      "acc_norm_stderr": 0.04172343038705383
+    },
+    "harness|mmlu_high_school_mathematics|5": {
+      "acc": 0.2962962962962963,
+      "acc_stderr": 0.02784081149587194,
+      "acc_norm": 0.2962962962962963,
+      "acc_norm_stderr": 0.02784081149587194
+    },
+    "harness|mmlu_high_school_physics|5": {
+      "acc": 0.304635761589404,
+      "acc_stderr": 0.03757949922943342,
+      "acc_norm": 0.304635761589404,
+      "acc_norm_stderr": 0.03757949922943342
+    },
+    "harness|mmlu_sociology|5": {
+      "acc": 0.25870646766169153,
+      "acc_stderr": 0.03096590312357303,
+      "acc_norm": 0.25870646766169153,
+      "acc_norm_stderr": 0.03096590312357303
+    },
+    "harness|mmlu_college_medicine|5": {
+      "acc": 0.2254335260115607,
+      "acc_stderr": 0.03186209851641144,
+      "acc_norm": 0.2254335260115607,
+      "acc_norm_stderr": 0.03186209851641144
+    },
+    "harness|mmlu_elementary_mathematics|5": {
+      "acc": 0.2566137566137566,
+      "acc_stderr": 0.022494510767503154,
+      "acc_norm": 0.2566137566137566,
+      "acc_norm_stderr": 0.022494510767503154
+    },
+    "harness|mmlu_college_biology|5": {
+      "acc": 0.2638888888888889,
+      "acc_stderr": 0.03685651095897532,
+      "acc_norm": 0.2638888888888889,
+      "acc_norm_stderr": 0.03685651095897532
+    },
+    "harness|mmlu_college_chemistry|5": {
+      "acc": 0.23,
+      "acc_stderr": 0.04229525846816505,
+      "acc_norm": 0.23,
+      "acc_norm_stderr": 0.04229525846816505
+    },
+    "harness|mmlu_us_foreign_policy|5": {
+      "acc": 0.22,
+      "acc_stderr": 0.04163331998932269,
+      "acc_norm": 0.22,
+      "acc_norm_stderr": 0.04163331998932269
+    },
+    "harness|mmlu_moral_disputes|5": {
+      "acc": 0.24855491329479767,
+      "acc_stderr": 0.023267528432100174,
+      "acc_norm": 0.24855491329479767,
+      "acc_norm_stderr": 0.023267528432100174
+    },
+    "harness|mmlu_logical_fallacies|5": {
+      "acc": 0.31901840490797545,
+      "acc_stderr": 0.03661997551073836,
+      "acc_norm": 0.31901840490797545,
+      "acc_norm_stderr": 0.03661997551073836
+    },
+    "harness|mmlu_prehistory|5": {
+      "acc": 0.2623456790123457,
+      "acc_stderr": 0.024477222856135114,
+      "acc_norm": 0.2623456790123457,
+      "acc_norm_stderr": 0.024477222856135114
+    },
+    "harness|mmlu_college_mathematics|5": {
+      "acc": 0.25,
+      "acc_stderr": 0.04351941398892446,
+      "acc_norm": 0.25,
+      "acc_norm_stderr": 0.04351941398892446
+    },
+    "harness|mmlu_high_school_government_and_politics|5": {
+      "acc": 0.33678756476683935,
+      "acc_stderr": 0.03410780251836184,
+      "acc_norm": 0.33678756476683935,
+      "acc_norm_stderr": 0.03410780251836184
+    },
+    "harness|mmlu_econometrics|5": {
+      "acc": 0.20175438596491227,
+      "acc_stderr": 0.037752050135836386,
+      "acc_norm": 0.20175438596491227,
+      "acc_norm_stderr": 0.037752050135836386
+    },
+    "harness|mmlu_high_school_psychology|5": {
+      "acc": 0.24220183486238533,
+      "acc_stderr": 0.01836817630659862,
+      "acc_norm": 0.24220183486238533,
+      "acc_norm_stderr": 0.01836817630659862
+    },
+    "harness|mmlu_formal_logic|5": {
+      "acc": 0.23015873015873015,
+      "acc_stderr": 0.03764950879790606,
+      "acc_norm": 0.23015873015873015,
+      "acc_norm_stderr": 0.03764950879790606
+    },
+    "harness|mmlu_nutrition|5": {
+      "acc": 0.23529411764705882,
+      "acc_stderr": 0.024288619466046102,
+      "acc_norm": 0.23529411764705882,
+      "acc_norm_stderr": 0.024288619466046102
+    },
+    "harness|mmlu_business_ethics|5": {
+      "acc": 0.18,
+      "acc_stderr": 0.03861229196653695,
+      "acc_norm": 0.18,
+      "acc_norm_stderr": 0.03861229196653695
+    },
+    "harness|mmlu_international_law|5": {
+      "acc": 0.256198347107438,
+      "acc_stderr": 0.039849796533028704,
+      "acc_norm": 0.256198347107438,
+      "acc_norm_stderr": 0.039849796533028704
+    },
+    "harness|mmlu_astronomy|5": {
+      "acc": 0.21710526315789475,
+      "acc_stderr": 0.033550453048829226,
+      "acc_norm": 0.21710526315789475,
+      "acc_norm_stderr": 0.033550453048829226
+    },
+    "harness|mmlu_professional_psychology|5": {
+      "acc": 0.24019607843137256,
+      "acc_stderr": 0.01728276069516743,
+      "acc_norm": 0.24019607843137256,
+      "acc_norm_stderr": 0.01728276069516743
+    },
+    "harness|mmlu_professional_accounting|5": {
+      "acc": 0.2553191489361702,
+      "acc_stderr": 0.02601199293090201,
+      "acc_norm": 0.2553191489361702,
+      "acc_norm_stderr": 0.02601199293090201
+    },
+    "harness|mmlu_machine_learning|5": {
+      "acc": 0.21428571428571427,
+      "acc_stderr": 0.03894641120044793,
+      "acc_norm": 0.21428571428571427,
+      "acc_norm_stderr": 0.03894641120044793
+    },
+    "harness|mmlu_high_school_statistics|5": {
+      "acc": 0.46296296296296297,
+      "acc_stderr": 0.03400603625538272,
+      "acc_norm": 0.46296296296296297,
+      "acc_norm_stderr": 0.03400603625538272
+    },
+    "harness|mmlu_moral_scenarios|5": {
+      "acc": 0.24692737430167597,
+      "acc_stderr": 0.014422292204808852,
+      "acc_norm": 0.24692737430167597,
+      "acc_norm_stderr": 0.014422292204808852
+    },
+    "harness|mmlu_college_computer_science|5": {
+      "acc": 0.25,
+      "acc_stderr": 0.04351941398892446,
+      "acc_norm": 0.25,
+      "acc_norm_stderr": 0.04351941398892446
+    },
+    "harness|mmlu_high_school_computer_science|5": {
+      "acc": 0.3,
+      "acc_stderr": 0.046056618647183814,
+      "acc_norm": 0.3,
+      "acc_norm_stderr": 0.046056618647183814
+    },
+    "harness|mmlu_professional_medicine|5": {
+      "acc": 0.4411764705882353,
+      "acc_stderr": 0.030161911930767102,
+      "acc_norm": 0.4411764705882353,
+      "acc_norm_stderr": 0.030161911930767102
+    },
+    "harness|mmlu_security_studies|5": {
+      "acc": 0.3795918367346939,
+      "acc_stderr": 0.03106721126287249,
+      "acc_norm": 0.3795918367346939,
+      "acc_norm_stderr": 0.03106721126287249
+    },
+    "harness|mmlu_high_school_world_history|5": {
+      "acc": 0.2109704641350211,
+      "acc_stderr": 0.02655837250266192,
+      "acc_norm": 0.2109704641350211,
+      "acc_norm_stderr": 0.02655837250266192
+    },
+    "harness|mmlu_professional_law|5": {
+      "acc": 0.23468057366362452,
+      "acc_stderr": 0.010824026872449344,
+      "acc_norm": 0.23468057366362452,
+      "acc_norm_stderr": 0.010824026872449344
+    },
+    "harness|mmlu_high_school_us_history|5": {
+      "acc": 0.25,
+      "acc_stderr": 0.03039153369274154,
+      "acc_norm": 0.25,
+      "acc_norm_stderr": 0.03039153369274154
+    },
+    "harness|mmlu_high_school_european_history|5": {
+      "acc": 0.22424242424242424,
+      "acc_stderr": 0.03256866661681102,
+      "acc_norm": 0.22424242424242424,
+      "acc_norm_stderr": 0.03256866661681102
+    },
+    "harness|truthfulqa_mc|0": {
+      "mc1": 0.25091799265605874,
+      "mc1_stderr": 0.015176985027707682,
+      "mc2": 0.4116568832959107,
+      "mc2_stderr": 0.015044504977529799
+    },
+    "harness|commongen_v2|2": {
+      "acc": 0.27744982290436837,
+      "acc_stderr": 0.015393630236605975,
+      "acc_norm": 0.3400236127508855,
+      "acc_norm_stderr": 0.016286717220737674
+    }
+  },
+  "versions": {
+    "all": 0,
+    "harness|arc_challenge|25": 0,
+    "harness|hellaswag|10": 0,
+    "harness|mmlu_world_religions|5": 1,
+    "harness|mmlu_management|5": 1,
+    "harness|mmlu_miscellaneous|5": 1,
+    "harness|mmlu_anatomy|5": 1,
+    "harness|mmlu_abstract_algebra|5": 1,
+    "harness|mmlu_conceptual_physics|5": 1,
+    "harness|mmlu_virology|5": 1,
+    "harness|mmlu_philosophy|5": 1,
+    "harness|mmlu_human_aging|5": 1,
+    "harness|mmlu_human_sexuality|5": 1,
+    "harness|mmlu_medical_genetics|5": 1,
+    "harness|mmlu_high_school_geography|5": 1,
+    "harness|mmlu_electrical_engineering|5": 1,
+    "harness|mmlu_college_physics|5": 1,
+    "harness|mmlu_high_school_microeconomics|5": 1,
+    "harness|mmlu_high_school_macroeconomics|5": 1,
+    "harness|mmlu_computer_security|5": 1,
+    "harness|mmlu_global_facts|5": 1,
+    "harness|mmlu_jurisprudence|5": 1,
+    "harness|mmlu_high_school_chemistry|5": 1,
+    "harness|mmlu_high_school_biology|5": 1,
+    "harness|mmlu_marketing|5": 1,
+    "harness|mmlu_clinical_knowledge|5": 1,
+    "harness|mmlu_public_relations|5": 1,
+    "harness|mmlu_high_school_mathematics|5": 1,
+    "harness|mmlu_high_school_physics|5": 1,
+    "harness|mmlu_sociology|5": 1,
+    "harness|mmlu_college_medicine|5": 1,
+    "harness|mmlu_elementary_mathematics|5": 1,
+    "harness|mmlu_college_biology|5": 1,
+    "harness|mmlu_college_chemistry|5": 1,
+    "harness|mmlu_us_foreign_policy|5": 1,
+    "harness|mmlu_moral_disputes|5": 1,
+    "harness|mmlu_logical_fallacies|5": 1,
+    "harness|mmlu_prehistory|5": 1,
+    "harness|mmlu_college_mathematics|5": 1,
+    "harness|mmlu_high_school_government_and_politics|5": 1,
+    "harness|mmlu_econometrics|5": 1,
+    "harness|mmlu_high_school_psychology|5": 1,
+    "harness|mmlu_formal_logic|5": 1,
+    "harness|mmlu_nutrition|5": 1,
+    "harness|mmlu_business_ethics|5": 1,
+    "harness|mmlu_international_law|5": 1,
+    "harness|mmlu_astronomy|5": 1,
+    "harness|mmlu_professional_psychology|5": 1,
+    "harness|mmlu_professional_accounting|5": 1,
+    "harness|mmlu_machine_learning|5": 1,
+    "harness|mmlu_high_school_statistics|5": 1,
+    "harness|mmlu_moral_scenarios|5": 1,
+    "harness|mmlu_college_computer_science|5": 1,
+    "harness|mmlu_high_school_computer_science|5": 1,
+    "harness|mmlu_professional_medicine|5": 1,
+    "harness|mmlu_security_studies|5": 1,
+    "harness|mmlu_high_school_world_history|5": 1,
+    "harness|mmlu_professional_law|5": 1,
+    "harness|mmlu_high_school_us_history|5": 1,
+    "harness|mmlu_high_school_european_history|5": 1,
+    "harness|truthfulqa_mc|0": 0,
+    "harness|commongen_v2|2": 1
+  },
+  "config_general": {
+    "model_name": "EleutherAI/polyglot-ko-1.3b",
+    "model_sha": "557e162cf6e944fdbae05bab2e45d066a125eacb",
+    "model_dtype": "torch.float16",
+    "lighteval_sha": "",
+    "num_few_shot_default": 0,
+    "num_fewshot_seeds": 1,
+    "override_batch_size": 1,
+    "max_samples": null
+  }
+}
eval-results/HuggingFaceH4/.DS_Store
ADDED
Binary file (6.15 kB)
eval-results/HuggingFaceH4/zephyr-7b-beta/result.json
ADDED
@@ -0,0 +1,450 @@
+{
+  "results": {
+    "daily": {
+      "daily": 8
+    },
+    "quarterly": {
+      "quarterly": 8
+    },
+    "harness|arc_challenge|25": {
+      "acc": 0.33532423208191126,
+      "acc_stderr": 0.01379618294778556,
+      "acc_norm": 0.3848122866894198,
+      "acc_norm_stderr": 0.014218371065251112
+    },
+    "harness|hellaswag|10": {
+      "acc": 0.35480979884485164,
+      "acc_stderr": 0.004774778180345192,
+      "acc_norm": 0.44911372236606256,
+      "acc_norm_stderr": 0.00496387293685794
+    },
+    "harness|mmlu_world_religions|5": {
+      "acc": 0.45614035087719296,
+      "acc_stderr": 0.03820042586602966,
+      "acc_norm": 0.45614035087719296,
+      "acc_norm_stderr": 0.03820042586602966
+    },
+    "harness|mmlu_management|5": {
+      "acc": 0.6019417475728155,
+      "acc_stderr": 0.04846748253977238,
+      "acc_norm": 0.6019417475728155,
+      "acc_norm_stderr": 0.04846748253977238
+    },
+    "harness|mmlu_miscellaneous|5": {
+      "acc": 0.41762452107279696,
+      "acc_stderr": 0.017635637326951534,
+      "acc_norm": 0.41762452107279696,
+      "acc_norm_stderr": 0.017635637326951534
+    },
+    "harness|mmlu_anatomy|5": {
+      "acc": 0.34074074074074073,
+      "acc_stderr": 0.040943762699967946,
+      "acc_norm": 0.34074074074074073,
+      "acc_norm_stderr": 0.040943762699967946
+    },
+    "harness|mmlu_abstract_algebra|5": {
+      "acc": 0.19,
+      "acc_stderr": 0.03942772444036623,
+      "acc_norm": 0.19,
+      "acc_norm_stderr": 0.03942772444036623
+    },
+    "harness|mmlu_conceptual_physics|5": {
+      "acc": 0.2978723404255319,
+      "acc_stderr": 0.029896145682095462,
+      "acc_norm": 0.2978723404255319,
+      "acc_norm_stderr": 0.029896145682095462
+    },
+    "harness|mmlu_virology|5": {
+      "acc": 0.3614457831325301,
+      "acc_stderr": 0.0374005938202932,
+      "acc_norm": 0.3614457831325301,
+      "acc_norm_stderr": 0.0374005938202932
+    },
+    "harness|mmlu_philosophy|5": {
+      "acc": 0.4758842443729904,
+      "acc_stderr": 0.028365041542564584,
+      "acc_norm": 0.4758842443729904,
+      "acc_norm_stderr": 0.028365041542564584
+    },
+    "harness|mmlu_human_aging|5": {
+      "acc": 0.3811659192825112,
+      "acc_stderr": 0.032596251184168284,
+      "acc_norm": 0.3811659192825112,
+      "acc_norm_stderr": 0.032596251184168284
+    },
+    "harness|mmlu_human_sexuality|5": {
+      "acc": 0.3511450381679389,
+      "acc_stderr": 0.04186445163013751,
+      "acc_norm": 0.3511450381679389,
+      "acc_norm_stderr": 0.04186445163013751
+    },
+    "harness|mmlu_medical_genetics|5": {
+      "acc": 0.27,
+      "acc_stderr": 0.0446196043338474,
+      "acc_norm": 0.27,
+      "acc_norm_stderr": 0.0446196043338474
+    },
+    "harness|mmlu_high_school_geography|5": {
+      "acc": 0.494949494949495,
+      "acc_stderr": 0.035621707606254015,
+      "acc_norm": 0.494949494949495,
+      "acc_norm_stderr": 0.035621707606254015
+    },
+    "harness|mmlu_electrical_engineering|5": {
+      "acc": 0.4,
+      "acc_stderr": 0.04082482904638628,
+      "acc_norm": 0.4,
+      "acc_norm_stderr": 0.04082482904638628
+    },
+    "harness|mmlu_college_physics|5": {
+      "acc": 0.3137254901960784,
+      "acc_stderr": 0.04617034827006717,
+      "acc_norm": 0.3137254901960784,
+      "acc_norm_stderr": 0.04617034827006717
+    },
+    "harness|mmlu_high_school_microeconomics|5": {
+      "acc": 0.4957983193277311,
+      "acc_stderr": 0.0324773433444811,
+      "acc_norm": 0.4957983193277311,
+      "acc_norm_stderr": 0.0324773433444811
+    },
+    "harness|mmlu_high_school_macroeconomics|5": {
+      "acc": 0.4256410256410256,
+      "acc_stderr": 0.025069094387296546,
+      "acc_norm": 0.4256410256410256,
+      "acc_norm_stderr": 0.025069094387296546
+    },
+    "harness|mmlu_computer_security|5": {
+      "acc": 0.59,
+      "acc_stderr": 0.049431107042371025,
+      "acc_norm": 0.59,
+      "acc_norm_stderr": 0.049431107042371025
+    },
+    "harness|mmlu_global_facts|5": {
+      "acc": 0.29,
+      "acc_stderr": 0.045604802157206845,
+      "acc_norm": 0.29,
+      "acc_norm_stderr": 0.045604802157206845
+    },
+    "harness|mmlu_jurisprudence|5": {
+      "acc": 0.4537037037037037,
+      "acc_stderr": 0.04812917324536821,
+      "acc_norm": 0.4537037037037037,
+      "acc_norm_stderr": 0.04812917324536821
+    },
+    "harness|mmlu_high_school_chemistry|5": {
+      "acc": 0.35467980295566504,
+      "acc_stderr": 0.03366124489051449,
+      "acc_norm": 0.35467980295566504,
+      "acc_norm_stderr": 0.03366124489051449
+    },
+    "harness|mmlu_high_school_biology|5": {
+      "acc": 0.4290322580645161,
+      "acc_stderr": 0.02815603653823321,
+      "acc_norm": 0.4290322580645161,
+      "acc_norm_stderr": 0.02815603653823321
+    },
+    "harness|mmlu_marketing|5": {
+      "acc": 0.6666666666666666,
+      "acc_stderr": 0.03088273697413865,
+      "acc_norm": 0.6666666666666666,
+      "acc_norm_stderr": 0.03088273697413865
+    },
+    "harness|mmlu_clinical_knowledge|5": {
+      "acc": 0.4188679245283019,
+      "acc_stderr": 0.03036505082911521,
+      "acc_norm": 0.4188679245283019,
+      "acc_norm_stderr": 0.03036505082911521
+    },
+    "harness|mmlu_public_relations|5": {
+      "acc": 0.42727272727272725,
+      "acc_stderr": 0.04738198703545483,
+      "acc_norm": 0.42727272727272725,
+      "acc_norm_stderr": 0.04738198703545483
+    },
+    "harness|mmlu_high_school_mathematics|5": {
+      "acc": 0.34814814814814815,
+      "acc_stderr": 0.029045600290616258,
+      "acc_norm": 0.34814814814814815,
+      "acc_norm_stderr": 0.029045600290616258
+    },
+    "harness|mmlu_high_school_physics|5": {
+      "acc": 0.2913907284768212,
+      "acc_stderr": 0.037101857261199946,
+      "acc_norm": 0.2913907284768212,
+      "acc_norm_stderr": 0.037101857261199946
+    },
+    "harness|mmlu_sociology|5": {
+      "acc": 0.5174129353233831,
+      "acc_stderr": 0.03533389234739245,
+      "acc_norm": 0.5174129353233831,
+      "acc_norm_stderr": 0.03533389234739245
+    },
+    "harness|mmlu_college_medicine|5": {
+      "acc": 0.37572254335260113,
+      "acc_stderr": 0.03692820767264867,
+      "acc_norm": 0.37572254335260113,
+      "acc_norm_stderr": 0.03692820767264867
+    },
+    "harness|mmlu_elementary_mathematics|5": {
+      "acc": 0.3492063492063492,
+      "acc_stderr": 0.024552292209342658,
+      "acc_norm": 0.3492063492063492,
+      "acc_norm_stderr": 0.024552292209342658
+    },
+    "harness|mmlu_college_biology|5": {
+      "acc": 0.3333333333333333,
+      "acc_stderr": 0.039420826399272135,
+      "acc_norm": 0.3333333333333333,
+      "acc_norm_stderr": 0.039420826399272135
+    },
+    "harness|mmlu_college_chemistry|5": {
+      "acc": 0.35,
+      "acc_stderr": 0.04793724854411019,
+      "acc_norm": 0.35,
+      "acc_norm_stderr": 0.04793724854411019
+    },
+    "harness|mmlu_us_foreign_policy|5": {
+      "acc": 0.49,
+      "acc_stderr": 0.05024183937956913,
+      "acc_norm": 0.49,
+      "acc_norm_stderr": 0.05024183937956913
+    },
+    "harness|mmlu_moral_disputes|5": {
+      "acc": 0.47398843930635837,
+      "acc_stderr": 0.026882643434022885,
+      "acc_norm": 0.47398843930635837,
+      "acc_norm_stderr": 0.026882643434022885
+    },
+    "harness|mmlu_logical_fallacies|5": {
+      "acc": 0.44171779141104295,
+      "acc_stderr": 0.039015918258361836,
+      "acc_norm": 0.44171779141104295,
+      "acc_norm_stderr": 0.039015918258361836
+    },
+    "harness|mmlu_prehistory|5": {
+      "acc": 0.42592592592592593,
+      "acc_stderr": 0.027513747284379424,
+      "acc_norm": 0.42592592592592593,
+      "acc_norm_stderr": 0.027513747284379424
+    },
+    "harness|mmlu_college_mathematics|5": {
+      "acc": 0.33,
+      "acc_stderr": 0.04725815626252606,
+      "acc_norm": 0.33,
+      "acc_norm_stderr": 0.04725815626252606
+    },
+    "harness|mmlu_high_school_government_and_politics|5": {
+      "acc": 0.5129533678756477,
+      "acc_stderr": 0.0360722806104775,
+      "acc_norm": 0.5129533678756477,
+      "acc_norm_stderr": 0.0360722806104775
+    },
+    "harness|mmlu_econometrics|5": {
+      "acc": 0.24561403508771928,
+      "acc_stderr": 0.0404933929774814,
+      "acc_norm": 0.24561403508771928,
+      "acc_norm_stderr": 0.0404933929774814
+    },
+    "harness|mmlu_high_school_psychology|5": {
+      "acc": 0.47155963302752296,
+      "acc_stderr": 0.02140261569734804,
+      "acc_norm": 0.47155963302752296,
+      "acc_norm_stderr": 0.02140261569734804
+    },
+    "harness|mmlu_formal_logic|5": {
+      "acc": 0.36507936507936506,
+      "acc_stderr": 0.04306241259127152,
+      "acc_norm": 0.36507936507936506,
+      "acc_norm_stderr": 0.04306241259127152
+    },
+    "harness|mmlu_nutrition|5": {
+      "acc": 0.4117647058823529,
+      "acc_stderr": 0.028180596328259297,
+      "acc_norm": 0.4117647058823529,
+      "acc_norm_stderr": 0.028180596328259297
+    },
+    "harness|mmlu_business_ethics|5": {
+      "acc": 0.44,
+      "acc_stderr": 0.04988876515698589,
+      "acc_norm": 0.44,
+      "acc_norm_stderr": 0.04988876515698589
+    },
+    "harness|mmlu_international_law|5": {
+      "acc": 0.5867768595041323,
+      "acc_stderr": 0.04495087843548408,
+      "acc_norm": 0.5867768595041323,
+      "acc_norm_stderr": 0.04495087843548408
+    },
+    "harness|mmlu_astronomy|5": {
+      "acc": 0.40131578947368424,
+      "acc_stderr": 0.03988903703336284,
+      "acc_norm": 0.40131578947368424,
+      "acc_norm_stderr": 0.03988903703336284
+    },
+    "harness|mmlu_professional_psychology|5": {
+      "acc": 0.32679738562091504,
+      "acc_stderr": 0.018975427920507215,
+      "acc_norm": 0.32679738562091504,
+      "acc_norm_stderr": 0.018975427920507215
+    },
+    "harness|mmlu_professional_accounting|5": {
+      "acc": 0.3333333333333333,
+      "acc_stderr": 0.02812163604063988,
+      "acc_norm": 0.3333333333333333,
+      "acc_norm_stderr": 0.02812163604063988
+    },
+    "harness|mmlu_machine_learning|5": {
+      "acc": 0.3392857142857143,
+      "acc_stderr": 0.04493949068613539,
+      "acc_norm": 0.3392857142857143,
+      "acc_norm_stderr": 0.04493949068613539
+    },
+    "harness|mmlu_high_school_statistics|5": {
+      "acc": 0.41203703703703703,
+      "acc_stderr": 0.03356787758160835,
+      "acc_norm": 0.41203703703703703,
+      "acc_norm_stderr": 0.03356787758160835
+    },
+    "harness|mmlu_moral_scenarios|5": {
+      "acc": 0.329608938547486,
+      "acc_stderr": 0.015721531075183884,
+      "acc_norm": 0.329608938547486,
+      "acc_norm_stderr": 0.015721531075183884
+    },
+    "harness|mmlu_college_computer_science|5": {
+      "acc": 0.39,
+      "acc_stderr": 0.04902071300001975,
+      "acc_norm": 0.39,
+      "acc_norm_stderr": 0.04902071300001975
+    },
+    "harness|mmlu_high_school_computer_science|5": {
+      "acc": 0.61,
+      "acc_stderr": 0.04902071300001975,
+      "acc_norm": 0.61,
+      "acc_norm_stderr": 0.04902071300001975
+    },
+    "harness|mmlu_professional_medicine|5": {
+      "acc": 0.375,
+      "acc_stderr": 0.029408372932278746,
+      "acc_norm": 0.375,
+      "acc_norm_stderr": 0.029408372932278746
+    },
+    "harness|mmlu_security_studies|5": {
+      "acc": 0.43673469387755104,
+      "acc_stderr": 0.03175195237583322,
+      "acc_norm": 0.43673469387755104,
+      "acc_norm_stderr": 0.03175195237583322
+    },
+    "harness|mmlu_high_school_world_history|5": {
+      "acc": 0.4810126582278481,
+      "acc_stderr": 0.03252375148090448,
+      "acc_norm": 0.4810126582278481,
+      "acc_norm_stderr": 0.03252375148090448
+    },
+    "harness|mmlu_professional_law|5": {
+      "acc": 0.29791395045632335,
+      "acc_stderr": 0.011680717340400059,
+      "acc_norm": 0.29791395045632335,
+      "acc_norm_stderr": 0.011680717340400059
+    },
+    "harness|mmlu_high_school_us_history|5": {
+      "acc": 0.29411764705882354,
+      "acc_stderr": 0.03198001660115072,
+      "acc_norm": 0.29411764705882354,
+      "acc_norm_stderr": 0.03198001660115072
+    },
+    "harness|mmlu_high_school_european_history|5": {
+      "acc": 0.30303030303030304,
+      "acc_stderr": 0.03588624800091707,
+      "acc_norm": 0.30303030303030304,
+      "acc_norm_stderr": 0.03588624800091707
+    },
+    "harness|truthfulqa_mc|0": {
+      "mc1": 0.3317013463892289,
+      "mc1_stderr": 0.01648214881024147,
+      "mc2": 0.5171680571717291,
+      "mc2_stderr": 0.01606077987901482
+    },
+    "harness|commongen_v2|2": {
+      "acc": 0.39787485242030696,
+      "acc_stderr": 0.01682795905473339,
+      "acc_norm": 0.4014167650531287,
+      "acc_norm_stderr": 0.01685290785872906
+    }
+  },
+  "versions": {
+    "all": 0,
+    "harness|arc_challenge|25": 0,
+    "harness|hellaswag|10": 0,
+    "harness|mmlu_world_religions|5": 1,
+    "harness|mmlu_management|5": 1,
+    "harness|mmlu_miscellaneous|5": 1,
+    "harness|mmlu_anatomy|5": 1,
+    "harness|mmlu_abstract_algebra|5": 1,
+    "harness|mmlu_conceptual_physics|5": 1,
+    "harness|mmlu_virology|5": 1,
+    "harness|mmlu_philosophy|5": 1,
+    "harness|mmlu_human_aging|5": 1,
+    "harness|mmlu_human_sexuality|5": 1,
+    "harness|mmlu_medical_genetics|5": 1,
+    "harness|mmlu_high_school_geography|5": 1,
+    "harness|mmlu_electrical_engineering|5": 1,
+    "harness|mmlu_college_physics|5": 1,
+    "harness|mmlu_high_school_microeconomics|5": 1,
+    "harness|mmlu_high_school_macroeconomics|5": 1,
+    "harness|mmlu_computer_security|5": 1,
+    "harness|mmlu_global_facts|5": 1,
+    "harness|mmlu_jurisprudence|5": 1,
+    "harness|mmlu_high_school_chemistry|5": 1,
+    "harness|mmlu_high_school_biology|5": 1,
+    "harness|mmlu_marketing|5": 1,
+    "harness|mmlu_clinical_knowledge|5": 1,
+    "harness|mmlu_public_relations|5": 1,
+    "harness|mmlu_high_school_mathematics|5": 1,
+    "harness|mmlu_high_school_physics|5": 1,
+    "harness|mmlu_sociology|5": 1,
+    "harness|mmlu_college_medicine|5": 1,
+    "harness|mmlu_elementary_mathematics|5": 1,
+    "harness|mmlu_college_biology|5": 1,
+    "harness|mmlu_college_chemistry|5": 1,
+    "harness|mmlu_us_foreign_policy|5": 1,
+    "harness|mmlu_moral_disputes|5": 1,
+    "harness|mmlu_logical_fallacies|5": 1,
+    "harness|mmlu_prehistory|5": 1,
+    "harness|mmlu_college_mathematics|5": 1,
+    "harness|mmlu_high_school_government_and_politics|5": 1,
+    "harness|mmlu_econometrics|5": 1,
+    "harness|mmlu_high_school_psychology|5": 1,
+    "harness|mmlu_formal_logic|5": 1,
+    "harness|mmlu_nutrition|5": 1,
+    "harness|mmlu_business_ethics|5": 1,
+    "harness|mmlu_international_law|5": 1,
+    "harness|mmlu_astronomy|5": 1,
+    "harness|mmlu_professional_psychology|5": 1,
+    "harness|mmlu_professional_accounting|5": 1,
+    "harness|mmlu_machine_learning|5": 1,
+    "harness|mmlu_high_school_statistics|5": 1,
+    "harness|mmlu_moral_scenarios|5": 1,
+    "harness|mmlu_college_computer_science|5": 1,
+    "harness|mmlu_high_school_computer_science|5": 1,
+    "harness|mmlu_professional_medicine|5": 1,
+    "harness|mmlu_security_studies|5": 1,
+    "harness|mmlu_high_school_world_history|5": 1,
+    "harness|mmlu_professional_law|5": 1,
+    "harness|mmlu_high_school_us_history|5": 1,
+    "harness|mmlu_high_school_european_history|5": 1,
+    "harness|truthfulqa_mc|0": 0,
+    "harness|commongen_v2|2": 1
+  },
+  "config_general": {
+    "model_name": "HuggingFaceH4/zephyr-7b-beta",
+    "model_sha": "3bac358730f8806e5c3dc7c7e19eb36e045bf720",
+    "model_dtype": "torch.float16",
+    "lighteval_sha": "",
+    "num_few_shot_default": 0,
+    "num_fewshot_seeds": 1,
+    "override_batch_size": 1,
+    "max_samples": null
+  }
+}
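Aside: with three complete result files now in the tree, a quick comparison across models is possible. The snippet below is a sketch only, assuming the eval-results/<org>/<model>/*.json layout shown in this commit; the Space itself builds its table in src/populate.py, not like this.

import glob
import json

# Gather (model, ARC acc_norm, HellaSwag acc_norm) from every result file.
rows = []
for path in glob.glob("eval-results/*/*/*.json"):
    with open(path) as f:
        res = json.load(f)
    rows.append((
        res["config_general"]["model_name"],
        res["results"]["harness|arc_challenge|25"]["acc_norm"],
        res["results"]["harness|hellaswag|10"]["acc_norm"],
    ))

# Print a crude leaderboard sorted by ARC, best first.
for name, arc, hella in sorted(rows, key=lambda r: -r[1]):
    print(f"{name:35s} arc_norm={arc:.3f} hellaswag_norm={hella:.3f}")
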
eval-results/README.md
ADDED
@@ -0,0 +1,3 @@
+---
+license: apache-2.0
+---
eval-results/nlpai-lab/KULLM3/result.json
ADDED
@@ -0,0 +1,450 @@
+{
+    "results": {
+        "daily": {
+            "daily": 6
+        },
+        "quarterly": {
+            "quarterly": 6
+        },
+        "harness|arc_challenge|25": {
+            "acc": 0.42918088737201365,
+            "acc_stderr": 0.014464085894870651,
+            "acc_norm": 0.46501706484641636,
+            "acc_norm_stderr": 0.014575583922019672
+        },
+        "harness|hellaswag|10": {
+            "acc": 0.445628360884286,
+            "acc_stderr": 0.004960191341430244,
+            "acc_norm": 0.589523999203346,
+            "acc_norm_stderr": 0.004909148239488273
+        },
+        "harness|mmlu_world_religions|5": {
+            "acc": 0.6432748538011696,
+            "acc_stderr": 0.03674013002860954,
+            "acc_norm": 0.6432748538011696,
+            "acc_norm_stderr": 0.03674013002860954
+        },
+        "harness|mmlu_management|5": {
+            "acc": 0.6116504854368932,
+            "acc_stderr": 0.04825729337356389,
+            "acc_norm": 0.6116504854368932,
+            "acc_norm_stderr": 0.04825729337356389
+        },
+        "harness|mmlu_miscellaneous|5": {
+            "acc": 0.6155810983397191,
+            "acc_stderr": 0.01739568874281962,
+            "acc_norm": 0.6155810983397191,
+            "acc_norm_stderr": 0.01739568874281962
+        },
+        "harness|mmlu_anatomy|5": {
+            "acc": 0.4962962962962963,
+            "acc_stderr": 0.04319223625811331,
+            "acc_norm": 0.4962962962962963,
+            "acc_norm_stderr": 0.04319223625811331
+        },
+        "harness|mmlu_abstract_algebra|5": {
+            "acc": 0.26,
+            "acc_stderr": 0.04408440022768077,
+            "acc_norm": 0.26,
+            "acc_norm_stderr": 0.04408440022768077
+        },
+        "harness|mmlu_conceptual_physics|5": {
+            "acc": 0.4553191489361702,
+            "acc_stderr": 0.03255525359340354,
+            "acc_norm": 0.4553191489361702,
+            "acc_norm_stderr": 0.03255525359340354
+        },
+        "harness|mmlu_virology|5": {
+            "acc": 0.5180722891566265,
+            "acc_stderr": 0.038899512528272166,
+            "acc_norm": 0.5180722891566265,
+            "acc_norm_stderr": 0.038899512528272166
+        },
+        "harness|mmlu_philosophy|5": {
+            "acc": 0.5755627009646302,
+            "acc_stderr": 0.028071928247946205,
+            "acc_norm": 0.5755627009646302,
+            "acc_norm_stderr": 0.028071928247946205
+        },
+        "harness|mmlu_human_aging|5": {
+            "acc": 0.5650224215246636,
+            "acc_stderr": 0.033272833702713445,
+            "acc_norm": 0.5650224215246636,
+            "acc_norm_stderr": 0.033272833702713445
+        },
+        "harness|mmlu_human_sexuality|5": {
+            "acc": 0.5877862595419847,
+            "acc_stderr": 0.04317171194870255,
+            "acc_norm": 0.5877862595419847,
+            "acc_norm_stderr": 0.04317171194870255
+        },
+        "harness|mmlu_medical_genetics|5": {
+            "acc": 0.5,
+            "acc_stderr": 0.050251890762960605,
+            "acc_norm": 0.5,
+            "acc_norm_stderr": 0.050251890762960605
+        },
+        "harness|mmlu_high_school_geography|5": {
+            "acc": 0.6515151515151515,
+            "acc_stderr": 0.033948539651564025,
+            "acc_norm": 0.6515151515151515,
+            "acc_norm_stderr": 0.033948539651564025
+        },
+        "harness|mmlu_electrical_engineering|5": {
+            "acc": 0.503448275862069,
+            "acc_stderr": 0.04166567577101579,
+            "acc_norm": 0.503448275862069,
+            "acc_norm_stderr": 0.04166567577101579
+        },
+        "harness|mmlu_college_physics|5": {
+            "acc": 0.2549019607843137,
+            "acc_stderr": 0.043364327079931785,
+            "acc_norm": 0.2549019607843137,
+            "acc_norm_stderr": 0.043364327079931785
+        },
+        "harness|mmlu_high_school_microeconomics|5": {
+            "acc": 0.5756302521008403,
+            "acc_stderr": 0.03210479051015776,
+            "acc_norm": 0.5756302521008403,
+            "acc_norm_stderr": 0.03210479051015776
+        },
+        "harness|mmlu_high_school_macroeconomics|5": {
+            "acc": 0.541025641025641,
+            "acc_stderr": 0.025265525491284295,
+            "acc_norm": 0.541025641025641,
+            "acc_norm_stderr": 0.025265525491284295
+        },
+        "harness|mmlu_computer_security|5": {
+            "acc": 0.54,
+            "acc_stderr": 0.05009082659620332,
+            "acc_norm": 0.54,
+            "acc_norm_stderr": 0.05009082659620332
+        },
+        "harness|mmlu_global_facts|5": {
+            "acc": 0.31,
+            "acc_stderr": 0.04648231987117316,
+            "acc_norm": 0.31,
+            "acc_norm_stderr": 0.04648231987117316
+        },
+        "harness|mmlu_jurisprudence|5": {
+            "acc": 0.5555555555555556,
+            "acc_stderr": 0.04803752235190192,
+            "acc_norm": 0.5555555555555556,
+            "acc_norm_stderr": 0.04803752235190192
+        },
+        "harness|mmlu_high_school_chemistry|5": {
+            "acc": 0.3842364532019704,
+            "acc_stderr": 0.0342239856565755,
+            "acc_norm": 0.3842364532019704,
+            "acc_norm_stderr": 0.0342239856565755
+        },
+        "harness|mmlu_high_school_biology|5": {
+            "acc": 0.5774193548387097,
+            "acc_stderr": 0.02810096472427264,
+            "acc_norm": 0.5774193548387097,
+            "acc_norm_stderr": 0.02810096472427264
+        },
+        "harness|mmlu_marketing|5": {
+            "acc": 0.7777777777777778,
+            "acc_stderr": 0.027236013946196673,
+            "acc_norm": 0.7777777777777778,
+            "acc_norm_stderr": 0.027236013946196673
+        },
+        "harness|mmlu_clinical_knowledge|5": {
+            "acc": 0.4981132075471698,
+            "acc_stderr": 0.030772653642075657,
+            "acc_norm": 0.4981132075471698,
+            "acc_norm_stderr": 0.030772653642075657
+        },
+        "harness|mmlu_public_relations|5": {
+            "acc": 0.5272727272727272,
+            "acc_stderr": 0.04782001791380061,
+            "acc_norm": 0.5272727272727272,
+            "acc_norm_stderr": 0.04782001791380061
+        },
+        "harness|mmlu_high_school_mathematics|5": {
+            "acc": 0.25555555555555554,
+            "acc_stderr": 0.026593939101844082,
+            "acc_norm": 0.25555555555555554,
+            "acc_norm_stderr": 0.026593939101844082
+        },
+        "harness|mmlu_high_school_physics|5": {
+            "acc": 0.33774834437086093,
+            "acc_stderr": 0.038615575462551684,
+            "acc_norm": 0.33774834437086093,
+            "acc_norm_stderr": 0.038615575462551684
+        },
+        "harness|mmlu_sociology|5": {
+            "acc": 0.7064676616915423,
+            "acc_stderr": 0.032200241045342054,
+            "acc_norm": 0.7064676616915423,
+            "acc_norm_stderr": 0.032200241045342054
+        },
+        "harness|mmlu_college_medicine|5": {
+            "acc": 0.4797687861271676,
+            "acc_stderr": 0.03809342081273958,
+            "acc_norm": 0.4797687861271676,
+            "acc_norm_stderr": 0.03809342081273958
+        },
+        "harness|mmlu_elementary_mathematics|5": {
+            "acc": 0.38095238095238093,
+            "acc_stderr": 0.025010749116137602,
+            "acc_norm": 0.38095238095238093,
+            "acc_norm_stderr": 0.025010749116137602
+        },
+        "harness|mmlu_college_biology|5": {
+            "acc": 0.4236111111111111,
+            "acc_stderr": 0.041321250197233685,
+            "acc_norm": 0.4236111111111111,
+            "acc_norm_stderr": 0.041321250197233685
+        },
+        "harness|mmlu_college_chemistry|5": {
+            "acc": 0.31,
+            "acc_stderr": 0.04648231987117316,
+            "acc_norm": 0.31,
+            "acc_norm_stderr": 0.04648231987117316
+        },
+        "harness|mmlu_us_foreign_policy|5": {
+            "acc": 0.71,
+            "acc_stderr": 0.04560480215720683,
+            "acc_norm": 0.71,
+            "acc_norm_stderr": 0.04560480215720683
+        },
+        "harness|mmlu_moral_disputes|5": {
+            "acc": 0.5751445086705202,
+            "acc_stderr": 0.026613350840261733,
+            "acc_norm": 0.5751445086705202,
+            "acc_norm_stderr": 0.026613350840261733
+        },
+        "harness|mmlu_logical_fallacies|5": {
+            "acc": 0.5030674846625767,
+            "acc_stderr": 0.03928297078179662,
+            "acc_norm": 0.5030674846625767,
+            "acc_norm_stderr": 0.03928297078179662
+        },
+        "harness|mmlu_prehistory|5": {
+            "acc": 0.5370370370370371,
+            "acc_stderr": 0.027744313443376536,
+            "acc_norm": 0.5370370370370371,
+            "acc_norm_stderr": 0.027744313443376536
+        },
+        "harness|mmlu_college_mathematics|5": {
+            "acc": 0.33,
+            "acc_stderr": 0.04725815626252606,
+            "acc_norm": 0.33,
+            "acc_norm_stderr": 0.04725815626252606
+        },
+        "harness|mmlu_high_school_government_and_politics|5": {
+            "acc": 0.6217616580310881,
+            "acc_stderr": 0.034998072761933376,
+            "acc_norm": 0.6217616580310881,
+            "acc_norm_stderr": 0.034998072761933376
+        },
+        "harness|mmlu_econometrics|5": {
+            "acc": 0.37719298245614036,
+            "acc_stderr": 0.04559522141958216,
+            "acc_norm": 0.37719298245614036,
+            "acc_norm_stderr": 0.04559522141958216
+        },
+        "harness|mmlu_high_school_psychology|5": {
+            "acc": 0.6385321100917432,
+            "acc_stderr": 0.02059808200993736,
+            "acc_norm": 0.6385321100917432,
+            "acc_norm_stderr": 0.02059808200993736
+        },
+        "harness|mmlu_formal_logic|5": {
+            "acc": 0.4126984126984127,
+            "acc_stderr": 0.04403438954768177,
+            "acc_norm": 0.4126984126984127,
+            "acc_norm_stderr": 0.04403438954768177
+        },
+        "harness|mmlu_nutrition|5": {
+            "acc": 0.5261437908496732,
+            "acc_stderr": 0.028590752958852387,
+            "acc_norm": 0.5261437908496732,
+            "acc_norm_stderr": 0.028590752958852387
+        },
+        "harness|mmlu_business_ethics|5": {
+            "acc": 0.57,
+            "acc_stderr": 0.049756985195624284,
+            "acc_norm": 0.57,
+            "acc_norm_stderr": 0.049756985195624284
+        },
+        "harness|mmlu_international_law|5": {
+            "acc": 0.7520661157024794,
+            "acc_stderr": 0.03941897526516304,
+            "acc_norm": 0.7520661157024794,
+            "acc_norm_stderr": 0.03941897526516304
+        },
+        "harness|mmlu_astronomy|5": {
+            "acc": 0.5789473684210527,
+            "acc_stderr": 0.040179012759817494,
+            "acc_norm": 0.5789473684210527,
+            "acc_norm_stderr": 0.040179012759817494
+        },
+        "harness|mmlu_professional_psychology|5": {
+            "acc": 0.4738562091503268,
+            "acc_stderr": 0.020200164564804588,
+            "acc_norm": 0.4738562091503268,
+            "acc_norm_stderr": 0.020200164564804588
+        },
+        "harness|mmlu_professional_accounting|5": {
+            "acc": 0.3404255319148936,
+            "acc_stderr": 0.02826765748265013,
+            "acc_norm": 0.3404255319148936,
+            "acc_norm_stderr": 0.02826765748265013
+        },
+        "harness|mmlu_machine_learning|5": {
+            "acc": 0.38392857142857145,
+            "acc_stderr": 0.046161430750285455,
+            "acc_norm": 0.38392857142857145,
+            "acc_norm_stderr": 0.046161430750285455
+        },
+        "harness|mmlu_high_school_statistics|5": {
+            "acc": 0.4675925925925926,
+            "acc_stderr": 0.03402801581358966,
+            "acc_norm": 0.4675925925925926,
+            "acc_norm_stderr": 0.03402801581358966
+        },
+        "harness|mmlu_moral_scenarios|5": {
+            "acc": 0.21675977653631284,
+            "acc_stderr": 0.013780598486443363,
+            "acc_norm": 0.21675977653631284,
+            "acc_norm_stderr": 0.013780598486443363
+        },
+        "harness|mmlu_college_computer_science|5": {
+            "acc": 0.39,
+            "acc_stderr": 0.04902071300001975,
+            "acc_norm": 0.39,
+            "acc_norm_stderr": 0.04902071300001975
+        },
+        "harness|mmlu_high_school_computer_science|5": {
+            "acc": 0.71,
+            "acc_stderr": 0.04560480215720684,
+            "acc_norm": 0.71,
+            "acc_norm_stderr": 0.04560480215720684
+        },
+        "harness|mmlu_professional_medicine|5": {
+            "acc": 0.4411764705882353,
+            "acc_stderr": 0.0301619119307671,
+            "acc_norm": 0.4411764705882353,
+            "acc_norm_stderr": 0.0301619119307671
+        },
+        "harness|mmlu_security_studies|5": {
+            "acc": 0.6285714285714286,
+            "acc_stderr": 0.03093285879278986,
+            "acc_norm": 0.6285714285714286,
+            "acc_norm_stderr": 0.03093285879278986
+        },
+        "harness|mmlu_high_school_world_history|5": {
+            "acc": 0.70042194092827,
+            "acc_stderr": 0.029818024749753095,
+            "acc_norm": 0.70042194092827,
+            "acc_norm_stderr": 0.029818024749753095
+        },
+        "harness|mmlu_professional_law|5": {
+            "acc": 0.378748370273794,
+            "acc_stderr": 0.012389052105003741,
+            "acc_norm": 0.378748370273794,
+            "acc_norm_stderr": 0.012389052105003741
+        },
+        "harness|mmlu_high_school_us_history|5": {
+            "acc": 0.6225490196078431,
+            "acc_stderr": 0.03402272044340703,
+            "acc_norm": 0.6225490196078431,
+            "acc_norm_stderr": 0.03402272044340703
+        },
+        "harness|mmlu_high_school_european_history|5": {
+            "acc": 0.6666666666666666,
+            "acc_stderr": 0.03681050869161549,
+            "acc_norm": 0.6666666666666666,
+            "acc_norm_stderr": 0.03681050869161549
+        },
+        "harness|truthfulqa_mc|0": {
+            "mc1": 0.33659730722154224,
+            "mc1_stderr": 0.016542412809494877,
+            "mc2": 0.49995145184296846,
+            "mc2_stderr": 0.015887726098900913
+        },
+        "harness|commongen_v2|2": {
+            "acc": 0.564344746162928,
+            "acc_stderr": 0.017047415229476316,
+            "acc_norm": 0.6068476977567887,
+            "acc_norm_stderr": 0.016793262801287068
+        }
+    },
+    "versions": {
+        "all": 0,
+        "harness|arc_challenge|25": 0,
+        "harness|hellaswag|10": 0,
+        "harness|mmlu_world_religions|5": 1,
+        "harness|mmlu_management|5": 1,
+        "harness|mmlu_miscellaneous|5": 1,
+        "harness|mmlu_anatomy|5": 1,
+        "harness|mmlu_abstract_algebra|5": 1,
+        "harness|mmlu_conceptual_physics|5": 1,
+        "harness|mmlu_virology|5": 1,
+        "harness|mmlu_philosophy|5": 1,
+        "harness|mmlu_human_aging|5": 1,
+        "harness|mmlu_human_sexuality|5": 1,
+        "harness|mmlu_medical_genetics|5": 1,
+        "harness|mmlu_high_school_geography|5": 1,
+        "harness|mmlu_electrical_engineering|5": 1,
+        "harness|mmlu_college_physics|5": 1,
+        "harness|mmlu_high_school_microeconomics|5": 1,
+        "harness|mmlu_high_school_macroeconomics|5": 1,
+        "harness|mmlu_computer_security|5": 1,
+        "harness|mmlu_global_facts|5": 1,
+        "harness|mmlu_jurisprudence|5": 1,
+        "harness|mmlu_high_school_chemistry|5": 1,
+        "harness|mmlu_high_school_biology|5": 1,
+        "harness|mmlu_marketing|5": 1,
+        "harness|mmlu_clinical_knowledge|5": 1,
+        "harness|mmlu_public_relations|5": 1,
+        "harness|mmlu_high_school_mathematics|5": 1,
+        "harness|mmlu_high_school_physics|5": 1,
+        "harness|mmlu_sociology|5": 1,
+        "harness|mmlu_college_medicine|5": 1,
+        "harness|mmlu_elementary_mathematics|5": 1,
+        "harness|mmlu_college_biology|5": 1,
+        "harness|mmlu_college_chemistry|5": 1,
+        "harness|mmlu_us_foreign_policy|5": 1,
+        "harness|mmlu_moral_disputes|5": 1,
+        "harness|mmlu_logical_fallacies|5": 1,
+        "harness|mmlu_prehistory|5": 1,
+        "harness|mmlu_college_mathematics|5": 1,
+        "harness|mmlu_high_school_government_and_politics|5": 1,
+        "harness|mmlu_econometrics|5": 1,
+        "harness|mmlu_high_school_psychology|5": 1,
+        "harness|mmlu_formal_logic|5": 1,
+        "harness|mmlu_nutrition|5": 1,
+        "harness|mmlu_business_ethics|5": 1,
+        "harness|mmlu_international_law|5": 1,
+        "harness|mmlu_astronomy|5": 1,
+        "harness|mmlu_professional_psychology|5": 1,
+        "harness|mmlu_professional_accounting|5": 1,
+        "harness|mmlu_machine_learning|5": 1,
+        "harness|mmlu_high_school_statistics|5": 1,
+        "harness|mmlu_moral_scenarios|5": 1,
+        "harness|mmlu_college_computer_science|5": 1,
+        "harness|mmlu_high_school_computer_science|5": 1,
+        "harness|mmlu_professional_medicine|5": 1,
+        "harness|mmlu_security_studies|5": 1,
+        "harness|mmlu_high_school_world_history|5": 1,
+        "harness|mmlu_professional_law|5": 1,
+        "harness|mmlu_high_school_us_history|5": 1,
+        "harness|mmlu_high_school_european_history|5": 1,
+        "harness|truthfulqa_mc|0": 0,
+        "harness|commongen_v2|2": 1
+    },
+    "config_general": {
+        "model_name": "nlpai-lab/KULLM3",
+        "model_sha": "5a6bcd0fc7f240460eb6d57016f7b4060bc1f43b",
+        "model_dtype": "torch.float16",
+        "lighteval_sha": "",
+        "num_few_shot_default": 0,
+        "num_fewshot_seeds": 1,
+        "override_batch_size": 1,
+        "max_samples": null
+    }
+}
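
For reference, a minimal Python sketch of how result files of the shape above can be read and aggregated. This is illustrative only, not this Space's own scoring code: the unweighted macro-average, the preference for acc_norm over acc, and the use of mc2 for TruthfulQA are all assumptions.

import json
import math

def macro_average(path: str) -> float:
    # Unweighted mean over all harness tasks in one result.json
    # (assumed aggregation, not necessarily the leaderboard's scheme).
    with open(path, encoding="utf-8") as f:
        results = json.load(f)["results"]
    scores = []
    for task, metrics in results.items():
        if not task.startswith("harness|"):
            continue  # skip the "daily"/"quarterly" counters
        # Use acc_norm where present; truthfulqa_mc reports mc2 instead.
        score = metrics.get("acc_norm", metrics.get("mc2", metrics.get("acc")))
        if score is not None and not math.isnan(score):  # some files carry NaN mc2
            scores.append(score)
    return sum(scores) / len(scores)

print(macro_average("eval-results/nlpai-lab/KULLM3/result.json"))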
eval-results/x2bee/POLAR-14B-DPO-v1.3/result.json
ADDED
@@ -0,0 +1,450 @@
+{
+    "results": {
+        "daily": {
+            "daily": 4
+        },
+        "quarterly": {
+            "quarterly": 4
+        },
+        "harness|arc_challenge|25": {
+            "acc": 0.7465870307167235,
+            "acc_stderr": 0.012710896778378604,
+            "acc_norm": 0.7807167235494881,
+            "acc_norm_stderr": 0.012091245787615728
+        },
+        "harness|hellaswag|10": {
+            "acc": 0.6385182234614618,
+            "acc_stderr": 0.004794478426382617,
+            "acc_norm": 0.7561242780322645,
+            "acc_norm_stderr": 0.004285410130466119
+        },
+        "harness|mmlu_world_religions|5": {
+            "acc": 0.6900584795321637,
+            "acc_stderr": 0.035469769593931624,
+            "acc_norm": 0.6900584795321637,
+            "acc_norm_stderr": 0.035469769593931624
+        },
+        "harness|mmlu_management|5": {
+            "acc": 0.6601941747572816,
+            "acc_stderr": 0.046897659372781335,
+            "acc_norm": 0.6601941747572816,
+            "acc_norm_stderr": 0.046897659372781335
+        },
+        "harness|mmlu_miscellaneous|5": {
+            "acc": 0.6845466155810983,
+            "acc_stderr": 0.016617501738763408,
+            "acc_norm": 0.6845466155810983,
+            "acc_norm_stderr": 0.016617501738763408
+        },
+        "harness|mmlu_anatomy|5": {
+            "acc": 0.48148148148148145,
+            "acc_stderr": 0.04316378599511324,
+            "acc_norm": 0.48148148148148145,
+            "acc_norm_stderr": 0.04316378599511324
+        },
+        "harness|mmlu_abstract_algebra|5": {
+            "acc": 0.33,
+            "acc_stderr": 0.047258156262526045,
+            "acc_norm": 0.33,
+            "acc_norm_stderr": 0.047258156262526045
+        },
+        "harness|mmlu_conceptual_physics|5": {
+            "acc": 0.46808510638297873,
+            "acc_stderr": 0.03261936918467383,
+            "acc_norm": 0.46808510638297873,
+            "acc_norm_stderr": 0.03261936918467383
+        },
+        "harness|mmlu_virology|5": {
+            "acc": 0.4759036144578313,
+            "acc_stderr": 0.03887971849597264,
+            "acc_norm": 0.4759036144578313,
+            "acc_norm_stderr": 0.03887971849597264
+        },
+        "harness|mmlu_philosophy|5": {
+            "acc": 0.6334405144694534,
+            "acc_stderr": 0.02736807824397163,
+            "acc_norm": 0.6334405144694534,
+            "acc_norm_stderr": 0.02736807824397163
+        },
+        "harness|mmlu_human_aging|5": {
+            "acc": 0.6681614349775785,
+            "acc_stderr": 0.03160295143776679,
+            "acc_norm": 0.6681614349775785,
+            "acc_norm_stderr": 0.03160295143776679
+        },
+        "harness|mmlu_human_sexuality|5": {
+            "acc": 0.6030534351145038,
+            "acc_stderr": 0.04291135671009224,
+            "acc_norm": 0.6030534351145038,
+            "acc_norm_stderr": 0.04291135671009224
+        },
+        "harness|mmlu_medical_genetics|5": {
+            "acc": 0.51,
+            "acc_stderr": 0.05024183937956911,
+            "acc_norm": 0.51,
+            "acc_norm_stderr": 0.05024183937956911
+        },
+        "harness|mmlu_high_school_geography|5": {
+            "acc": 0.7222222222222222,
+            "acc_stderr": 0.03191178226713547,
+            "acc_norm": 0.7222222222222222,
+            "acc_norm_stderr": 0.03191178226713547
+        },
+        "harness|mmlu_electrical_engineering|5": {
+            "acc": 0.47586206896551725,
+            "acc_stderr": 0.041618085035015295,
+            "acc_norm": 0.47586206896551725,
+            "acc_norm_stderr": 0.041618085035015295
+        },
+        "harness|mmlu_college_physics|5": {
+            "acc": 0.2549019607843137,
+            "acc_stderr": 0.04336432707993178,
+            "acc_norm": 0.2549019607843137,
+            "acc_norm_stderr": 0.04336432707993178
+        },
+        "harness|mmlu_high_school_microeconomics|5": {
+            "acc": 0.592436974789916,
+            "acc_stderr": 0.031918633744784666,
+            "acc_norm": 0.592436974789916,
+            "acc_norm_stderr": 0.031918633744784666
+        },
+        "harness|mmlu_high_school_macroeconomics|5": {
+            "acc": 0.5948717948717949,
+            "acc_stderr": 0.024890471769938142,
+            "acc_norm": 0.5948717948717949,
+            "acc_norm_stderr": 0.024890471769938142
+        },
+        "harness|mmlu_computer_security|5": {
+            "acc": 0.66,
+            "acc_stderr": 0.04760952285695237,
+            "acc_norm": 0.66,
+            "acc_norm_stderr": 0.04760952285695237
+        },
+        "harness|mmlu_global_facts|5": {
+            "acc": 0.27,
+            "acc_stderr": 0.04461960433384739,
+            "acc_norm": 0.27,
+            "acc_norm_stderr": 0.04461960433384739
+        },
+        "harness|mmlu_jurisprudence|5": {
+            "acc": 0.6388888888888888,
+            "acc_stderr": 0.04643454608906275,
+            "acc_norm": 0.6388888888888888,
+            "acc_norm_stderr": 0.04643454608906275
+        },
+        "harness|mmlu_high_school_chemistry|5": {
+            "acc": 0.4433497536945813,
+            "acc_stderr": 0.034953345821629345,
+            "acc_norm": 0.4433497536945813,
+            "acc_norm_stderr": 0.034953345821629345
+        },
+        "harness|mmlu_high_school_biology|5": {
+            "acc": 0.5806451612903226,
+            "acc_stderr": 0.028071588901091838,
+            "acc_norm": 0.5806451612903226,
+            "acc_norm_stderr": 0.028071588901091838
+        },
+        "harness|mmlu_marketing|5": {
+            "acc": 0.811965811965812,
+            "acc_stderr": 0.025598193686652254,
+            "acc_norm": 0.811965811965812,
+            "acc_norm_stderr": 0.025598193686652254
+        },
+        "harness|mmlu_clinical_knowledge|5": {
+            "acc": 0.5169811320754717,
+            "acc_stderr": 0.030755120364119898,
+            "acc_norm": 0.5169811320754717,
+            "acc_norm_stderr": 0.030755120364119898
+        },
+        "harness|mmlu_public_relations|5": {
+            "acc": 0.5818181818181818,
+            "acc_stderr": 0.04724577405731573,
+            "acc_norm": 0.5818181818181818,
+            "acc_norm_stderr": 0.04724577405731573
+        },
+        "harness|mmlu_high_school_mathematics|5": {
+            "acc": 0.3888888888888889,
+            "acc_stderr": 0.029723278961476664,
+            "acc_norm": 0.3888888888888889,
+            "acc_norm_stderr": 0.029723278961476664
+        },
+        "harness|mmlu_high_school_physics|5": {
+            "acc": 0.3708609271523179,
+            "acc_stderr": 0.03943966699183629,
+            "acc_norm": 0.3708609271523179,
+            "acc_norm_stderr": 0.03943966699183629
+        },
+        "harness|mmlu_sociology|5": {
+            "acc": 0.6666666666666666,
+            "acc_stderr": 0.033333333333333326,
+            "acc_norm": 0.6666666666666666,
+            "acc_norm_stderr": 0.033333333333333326
+        },
+        "harness|mmlu_college_medicine|5": {
+            "acc": 0.47398843930635837,
+            "acc_stderr": 0.038073017265045125,
+            "acc_norm": 0.47398843930635837,
+            "acc_norm_stderr": 0.038073017265045125
+        },
+        "harness|mmlu_elementary_mathematics|5": {
+            "acc": 0.42328042328042326,
+            "acc_stderr": 0.025446365634406793,
+            "acc_norm": 0.42328042328042326,
+            "acc_norm_stderr": 0.025446365634406793
+        },
+        "harness|mmlu_college_biology|5": {
+            "acc": 0.5625,
+            "acc_stderr": 0.04148415739394154,
+            "acc_norm": 0.5625,
+            "acc_norm_stderr": 0.04148415739394154
+        },
+        "harness|mmlu_college_chemistry|5": {
+            "acc": 0.39,
+            "acc_stderr": 0.04902071300001975,
+            "acc_norm": 0.39,
+            "acc_norm_stderr": 0.04902071300001975
+        },
+        "harness|mmlu_us_foreign_policy|5": {
+            "acc": 0.78,
+            "acc_stderr": 0.04163331998932263,
+            "acc_norm": 0.78,
+            "acc_norm_stderr": 0.04163331998932263
+        },
+        "harness|mmlu_moral_disputes|5": {
+            "acc": 0.5491329479768786,
+            "acc_stderr": 0.026788811931562767,
+            "acc_norm": 0.5491329479768786,
+            "acc_norm_stderr": 0.026788811931562767
+        },
+        "harness|mmlu_logical_fallacies|5": {
+            "acc": 0.6319018404907976,
+            "acc_stderr": 0.03789213935838396,
+            "acc_norm": 0.6319018404907976,
+            "acc_norm_stderr": 0.03789213935838396
+        },
+        "harness|mmlu_prehistory|5": {
+            "acc": 0.5925925925925926,
+            "acc_stderr": 0.02733954664066273,
+            "acc_norm": 0.5925925925925926,
+            "acc_norm_stderr": 0.02733954664066273
+        },
+        "harness|mmlu_college_mathematics|5": {
+            "acc": 0.4,
+            "acc_stderr": 0.049236596391733084,
+            "acc_norm": 0.4,
+            "acc_norm_stderr": 0.049236596391733084
+        },
+        "harness|mmlu_high_school_government_and_politics|5": {
+            "acc": 0.7668393782383419,
+            "acc_stderr": 0.03051611137147601,
+            "acc_norm": 0.7668393782383419,
+            "acc_norm_stderr": 0.03051611137147601
+        },
+        "harness|mmlu_econometrics|5": {
+            "acc": 0.4473684210526316,
+            "acc_stderr": 0.046774730044912,
+            "acc_norm": 0.4473684210526316,
+            "acc_norm_stderr": 0.046774730044912
+        },
+        "harness|mmlu_high_school_psychology|5": {
+            "acc": 0.726605504587156,
+            "acc_stderr": 0.01910929984609827,
+            "acc_norm": 0.726605504587156,
+            "acc_norm_stderr": 0.01910929984609827
+        },
+        "harness|mmlu_formal_logic|5": {
+            "acc": 0.3968253968253968,
+            "acc_stderr": 0.04375888492727061,
+            "acc_norm": 0.3968253968253968,
+            "acc_norm_stderr": 0.04375888492727061
+        },
+        "harness|mmlu_nutrition|5": {
+            "acc": 0.6078431372549019,
+            "acc_stderr": 0.027956046165424516,
+            "acc_norm": 0.6078431372549019,
+            "acc_norm_stderr": 0.027956046165424516
+        },
+        "harness|mmlu_business_ethics|5": {
+            "acc": 0.55,
+            "acc_stderr": 0.05,
+            "acc_norm": 0.55,
+            "acc_norm_stderr": 0.05
+        },
+        "harness|mmlu_international_law|5": {
+            "acc": 0.6942148760330579,
+            "acc_stderr": 0.04205953933884122,
+            "acc_norm": 0.6942148760330579,
+            "acc_norm_stderr": 0.04205953933884122
+        },
+        "harness|mmlu_astronomy|5": {
+            "acc": 0.618421052631579,
+            "acc_stderr": 0.03953173377749194,
+            "acc_norm": 0.618421052631579,
+            "acc_norm_stderr": 0.03953173377749194
+        },
+        "harness|mmlu_professional_psychology|5": {
+            "acc": 0.5669934640522876,
+            "acc_stderr": 0.02004544247332422,
+            "acc_norm": 0.5669934640522876,
+            "acc_norm_stderr": 0.02004544247332422
+        },
+        "harness|mmlu_professional_accounting|5": {
+            "acc": 0.4219858156028369,
+            "acc_stderr": 0.029462189233370586,
+            "acc_norm": 0.4219858156028369,
+            "acc_norm_stderr": 0.029462189233370586
+        },
+        "harness|mmlu_machine_learning|5": {
+            "acc": 0.5089285714285714,
+            "acc_stderr": 0.04745033255489123,
+            "acc_norm": 0.5089285714285714,
+            "acc_norm_stderr": 0.04745033255489123
+        },
+        "harness|mmlu_high_school_statistics|5": {
+            "acc": 0.4351851851851852,
+            "acc_stderr": 0.03381200005643526,
+            "acc_norm": 0.4351851851851852,
+            "acc_norm_stderr": 0.03381200005643526
+        },
+        "harness|mmlu_moral_scenarios|5": {
+            "acc": 0.3787709497206704,
+            "acc_stderr": 0.016223533510365117,
+            "acc_norm": 0.3787709497206704,
+            "acc_norm_stderr": 0.016223533510365117
+        },
+        "harness|mmlu_college_computer_science|5": {
+            "acc": 0.47,
+            "acc_stderr": 0.05016135580465919,
+            "acc_norm": 0.47,
+            "acc_norm_stderr": 0.05016135580465919
+        },
+        "harness|mmlu_high_school_computer_science|5": {
+            "acc": 0.66,
+            "acc_stderr": 0.04760952285695238,
+            "acc_norm": 0.66,
+            "acc_norm_stderr": 0.04760952285695238
+        },
+        "harness|mmlu_professional_medicine|5": {
+            "acc": 0.48161764705882354,
+            "acc_stderr": 0.03035230339535196,
+            "acc_norm": 0.48161764705882354,
+            "acc_norm_stderr": 0.03035230339535196
+        },
+        "harness|mmlu_security_studies|5": {
+            "acc": 0.6448979591836734,
+            "acc_stderr": 0.030635655150387634,
+            "acc_norm": 0.6448979591836734,
+            "acc_norm_stderr": 0.030635655150387634
+        },
+        "harness|mmlu_high_school_world_history|5": {
+            "acc": 0.729957805907173,
+            "acc_stderr": 0.028900721906293426,
+            "acc_norm": 0.729957805907173,
+            "acc_norm_stderr": 0.028900721906293426
+        },
+        "harness|mmlu_professional_law|5": {
+            "acc": 0.41460234680573665,
+            "acc_stderr": 0.012582597058908284,
+            "acc_norm": 0.41460234680573665,
+            "acc_norm_stderr": 0.012582597058908284
+        },
+        "harness|mmlu_high_school_us_history|5": {
+            "acc": 0.6421568627450981,
+            "acc_stderr": 0.03364487286088298,
+            "acc_norm": 0.6421568627450981,
+            "acc_norm_stderr": 0.03364487286088298
+        },
+        "harness|mmlu_high_school_european_history|5": {
+            "acc": 0.6181818181818182,
+            "acc_stderr": 0.03793713171165635,
+            "acc_norm": 0.6181818181818182,
+            "acc_norm_stderr": 0.03793713171165635
+        },
+        "harness|truthfulqa_mc|0": {
+            "mc1": 0.6328029375764994,
+            "mc1_stderr": 0.01687480500145318,
+            "mc2": 0.7522925779273922,
+            "mc2_stderr": 0.014568927682929578
+        },
+        "harness|commongen_v2|2": {
+            "acc": 0.45218417945690675,
+            "acc_stderr": 0.017111567130916785,
+            "acc_norm": 0.45454545454545453,
+            "acc_norm_stderr": 0.017119172208061504
+        }
+    },
+    "versions": {
+        "all": 0,
+        "harness|arc_challenge|25": 0,
+        "harness|hellaswag|10": 0,
+        "harness|mmlu_world_religions|5": 1,
+        "harness|mmlu_management|5": 1,
+        "harness|mmlu_miscellaneous|5": 1,
+        "harness|mmlu_anatomy|5": 1,
+        "harness|mmlu_abstract_algebra|5": 1,
+        "harness|mmlu_conceptual_physics|5": 1,
+        "harness|mmlu_virology|5": 1,
+        "harness|mmlu_philosophy|5": 1,
+        "harness|mmlu_human_aging|5": 1,
+        "harness|mmlu_human_sexuality|5": 1,
+        "harness|mmlu_medical_genetics|5": 1,
+        "harness|mmlu_high_school_geography|5": 1,
+        "harness|mmlu_electrical_engineering|5": 1,
+        "harness|mmlu_college_physics|5": 1,
+        "harness|mmlu_high_school_microeconomics|5": 1,
+        "harness|mmlu_high_school_macroeconomics|5": 1,
+        "harness|mmlu_computer_security|5": 1,
+        "harness|mmlu_global_facts|5": 1,
+        "harness|mmlu_jurisprudence|5": 1,
+        "harness|mmlu_high_school_chemistry|5": 1,
+        "harness|mmlu_high_school_biology|5": 1,
+        "harness|mmlu_marketing|5": 1,
+        "harness|mmlu_clinical_knowledge|5": 1,
+        "harness|mmlu_public_relations|5": 1,
+        "harness|mmlu_high_school_mathematics|5": 1,
+        "harness|mmlu_high_school_physics|5": 1,
+        "harness|mmlu_sociology|5": 1,
+        "harness|mmlu_college_medicine|5": 1,
+        "harness|mmlu_elementary_mathematics|5": 1,
+        "harness|mmlu_college_biology|5": 1,
+        "harness|mmlu_college_chemistry|5": 1,
+        "harness|mmlu_us_foreign_policy|5": 1,
+        "harness|mmlu_moral_disputes|5": 1,
+        "harness|mmlu_logical_fallacies|5": 1,
+        "harness|mmlu_prehistory|5": 1,
+        "harness|mmlu_college_mathematics|5": 1,
+        "harness|mmlu_high_school_government_and_politics|5": 1,
+        "harness|mmlu_econometrics|5": 1,
+        "harness|mmlu_high_school_psychology|5": 1,
+        "harness|mmlu_formal_logic|5": 1,
+        "harness|mmlu_nutrition|5": 1,
+        "harness|mmlu_business_ethics|5": 1,
+        "harness|mmlu_international_law|5": 1,
+        "harness|mmlu_astronomy|5": 1,
+        "harness|mmlu_professional_psychology|5": 1,
+        "harness|mmlu_professional_accounting|5": 1,
+        "harness|mmlu_machine_learning|5": 1,
+        "harness|mmlu_high_school_statistics|5": 1,
+        "harness|mmlu_moral_scenarios|5": 1,
+        "harness|mmlu_college_computer_science|5": 1,
+        "harness|mmlu_high_school_computer_science|5": 1,
+        "harness|mmlu_professional_medicine|5": 1,
+        "harness|mmlu_security_studies|5": 1,
+        "harness|mmlu_high_school_world_history|5": 1,
+        "harness|mmlu_professional_law|5": 1,
+        "harness|mmlu_high_school_us_history|5": 1,
+        "harness|mmlu_high_school_european_history|5": 1,
+        "harness|truthfulqa_mc|0": 0,
+        "harness|commongen_v2|2": 1
+    },
+    "config_general": {
+        "model_name": "x2bee/POLAR-14B-DPO-v1.3",
+        "model_sha": "337edbed4c86db2da27e3b0e07086134f8d27a09",
+        "model_dtype": "torch.float16",
+        "lighteval_sha": "",
+        "num_few_shot_default": 0,
+        "num_fewshot_seeds": 1,
+        "override_batch_size": 1,
+        "max_samples": null
+    }
+}
eval-results/x2bee/POLAR-14B-DPO-v1.4/result.json
ADDED
@@ -0,0 +1,450 @@
+{
+    "results": {
+        "daily": {
+            "daily": 7
+        },
+        "quarterly": {
+            "quarterly": 7
+        },
+        "harness|arc_challenge|25": {
+            "acc": 0.7363481228668942,
+            "acc_stderr": 0.012875929151297058,
+            "acc_norm": 0.7491467576791809,
+            "acc_norm_stderr": 0.012668198621315433
+        },
+        "harness|hellaswag|10": {
+            "acc": 0.7228639713204541,
+            "acc_stderr": 0.004466695023677848,
+            "acc_norm": 0.7422824138617805,
+            "acc_norm_stderr": 0.004364838000335614
+        },
+        "harness|mmlu_world_religions|5": {
+            "acc": 0.6140350877192983,
+            "acc_stderr": 0.03733756969066164,
+            "acc_norm": 0.6140350877192983,
+            "acc_norm_stderr": 0.03733756969066164
+        },
+        "harness|mmlu_management|5": {
+            "acc": 0.6893203883495146,
+            "acc_stderr": 0.045821241601615506,
+            "acc_norm": 0.6893203883495146,
+            "acc_norm_stderr": 0.045821241601615506
+        },
+        "harness|mmlu_miscellaneous|5": {
+            "acc": 0.6526181353767561,
+            "acc_stderr": 0.017026671748655728,
+            "acc_norm": 0.6526181353767561,
+            "acc_norm_stderr": 0.017026671748655728
+        },
+        "harness|mmlu_anatomy|5": {
+            "acc": 0.5037037037037037,
+            "acc_stderr": 0.043192236258113324,
+            "acc_norm": 0.5037037037037037,
+            "acc_norm_stderr": 0.043192236258113324
+        },
+        "harness|mmlu_abstract_algebra|5": {
+            "acc": 0.37,
+            "acc_stderr": 0.048523658709391,
+            "acc_norm": 0.37,
+            "acc_norm_stderr": 0.048523658709391
+        },
+        "harness|mmlu_conceptual_physics|5": {
+            "acc": 0.451063829787234,
+            "acc_stderr": 0.032529096196131965,
+            "acc_norm": 0.451063829787234,
+            "acc_norm_stderr": 0.032529096196131965
+        },
+        "harness|mmlu_virology|5": {
+            "acc": 0.4939759036144578,
+            "acc_stderr": 0.03892212195333045,
+            "acc_norm": 0.4939759036144578,
+            "acc_norm_stderr": 0.03892212195333045
+        },
+        "harness|mmlu_philosophy|5": {
+            "acc": 0.5852090032154341,
+            "acc_stderr": 0.02798268045975956,
+            "acc_norm": 0.5852090032154341,
+            "acc_norm_stderr": 0.02798268045975956
+        },
+        "harness|mmlu_human_aging|5": {
+            "acc": 0.6412556053811659,
+            "acc_stderr": 0.032190792004199956,
+            "acc_norm": 0.6412556053811659,
+            "acc_norm_stderr": 0.032190792004199956
+        },
+        "harness|mmlu_human_sexuality|5": {
+            "acc": 0.5954198473282443,
+            "acc_stderr": 0.043046937953806645,
+            "acc_norm": 0.5954198473282443,
+            "acc_norm_stderr": 0.043046937953806645
+        },
+        "harness|mmlu_medical_genetics|5": {
+            "acc": 0.47,
+            "acc_stderr": 0.05016135580465919,
+            "acc_norm": 0.47,
+            "acc_norm_stderr": 0.05016135580465919
+        },
+        "harness|mmlu_high_school_geography|5": {
+            "acc": 0.6616161616161617,
+            "acc_stderr": 0.033711241426263014,
+            "acc_norm": 0.6616161616161617,
+            "acc_norm_stderr": 0.033711241426263014
+        },
+        "harness|mmlu_electrical_engineering|5": {
+            "acc": 0.4827586206896552,
+            "acc_stderr": 0.041641887201693775,
+            "acc_norm": 0.4827586206896552,
+            "acc_norm_stderr": 0.041641887201693775
+        },
+        "harness|mmlu_college_physics|5": {
+            "acc": 0.2549019607843137,
+            "acc_stderr": 0.04336432707993178,
+            "acc_norm": 0.2549019607843137,
+            "acc_norm_stderr": 0.04336432707993178
+        },
+        "harness|mmlu_high_school_microeconomics|5": {
+            "acc": 0.5882352941176471,
+            "acc_stderr": 0.031968769891957786,
+            "acc_norm": 0.5882352941176471,
+            "acc_norm_stderr": 0.031968769891957786
+        },
+        "harness|mmlu_high_school_macroeconomics|5": {
+            "acc": 0.6025641025641025,
+            "acc_stderr": 0.024811920017903836,
+            "acc_norm": 0.6025641025641025,
+            "acc_norm_stderr": 0.024811920017903836
+        },
+        "harness|mmlu_computer_security|5": {
+            "acc": 0.66,
+            "acc_stderr": 0.04760952285695237,
+            "acc_norm": 0.66,
+            "acc_norm_stderr": 0.04760952285695237
+        },
+        "harness|mmlu_global_facts|5": {
+            "acc": 0.35,
+            "acc_stderr": 0.047937248544110196,
+            "acc_norm": 0.35,
+            "acc_norm_stderr": 0.047937248544110196
+        },
+        "harness|mmlu_jurisprudence|5": {
+            "acc": 0.5925925925925926,
+            "acc_stderr": 0.04750077341199984,
+            "acc_norm": 0.5925925925925926,
+            "acc_norm_stderr": 0.04750077341199984
+        },
+        "harness|mmlu_high_school_chemistry|5": {
+            "acc": 0.43842364532019706,
+            "acc_stderr": 0.03491207857486518,
+            "acc_norm": 0.43842364532019706,
+            "acc_norm_stderr": 0.03491207857486518
+        },
+        "harness|mmlu_high_school_biology|5": {
+            "acc": 0.567741935483871,
+            "acc_stderr": 0.028181739720019413,
+            "acc_norm": 0.567741935483871,
+            "acc_norm_stderr": 0.028181739720019413
+        },
+        "harness|mmlu_marketing|5": {
+            "acc": 0.7948717948717948,
+            "acc_stderr": 0.026453508054040356,
+            "acc_norm": 0.7948717948717948,
+            "acc_norm_stderr": 0.026453508054040356
+        },
+        "harness|mmlu_clinical_knowledge|5": {
+            "acc": 0.5169811320754717,
+            "acc_stderr": 0.030755120364119905,
+            "acc_norm": 0.5169811320754717,
+            "acc_norm_stderr": 0.030755120364119905
+        },
+        "harness|mmlu_public_relations|5": {
+            "acc": 0.5727272727272728,
+            "acc_stderr": 0.047381987035454834,
+            "acc_norm": 0.5727272727272728,
+            "acc_norm_stderr": 0.047381987035454834
+        },
+        "harness|mmlu_high_school_mathematics|5": {
+            "acc": 0.3962962962962963,
+            "acc_stderr": 0.029822619458533997,
+            "acc_norm": 0.3962962962962963,
+            "acc_norm_stderr": 0.029822619458533997
+        },
+        "harness|mmlu_high_school_physics|5": {
+            "acc": 0.3708609271523179,
+            "acc_stderr": 0.03943966699183629,
+            "acc_norm": 0.3708609271523179,
+            "acc_norm_stderr": 0.03943966699183629
+        },
+        "harness|mmlu_sociology|5": {
+            "acc": 0.6766169154228856,
+            "acc_stderr": 0.03307615947979035,
+            "acc_norm": 0.6766169154228856,
+            "acc_norm_stderr": 0.03307615947979035
+        },
+        "harness|mmlu_college_medicine|5": {
+            "acc": 0.49710982658959535,
+            "acc_stderr": 0.038124005659748335,
+            "acc_norm": 0.49710982658959535,
+            "acc_norm_stderr": 0.038124005659748335
+        },
+        "harness|mmlu_elementary_mathematics|5": {
+            "acc": 0.42592592592592593,
+            "acc_stderr": 0.02546714904546955,
+            "acc_norm": 0.42592592592592593,
+            "acc_norm_stderr": 0.02546714904546955
+        },
+        "harness|mmlu_college_biology|5": {
+            "acc": 0.5555555555555556,
+            "acc_stderr": 0.04155319955593146,
+            "acc_norm": 0.5555555555555556,
+            "acc_norm_stderr": 0.04155319955593146
+        },
+        "harness|mmlu_college_chemistry|5": {
+            "acc": 0.4,
+            "acc_stderr": 0.04923659639173309,
+            "acc_norm": 0.4,
+            "acc_norm_stderr": 0.04923659639173309
+        },
+        "harness|mmlu_us_foreign_policy|5": {
+            "acc": 0.73,
+            "acc_stderr": 0.044619604333847394,
+            "acc_norm": 0.73,
+            "acc_norm_stderr": 0.044619604333847394
+        },
+        "harness|mmlu_moral_disputes|5": {
+            "acc": 0.5549132947976878,
+            "acc_stderr": 0.02675625512966377,
+            "acc_norm": 0.5549132947976878,
+            "acc_norm_stderr": 0.02675625512966377
+        },
+        "harness|mmlu_logical_fallacies|5": {
+            "acc": 0.588957055214724,
+            "acc_stderr": 0.038656978537853624,
+            "acc_norm": 0.588957055214724,
+            "acc_norm_stderr": 0.038656978537853624
+        },
+        "harness|mmlu_prehistory|5": {
+            "acc": 0.5771604938271605,
+            "acc_stderr": 0.027487472980871595,
+            "acc_norm": 0.5771604938271605,
+            "acc_norm_stderr": 0.027487472980871595
+        },
+        "harness|mmlu_college_mathematics|5": {
+            "acc": 0.4,
+            "acc_stderr": 0.049236596391733084,
+            "acc_norm": 0.4,
+            "acc_norm_stderr": 0.049236596391733084
+        },
+        "harness|mmlu_high_school_government_and_politics|5": {
+            "acc": 0.7305699481865285,
+            "acc_stderr": 0.032018671228777947,
+            "acc_norm": 0.7305699481865285,
+            "acc_norm_stderr": 0.032018671228777947
+        },
+        "harness|mmlu_econometrics|5": {
+            "acc": 0.42105263157894735,
+            "acc_stderr": 0.046446020912223177,
+            "acc_norm": 0.42105263157894735,
+            "acc_norm_stderr": 0.046446020912223177
+        },
+        "harness|mmlu_high_school_psychology|5": {
+            "acc": 0.7064220183486238,
+            "acc_stderr": 0.019525151122639663,
+            "acc_norm": 0.7064220183486238,
+            "acc_norm_stderr": 0.019525151122639663
+        },
+        "harness|mmlu_formal_logic|5": {
+            "acc": 0.3968253968253968,
+            "acc_stderr": 0.04375888492727061,
+            "acc_norm": 0.3968253968253968,
+            "acc_norm_stderr": 0.04375888492727061
+        },
+        "harness|mmlu_nutrition|5": {
+            "acc": 0.545751633986928,
+            "acc_stderr": 0.02850980780262659,
+            "acc_norm": 0.545751633986928,
+            "acc_norm_stderr": 0.02850980780262659
+        },
+        "harness|mmlu_business_ethics|5": {
+            "acc": 0.55,
+            "acc_stderr": 0.05000000000000001,
+            "acc_norm": 0.55,
+            "acc_norm_stderr": 0.05000000000000001
+        },
+        "harness|mmlu_international_law|5": {
+            "acc": 0.6859504132231405,
+            "acc_stderr": 0.04236964753041019,
+            "acc_norm": 0.6859504132231405,
+            "acc_norm_stderr": 0.04236964753041019
+        },
+        "harness|mmlu_astronomy|5": {
+            "acc": 0.6052631578947368,
+            "acc_stderr": 0.039777499346220734,
+            "acc_norm": 0.6052631578947368,
+            "acc_norm_stderr": 0.039777499346220734
+        },
+        "harness|mmlu_professional_psychology|5": {
+            "acc": 0.5392156862745098,
+            "acc_stderr": 0.02016552331390791,
+            "acc_norm": 0.5392156862745098,
+            "acc_norm_stderr": 0.02016552331390791
+        },
+        "harness|mmlu_professional_accounting|5": {
+            "acc": 0.35815602836879434,
+            "acc_stderr": 0.02860208586275942,
+            "acc_norm": 0.35815602836879434,
+            "acc_norm_stderr": 0.02860208586275942
+        },
+        "harness|mmlu_machine_learning|5": {
+            "acc": 0.4107142857142857,
+            "acc_stderr": 0.04669510663875192,
+            "acc_norm": 0.4107142857142857,
+            "acc_norm_stderr": 0.04669510663875192
+        },
+        "harness|mmlu_high_school_statistics|5": {
+            "acc": 0.44907407407407407,
+            "acc_stderr": 0.03392238405321617,
+            "acc_norm": 0.44907407407407407,
+            "acc_norm_stderr": 0.03392238405321617
+        },
+        "harness|mmlu_moral_scenarios|5": {
+            "acc": 0.3452513966480447,
+            "acc_stderr": 0.015901432608930354,
+            "acc_norm": 0.3452513966480447,
+            "acc_norm_stderr": 0.015901432608930354
+        },
+        "harness|mmlu_college_computer_science|5": {
+            "acc": 0.43,
+            "acc_stderr": 0.049756985195624284,
+            "acc_norm": 0.43,
+            "acc_norm_stderr": 0.049756985195624284
+        },
+        "harness|mmlu_high_school_computer_science|5": {
+            "acc": 0.66,
+            "acc_stderr": 0.04760952285695238,
+            "acc_norm": 0.66,
+            "acc_norm_stderr": 0.04760952285695238
+        },
+        "harness|mmlu_professional_medicine|5": {
+            "acc": 0.45588235294117646,
+            "acc_stderr": 0.030254372573976694,
+            "acc_norm": 0.45588235294117646,
+            "acc_norm_stderr": 0.030254372573976694
+        },
+        "harness|mmlu_security_studies|5": {
+            "acc": 0.6204081632653061,
+            "acc_stderr": 0.031067211262872457,
+            "acc_norm": 0.6204081632653061,
+            "acc_norm_stderr": 0.031067211262872457
+        },
+        "harness|mmlu_high_school_world_history|5": {
+            "acc": 0.6582278481012658,
+            "acc_stderr": 0.030874537537553617,
+            "acc_norm": 0.6582278481012658,
+            "acc_norm_stderr": 0.030874537537553617
+        },
+        "harness|mmlu_professional_law|5": {
+            "acc": 0.4152542372881356,
+            "acc_stderr": 0.012585471793400667,
+            "acc_norm": 0.4152542372881356,
+            "acc_norm_stderr": 0.012585471793400667
+        },
+        "harness|mmlu_high_school_us_history|5": {
+            "acc": 0.5343137254901961,
+            "acc_stderr": 0.03501038327635896,
+            "acc_norm": 0.5343137254901961,
+            "acc_norm_stderr": 0.03501038327635896
+        },
+        "harness|mmlu_high_school_european_history|5": {
+            "acc": 0.5454545454545454,
+            "acc_stderr": 0.038881769216741004,
+            "acc_norm": 0.5454545454545454,
+            "acc_norm_stderr": 0.038881769216741004
+        },
+        "harness|truthfulqa_mc|0": {
+            "mc1": 0.4663402692778458,
+            "mc1_stderr": 0.01746379386716811,
+            "mc2": NaN,
+            "mc2_stderr": NaN
+        },
+        "harness|commongen_v2|2": {
+            "acc": 0.44037780401416765,
+            "acc_stderr": 0.01706769977431298,
+            "acc_norm": 0.44510035419126326,
+            "acc_norm_stderr": 0.01708641743100547
+        }
+    },
+    "versions": {
+        "all": 0,
+        "harness|arc_challenge|25": 0,
+        "harness|hellaswag|10": 0,
+        "harness|mmlu_world_religions|5": 1,
+        "harness|mmlu_management|5": 1,
+        "harness|mmlu_miscellaneous|5": 1,
+        "harness|mmlu_anatomy|5": 1,
+        "harness|mmlu_abstract_algebra|5": 1,
+        "harness|mmlu_conceptual_physics|5": 1,
+        "harness|mmlu_virology|5": 1,
+        "harness|mmlu_philosophy|5": 1,
+        "harness|mmlu_human_aging|5": 1,
+        "harness|mmlu_human_sexuality|5": 1,
+        "harness|mmlu_medical_genetics|5": 1,
+        "harness|mmlu_high_school_geography|5": 1,
+        "harness|mmlu_electrical_engineering|5": 1,
+        "harness|mmlu_college_physics|5": 1,
+        "harness|mmlu_high_school_microeconomics|5": 1,
+        "harness|mmlu_high_school_macroeconomics|5": 1,
+        "harness|mmlu_computer_security|5": 1,
+        "harness|mmlu_global_facts|5": 1,
+        "harness|mmlu_jurisprudence|5": 1,
+        "harness|mmlu_high_school_chemistry|5": 1,
+        "harness|mmlu_high_school_biology|5": 1,
+        "harness|mmlu_marketing|5": 1,
+        "harness|mmlu_clinical_knowledge|5": 1,
+        "harness|mmlu_public_relations|5": 1,
+        "harness|mmlu_high_school_mathematics|5": 1,
+        "harness|mmlu_high_school_physics|5": 1,
+        "harness|mmlu_sociology|5": 1,
+        "harness|mmlu_college_medicine|5": 1,
+        "harness|mmlu_elementary_mathematics|5": 1,
+        "harness|mmlu_college_biology|5": 1,
+        "harness|mmlu_college_chemistry|5": 1,
+        "harness|mmlu_us_foreign_policy|5": 1,
+        "harness|mmlu_moral_disputes|5": 1,
+        "harness|mmlu_logical_fallacies|5": 1,
+        "harness|mmlu_prehistory|5": 1,
+        "harness|mmlu_college_mathematics|5": 1,
+        "harness|mmlu_high_school_government_and_politics|5": 1,
+        "harness|mmlu_econometrics|5": 1,
+        "harness|mmlu_high_school_psychology|5": 1,
+        "harness|mmlu_formal_logic|5": 1,
+        "harness|mmlu_nutrition|5": 1,
+        "harness|mmlu_business_ethics|5": 1,
+        "harness|mmlu_international_law|5": 1,
+        "harness|mmlu_astronomy|5": 1,
+        "harness|mmlu_professional_psychology|5": 1,
+        "harness|mmlu_professional_accounting|5": 1,
+        "harness|mmlu_machine_learning|5": 1,
+        "harness|mmlu_high_school_statistics|5": 1,
+        "harness|mmlu_moral_scenarios|5": 1,
+        "harness|mmlu_college_computer_science|5": 1,
+        "harness|mmlu_high_school_computer_science|5": 1,
+        "harness|mmlu_professional_medicine|5": 1,
+        "harness|mmlu_security_studies|5": 1,
+        "harness|mmlu_high_school_world_history|5": 1,
+        "harness|mmlu_professional_law|5": 1,
+        "harness|mmlu_high_school_us_history|5": 1,
+        "harness|mmlu_high_school_european_history|5": 1,
+        "harness|truthfulqa_mc|0": 0,
+        "harness|commongen_v2|2": 1
+    },
+    "config_general": {
+        "model_name": "x2bee/POLAR-14B-DPO-v1.4",
+        "model_sha": "a6e64075fafaa3d5e393ff89c3cb26f9615e6de9",
+        "model_dtype": "torch.float16",
+        "lighteval_sha": "",
+        "num_few_shot_default": 0,
+        "num_fewshot_seeds": 1,
+        "override_batch_size": 1,
+        "max_samples": null
+    }
+}
eval-results/x2bee/POLAR-14B-HES-DPO-v1.5/result.json
ADDED
@@ -0,0 +1,450 @@
+{
+    "results": {
+        "daily": { "daily": 5 },
+        "quarterly": { "quarterly": 5 },
+        "harness|arc_challenge|25": { "acc": 0.6638225255972696, "acc_stderr": 0.013804855026205756, "acc_norm": 0.7278156996587031, "acc_norm_stderr": 0.013006600406423709 },
+        "harness|hellaswag|10": { "acc": 0.45648277235610435, "acc_stderr": 0.004970846697552306, "acc_norm": 0.6349332802230632, "acc_norm_stderr": 0.004804649197163697 },
+        "harness|mmlu_world_religions|5": { "acc": 0.7309941520467836, "acc_stderr": 0.0340105262010409, "acc_norm": 0.7309941520467836, "acc_norm_stderr": 0.0340105262010409 },
+        "harness|mmlu_management|5": { "acc": 0.7766990291262136, "acc_stderr": 0.04123553189891431, "acc_norm": 0.7766990291262136, "acc_norm_stderr": 0.04123553189891431 },
+        "harness|mmlu_miscellaneous|5": { "acc": 0.7343550446998723, "acc_stderr": 0.01579430248788872, "acc_norm": 0.7343550446998723, "acc_norm_stderr": 0.01579430248788872 },
+        "harness|mmlu_anatomy|5": { "acc": 0.45185185185185184, "acc_stderr": 0.04299268905480863, "acc_norm": 0.45185185185185184, "acc_norm_stderr": 0.04299268905480863 },
+        "harness|mmlu_abstract_algebra|5": { "acc": 0.35, "acc_stderr": 0.04793724854411019, "acc_norm": 0.35, "acc_norm_stderr": 0.04793724854411019 },
+        "harness|mmlu_conceptual_physics|5": { "acc": 0.5276595744680851, "acc_stderr": 0.03263597118409769, "acc_norm": 0.5276595744680851, "acc_norm_stderr": 0.03263597118409769 },
+        "harness|mmlu_virology|5": { "acc": 0.4759036144578313, "acc_stderr": 0.03887971849597264, "acc_norm": 0.4759036144578313, "acc_norm_stderr": 0.03887971849597264 },
+        "harness|mmlu_philosophy|5": { "acc": 0.6559485530546624, "acc_stderr": 0.026981478043648043, "acc_norm": 0.6559485530546624, "acc_norm_stderr": 0.026981478043648043 },
+        "harness|mmlu_human_aging|5": { "acc": 0.6412556053811659, "acc_stderr": 0.032190792004199956, "acc_norm": 0.6412556053811659, "acc_norm_stderr": 0.032190792004199956 },
+        "harness|mmlu_human_sexuality|5": { "acc": 0.648854961832061, "acc_stderr": 0.04186445163013751, "acc_norm": 0.648854961832061, "acc_norm_stderr": 0.04186445163013751 },
+        "harness|mmlu_medical_genetics|5": { "acc": 0.54, "acc_stderr": 0.05009082659620333, "acc_norm": 0.54, "acc_norm_stderr": 0.05009082659620333 },
+        "harness|mmlu_high_school_geography|5": { "acc": 0.7777777777777778, "acc_stderr": 0.029620227874790465, "acc_norm": 0.7777777777777778, "acc_norm_stderr": 0.029620227874790465 },
+        "harness|mmlu_electrical_engineering|5": { "acc": 0.5103448275862069, "acc_stderr": 0.04165774775728762, "acc_norm": 0.5103448275862069, "acc_norm_stderr": 0.04165774775728762 },
+        "harness|mmlu_college_physics|5": { "acc": 0.3627450980392157, "acc_stderr": 0.04784060704105655, "acc_norm": 0.3627450980392157, "acc_norm_stderr": 0.04784060704105655 },
+        "harness|mmlu_high_school_microeconomics|5": { "acc": 0.6680672268907563, "acc_stderr": 0.03058869701378364, "acc_norm": 0.6680672268907563, "acc_norm_stderr": 0.03058869701378364 },
+        "harness|mmlu_high_school_macroeconomics|5": { "acc": 0.6384615384615384, "acc_stderr": 0.024359581465397, "acc_norm": 0.6384615384615384, "acc_norm_stderr": 0.024359581465397 },
+        "harness|mmlu_computer_security|5": { "acc": 0.65, "acc_stderr": 0.04793724854411021, "acc_norm": 0.65, "acc_norm_stderr": 0.04793724854411021 },
+        "harness|mmlu_global_facts|5": { "acc": 0.37, "acc_stderr": 0.04852365870939099, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939099 },
+        "harness|mmlu_jurisprudence|5": { "acc": 0.6851851851851852, "acc_stderr": 0.04489931073591312, "acc_norm": 0.6851851851851852, "acc_norm_stderr": 0.04489931073591312 },
+        "harness|mmlu_high_school_chemistry|5": { "acc": 0.46798029556650245, "acc_stderr": 0.035107665979592154, "acc_norm": 0.46798029556650245, "acc_norm_stderr": 0.035107665979592154 },
+        "harness|mmlu_high_school_biology|5": { "acc": 0.6548387096774193, "acc_stderr": 0.02704574657353432, "acc_norm": 0.6548387096774193, "acc_norm_stderr": 0.02704574657353432 },
+        "harness|mmlu_marketing|5": { "acc": 0.8162393162393162, "acc_stderr": 0.025372139671722933, "acc_norm": 0.8162393162393162, "acc_norm_stderr": 0.025372139671722933 },
+        "harness|mmlu_clinical_knowledge|5": { "acc": 0.5773584905660377, "acc_stderr": 0.03040233144576954, "acc_norm": 0.5773584905660377, "acc_norm_stderr": 0.03040233144576954 },
+        "harness|mmlu_public_relations|5": { "acc": 0.6454545454545455, "acc_stderr": 0.045820048415054174, "acc_norm": 0.6454545454545455, "acc_norm_stderr": 0.045820048415054174 },
+        "harness|mmlu_high_school_mathematics|5": { "acc": 0.4074074074074074, "acc_stderr": 0.029958249250082118, "acc_norm": 0.4074074074074074, "acc_norm_stderr": 0.029958249250082118 },
+        "harness|mmlu_high_school_physics|5": { "acc": 0.3509933774834437, "acc_stderr": 0.03896981964257375, "acc_norm": 0.3509933774834437, "acc_norm_stderr": 0.03896981964257375 },
+        "harness|mmlu_sociology|5": { "acc": 0.7263681592039801, "acc_stderr": 0.03152439186555404, "acc_norm": 0.7263681592039801, "acc_norm_stderr": 0.03152439186555404 },
+        "harness|mmlu_college_medicine|5": { "acc": 0.5375722543352601, "acc_stderr": 0.0380168510452446, "acc_norm": 0.5375722543352601, "acc_norm_stderr": 0.0380168510452446 },
+        "harness|mmlu_elementary_mathematics|5": { "acc": 0.4365079365079365, "acc_stderr": 0.025542846817400496, "acc_norm": 0.4365079365079365, "acc_norm_stderr": 0.025542846817400496 },
+        "harness|mmlu_college_biology|5": { "acc": 0.5694444444444444, "acc_stderr": 0.04140685639111503, "acc_norm": 0.5694444444444444, "acc_norm_stderr": 0.04140685639111503 },
+        "harness|mmlu_college_chemistry|5": { "acc": 0.43, "acc_stderr": 0.049756985195624284, "acc_norm": 0.43, "acc_norm_stderr": 0.049756985195624284 },
+        "harness|mmlu_us_foreign_policy|5": { "acc": 0.78, "acc_stderr": 0.04163331998932263, "acc_norm": 0.78, "acc_norm_stderr": 0.04163331998932263 },
+        "harness|mmlu_moral_disputes|5": { "acc": 0.6098265895953757, "acc_stderr": 0.026261677607806642, "acc_norm": 0.6098265895953757, "acc_norm_stderr": 0.026261677607806642 },
+        "harness|mmlu_logical_fallacies|5": { "acc": 0.656441717791411, "acc_stderr": 0.03731133519673893, "acc_norm": 0.656441717791411, "acc_norm_stderr": 0.03731133519673893 },
+        "harness|mmlu_prehistory|5": { "acc": 0.6574074074074074, "acc_stderr": 0.02640614597362568, "acc_norm": 0.6574074074074074, "acc_norm_stderr": 0.02640614597362568 },
+        "harness|mmlu_college_mathematics|5": { "acc": 0.38, "acc_stderr": 0.04878317312145632, "acc_norm": 0.38, "acc_norm_stderr": 0.04878317312145632 },
+        "harness|mmlu_high_school_government_and_politics|5": { "acc": 0.7668393782383419, "acc_stderr": 0.03051611137147601, "acc_norm": 0.7668393782383419, "acc_norm_stderr": 0.03051611137147601 },
+        "harness|mmlu_econometrics|5": { "acc": 0.45614035087719296, "acc_stderr": 0.046854730419077895, "acc_norm": 0.45614035087719296, "acc_norm_stderr": 0.046854730419077895 },
+        "harness|mmlu_high_school_psychology|5": { "acc": 0.7853211009174312, "acc_stderr": 0.017604304149256494, "acc_norm": 0.7853211009174312, "acc_norm_stderr": 0.017604304149256494 },
+        "harness|mmlu_formal_logic|5": { "acc": 0.4523809523809524, "acc_stderr": 0.044518079590553275, "acc_norm": 0.4523809523809524, "acc_norm_stderr": 0.044518079590553275 },
+        "harness|mmlu_nutrition|5": { "acc": 0.6405228758169934, "acc_stderr": 0.027475969910660952, "acc_norm": 0.6405228758169934, "acc_norm_stderr": 0.027475969910660952 },
+        "harness|mmlu_business_ethics|5": { "acc": 0.66, "acc_stderr": 0.04760952285695237, "acc_norm": 0.66, "acc_norm_stderr": 0.04760952285695237 },
+        "harness|mmlu_international_law|5": { "acc": 0.7933884297520661, "acc_stderr": 0.03695980128098824, "acc_norm": 0.7933884297520661, "acc_norm_stderr": 0.03695980128098824 },
+        "harness|mmlu_astronomy|5": { "acc": 0.6842105263157895, "acc_stderr": 0.0378272898086547, "acc_norm": 0.6842105263157895, "acc_norm_stderr": 0.0378272898086547 },
+        "harness|mmlu_professional_psychology|5": { "acc": 0.5964052287581699, "acc_stderr": 0.019848280168401164, "acc_norm": 0.5964052287581699, "acc_norm_stderr": 0.019848280168401164 },
+        "harness|mmlu_professional_accounting|5": { "acc": 0.4397163120567376, "acc_stderr": 0.02960991207559411, "acc_norm": 0.4397163120567376, "acc_norm_stderr": 0.02960991207559411 },
+        "harness|mmlu_machine_learning|5": { "acc": 0.39285714285714285, "acc_stderr": 0.04635550135609976, "acc_norm": 0.39285714285714285, "acc_norm_stderr": 0.04635550135609976 },
+        "harness|mmlu_high_school_statistics|5": { "acc": 0.5787037037037037, "acc_stderr": 0.03367462138896078, "acc_norm": 0.5787037037037037, "acc_norm_stderr": 0.03367462138896078 },
+        "harness|mmlu_moral_scenarios|5": { "acc": 0.264804469273743, "acc_stderr": 0.01475690648326066, "acc_norm": 0.264804469273743, "acc_norm_stderr": 0.01475690648326066 },
+        "harness|mmlu_college_computer_science|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 },
+        "harness|mmlu_high_school_computer_science|5": { "acc": 0.7, "acc_stderr": 0.04605661864718381, "acc_norm": 0.7, "acc_norm_stderr": 0.04605661864718381 },
+        "harness|mmlu_professional_medicine|5": { "acc": 0.5588235294117647, "acc_stderr": 0.03016191193076711, "acc_norm": 0.5588235294117647, "acc_norm_stderr": 0.03016191193076711 },
+        "harness|mmlu_security_studies|5": { "acc": 0.6448979591836734, "acc_stderr": 0.030635655150387634, "acc_norm": 0.6448979591836734, "acc_norm_stderr": 0.030635655150387634 },
+        "harness|mmlu_high_school_world_history|5": { "acc": 0.7426160337552743, "acc_stderr": 0.028458820991460302, "acc_norm": 0.7426160337552743, "acc_norm_stderr": 0.028458820991460302 },
+        "harness|mmlu_professional_law|5": { "acc": 0.44654498044328556, "acc_stderr": 0.012697046024399661, "acc_norm": 0.44654498044328556, "acc_norm_stderr": 0.012697046024399661 },
+        "harness|mmlu_high_school_us_history|5": { "acc": 0.6225490196078431, "acc_stderr": 0.03402272044340703, "acc_norm": 0.6225490196078431, "acc_norm_stderr": 0.03402272044340703 },
+        "harness|mmlu_high_school_european_history|5": { "acc": 0.6303030303030303, "acc_stderr": 0.03769430314512569, "acc_norm": 0.6303030303030303, "acc_norm_stderr": 0.03769430314512569 },
+        "harness|truthfulqa_mc|0": { "mc1": 0.6634026927784578, "mc1_stderr": 0.0165424128094949, "mc2": 0.7515104740134964, "mc2_stderr": 0.014200593490054807 },
+        "harness|commongen_v2|2": { "acc": 0.5147579693034239, "acc_stderr": 0.01718286443499856, "acc_norm": 0.526564344746163, "acc_norm_stderr": 0.017166075717577747 }
+    },
+    "versions": {
+        "all": 0, "harness|arc_challenge|25": 0, "harness|hellaswag|10": 0,
+        "harness|mmlu_world_religions|5": 1, "harness|mmlu_management|5": 1, "harness|mmlu_miscellaneous|5": 1,
+        "harness|mmlu_anatomy|5": 1, "harness|mmlu_abstract_algebra|5": 1, "harness|mmlu_conceptual_physics|5": 1,
+        "harness|mmlu_virology|5": 1, "harness|mmlu_philosophy|5": 1, "harness|mmlu_human_aging|5": 1,
+        "harness|mmlu_human_sexuality|5": 1, "harness|mmlu_medical_genetics|5": 1, "harness|mmlu_high_school_geography|5": 1,
+        "harness|mmlu_electrical_engineering|5": 1, "harness|mmlu_college_physics|5": 1, "harness|mmlu_high_school_microeconomics|5": 1,
+        "harness|mmlu_high_school_macroeconomics|5": 1, "harness|mmlu_computer_security|5": 1, "harness|mmlu_global_facts|5": 1,
+        "harness|mmlu_jurisprudence|5": 1, "harness|mmlu_high_school_chemistry|5": 1, "harness|mmlu_high_school_biology|5": 1,
+        "harness|mmlu_marketing|5": 1, "harness|mmlu_clinical_knowledge|5": 1, "harness|mmlu_public_relations|5": 1,
+        "harness|mmlu_high_school_mathematics|5": 1, "harness|mmlu_high_school_physics|5": 1, "harness|mmlu_sociology|5": 1,
+        "harness|mmlu_college_medicine|5": 1, "harness|mmlu_elementary_mathematics|5": 1, "harness|mmlu_college_biology|5": 1,
+        "harness|mmlu_college_chemistry|5": 1, "harness|mmlu_us_foreign_policy|5": 1, "harness|mmlu_moral_disputes|5": 1,
+        "harness|mmlu_logical_fallacies|5": 1, "harness|mmlu_prehistory|5": 1, "harness|mmlu_college_mathematics|5": 1,
+        "harness|mmlu_high_school_government_and_politics|5": 1, "harness|mmlu_econometrics|5": 1, "harness|mmlu_high_school_psychology|5": 1,
+        "harness|mmlu_formal_logic|5": 1, "harness|mmlu_nutrition|5": 1, "harness|mmlu_business_ethics|5": 1,
+        "harness|mmlu_international_law|5": 1, "harness|mmlu_astronomy|5": 1, "harness|mmlu_professional_psychology|5": 1,
+        "harness|mmlu_professional_accounting|5": 1, "harness|mmlu_machine_learning|5": 1, "harness|mmlu_high_school_statistics|5": 1,
+        "harness|mmlu_moral_scenarios|5": 1, "harness|mmlu_college_computer_science|5": 1, "harness|mmlu_high_school_computer_science|5": 1,
+        "harness|mmlu_professional_medicine|5": 1, "harness|mmlu_security_studies|5": 1, "harness|mmlu_high_school_world_history|5": 1,
+        "harness|mmlu_professional_law|5": 1, "harness|mmlu_high_school_us_history|5": 1, "harness|mmlu_high_school_european_history|5": 1,
+        "harness|truthfulqa_mc|0": 0, "harness|commongen_v2|2": 1
+    },
+    "config_general": {
+        "model_name": "x2bee/POLAR-14B-HES-DPO-v1.5",
+        "model_sha": "f0bc8e2566ba28c8232d7c690098e634ea894e8d",
+        "model_dtype": "torch.float16",
+        "lighteval_sha": "",
+        "num_few_shot_default": 0,
+        "num_fewshot_seeds": 1,
+        "override_batch_size": 1,
+        "max_samples": null
+    }
+}
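The `config_general` block pins the exact model revision and dtype that were evaluated, so each score can be tied back to one immutable Hub commit. A sketch of reloading that precise snapshot with standard `transformers` loading (an assumption on my part; the diff does not show the Space's own evaluation code):

```python
import torch
from transformers import AutoModelForCausalLM

# Load the exact snapshot recorded in config_general above: revision is the
# model_sha, and torch.float16 matches "model_dtype": "torch.float16".
model = AutoModelForCausalLM.from_pretrained(
    "x2bee/POLAR-14B-HES-DPO-v1.5",
    revision="f0bc8e2566ba28c8232d7c690098e634ea894e8d",
    torch_dtype=torch.float16,
)
```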
eval-results/x2bee/POLAR-14B-SON-SFT-v0.1/result.json
ADDED
@@ -0,0 +1,450 @@
+{
+    "results": {
+        "daily": { "daily": 3 },
+        "quarterly": { "quarterly": 3 },
+        "harness|arc_challenge|25": { "acc": 0.6646757679180887, "acc_stderr": 0.013796182947785564, "acc_norm": 0.7244027303754266, "acc_norm_stderr": 0.01305716965576184 },
+        "harness|hellaswag|10": { "acc": 0.46036646086436966, "acc_stderr": 0.004974080638364276, "acc_norm": 0.6195976897032464, "acc_norm_stderr": 0.004844935327599196 },
+        "harness|mmlu_world_religions|5": { "acc": 0.7602339181286549, "acc_stderr": 0.03274485211946956, "acc_norm": 0.7602339181286549, "acc_norm_stderr": 0.03274485211946956 },
+        "harness|mmlu_management|5": { "acc": 0.7766990291262136, "acc_stderr": 0.04123553189891431, "acc_norm": 0.7766990291262136, "acc_norm_stderr": 0.04123553189891431 },
+        "harness|mmlu_miscellaneous|5": { "acc": 0.7381864623243933, "acc_stderr": 0.01572083867844526, "acc_norm": 0.7381864623243933, "acc_norm_stderr": 0.01572083867844526 },
+        "harness|mmlu_anatomy|5": { "acc": 0.5037037037037037, "acc_stderr": 0.04319223625811331, "acc_norm": 0.5037037037037037, "acc_norm_stderr": 0.04319223625811331 },
+        "harness|mmlu_abstract_algebra|5": { "acc": 0.35, "acc_stderr": 0.04793724854411019, "acc_norm": 0.35, "acc_norm_stderr": 0.04793724854411019 },
+        "harness|mmlu_conceptual_physics|5": { "acc": 0.5404255319148936, "acc_stderr": 0.032579014820998335, "acc_norm": 0.5404255319148936, "acc_norm_stderr": 0.032579014820998335 },
+        "harness|mmlu_virology|5": { "acc": 0.5180722891566265, "acc_stderr": 0.038899512528272166, "acc_norm": 0.5180722891566265, "acc_norm_stderr": 0.038899512528272166 },
+        "harness|mmlu_philosophy|5": { "acc": 0.6559485530546624, "acc_stderr": 0.026981478043648043, "acc_norm": 0.6559485530546624, "acc_norm_stderr": 0.026981478043648043 },
+        "harness|mmlu_human_aging|5": { "acc": 0.6591928251121076, "acc_stderr": 0.0318114974705536, "acc_norm": 0.6591928251121076, "acc_norm_stderr": 0.0318114974705536 },
+        "harness|mmlu_human_sexuality|5": { "acc": 0.6564885496183206, "acc_stderr": 0.041649760719448786, "acc_norm": 0.6564885496183206, "acc_norm_stderr": 0.041649760719448786 },
+        "harness|mmlu_medical_genetics|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 },
+        "harness|mmlu_high_school_geography|5": { "acc": 0.7575757575757576, "acc_stderr": 0.030532892233932036, "acc_norm": 0.7575757575757576, "acc_norm_stderr": 0.030532892233932036 },
+        "harness|mmlu_electrical_engineering|5": { "acc": 0.5586206896551724, "acc_stderr": 0.04137931034482757, "acc_norm": 0.5586206896551724, "acc_norm_stderr": 0.04137931034482757 },
+        "harness|mmlu_college_physics|5": { "acc": 0.3137254901960784, "acc_stderr": 0.04617034827006717, "acc_norm": 0.3137254901960784, "acc_norm_stderr": 0.04617034827006717 },
+        "harness|mmlu_high_school_microeconomics|5": { "acc": 0.6512605042016807, "acc_stderr": 0.03095663632856655, "acc_norm": 0.6512605042016807, "acc_norm_stderr": 0.03095663632856655 },
+        "harness|mmlu_high_school_macroeconomics|5": { "acc": 0.6230769230769231, "acc_stderr": 0.024570975364225995, "acc_norm": 0.6230769230769231, "acc_norm_stderr": 0.024570975364225995 },
+        "harness|mmlu_computer_security|5": { "acc": 0.73, "acc_stderr": 0.04461960433384739, "acc_norm": 0.73, "acc_norm_stderr": 0.04461960433384739 },
+        "harness|mmlu_global_facts|5": { "acc": 0.32, "acc_stderr": 0.04688261722621505, "acc_norm": 0.32, "acc_norm_stderr": 0.04688261722621505 },
+        "harness|mmlu_jurisprudence|5": { "acc": 0.7037037037037037, "acc_stderr": 0.04414343666854933, "acc_norm": 0.7037037037037037, "acc_norm_stderr": 0.04414343666854933 },
+        "harness|mmlu_high_school_chemistry|5": { "acc": 0.4630541871921182, "acc_stderr": 0.035083705204426656, "acc_norm": 0.4630541871921182, "acc_norm_stderr": 0.035083705204426656 },
+        "harness|mmlu_high_school_biology|5": { "acc": 0.603225806451613, "acc_stderr": 0.027831231605767944, "acc_norm": 0.603225806451613, "acc_norm_stderr": 0.027831231605767944 },
+        "harness|mmlu_marketing|5": { "acc": 0.8205128205128205, "acc_stderr": 0.025140935950335435, "acc_norm": 0.8205128205128205, "acc_norm_stderr": 0.025140935950335435 },
+        "harness|mmlu_clinical_knowledge|5": { "acc": 0.5962264150943396, "acc_stderr": 0.03019761160019795, "acc_norm": 0.5962264150943396, "acc_norm_stderr": 0.03019761160019795 },
+        "harness|mmlu_public_relations|5": { "acc": 0.6181818181818182, "acc_stderr": 0.046534298079135075, "acc_norm": 0.6181818181818182, "acc_norm_stderr": 0.046534298079135075 },
+        "harness|mmlu_high_school_mathematics|5": { "acc": 0.37407407407407406, "acc_stderr": 0.029502861128955293, "acc_norm": 0.37407407407407406, "acc_norm_stderr": 0.029502861128955293 },
+        "harness|mmlu_high_school_physics|5": { "acc": 0.32450331125827814, "acc_stderr": 0.038227469376587525, "acc_norm": 0.32450331125827814, "acc_norm_stderr": 0.038227469376587525 },
+        "harness|mmlu_sociology|5": { "acc": 0.7164179104477612, "acc_stderr": 0.03187187537919796, "acc_norm": 0.7164179104477612, "acc_norm_stderr": 0.03187187537919796 },
+        "harness|mmlu_college_medicine|5": { "acc": 0.5375722543352601, "acc_stderr": 0.03801685104524458, "acc_norm": 0.5375722543352601, "acc_norm_stderr": 0.03801685104524458 },
+        "harness|mmlu_elementary_mathematics|5": { "acc": 0.42857142857142855, "acc_stderr": 0.025487187147859372, "acc_norm": 0.42857142857142855, "acc_norm_stderr": 0.025487187147859372 },
+        "harness|mmlu_college_biology|5": { "acc": 0.5902777777777778, "acc_stderr": 0.04112490974670787, "acc_norm": 0.5902777777777778, "acc_norm_stderr": 0.04112490974670787 },
+        "harness|mmlu_college_chemistry|5": { "acc": 0.45, "acc_stderr": 0.049999999999999996, "acc_norm": 0.45, "acc_norm_stderr": 0.049999999999999996 },
+        "harness|mmlu_us_foreign_policy|5": { "acc": 0.78, "acc_stderr": 0.04163331998932263, "acc_norm": 0.78, "acc_norm_stderr": 0.04163331998932263 },
+        "harness|mmlu_moral_disputes|5": { "acc": 0.6184971098265896, "acc_stderr": 0.026152198619726803, "acc_norm": 0.6184971098265896, "acc_norm_stderr": 0.026152198619726803 },
+        "harness|mmlu_logical_fallacies|5": { "acc": 0.6441717791411042, "acc_stderr": 0.03761521380046734, "acc_norm": 0.6441717791411042, "acc_norm_stderr": 0.03761521380046734 },
+        "harness|mmlu_prehistory|5": { "acc": 0.6944444444444444, "acc_stderr": 0.025630824975621365, "acc_norm": 0.6944444444444444, "acc_norm_stderr": 0.025630824975621365 },
+        "harness|mmlu_college_mathematics|5": { "acc": 0.43, "acc_stderr": 0.049756985195624284, "acc_norm": 0.43, "acc_norm_stderr": 0.049756985195624284 },
+        "harness|mmlu_high_school_government_and_politics|5": { "acc": 0.7927461139896373, "acc_stderr": 0.029252823291803638, "acc_norm": 0.7927461139896373, "acc_norm_stderr": 0.029252823291803638 },
+        "harness|mmlu_econometrics|5": { "acc": 0.43859649122807015, "acc_stderr": 0.04668000738510455, "acc_norm": 0.43859649122807015, "acc_norm_stderr": 0.04668000738510455 },
+        "harness|mmlu_high_school_psychology|5": { "acc": 0.7853211009174312, "acc_stderr": 0.017604304149256494, "acc_norm": 0.7853211009174312, "acc_norm_stderr": 0.017604304149256494 },
+        "harness|mmlu_formal_logic|5": { "acc": 0.3968253968253968, "acc_stderr": 0.04375888492727062, "acc_norm": 0.3968253968253968, "acc_norm_stderr": 0.04375888492727062 },
+        "harness|mmlu_nutrition|5": { "acc": 0.6437908496732027, "acc_stderr": 0.027420477662629245, "acc_norm": 0.6437908496732027, "acc_norm_stderr": 0.027420477662629245 },
+        "harness|mmlu_business_ethics|5": { "acc": 0.63, "acc_stderr": 0.04852365870939099, "acc_norm": 0.63, "acc_norm_stderr": 0.04852365870939099 },
+        "harness|mmlu_international_law|5": { "acc": 0.7603305785123967, "acc_stderr": 0.03896878985070415, "acc_norm": 0.7603305785123967, "acc_norm_stderr": 0.03896878985070415 },
+        "harness|mmlu_astronomy|5": { "acc": 0.625, "acc_stderr": 0.039397364351956274, "acc_norm": 0.625, "acc_norm_stderr": 0.039397364351956274 },
+        "harness|mmlu_professional_psychology|5": { "acc": 0.619281045751634, "acc_stderr": 0.019643801557924806, "acc_norm": 0.619281045751634, "acc_norm_stderr": 0.019643801557924806 },
+        "harness|mmlu_professional_accounting|5": { "acc": 0.45390070921985815, "acc_stderr": 0.029700453247291467, "acc_norm": 0.45390070921985815, "acc_norm_stderr": 0.029700453247291467 },
+        "harness|mmlu_machine_learning|5": { "acc": 0.41964285714285715, "acc_stderr": 0.04684099321077106, "acc_norm": 0.41964285714285715, "acc_norm_stderr": 0.04684099321077106 },
+        "harness|mmlu_high_school_statistics|5": { "acc": 0.5555555555555556, "acc_stderr": 0.03388857118502326, "acc_norm": 0.5555555555555556, "acc_norm_stderr": 0.03388857118502326 },
+        "harness|mmlu_moral_scenarios|5": { "acc": 0.3575418994413408, "acc_stderr": 0.016029394474894893, "acc_norm": 0.3575418994413408, "acc_norm_stderr": 0.016029394474894893 },
+        "harness|mmlu_college_computer_science|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 },
+        "harness|mmlu_high_school_computer_science|5": { "acc": 0.75, "acc_stderr": 0.04351941398892446, "acc_norm": 0.75, "acc_norm_stderr": 0.04351941398892446 },
+        "harness|mmlu_professional_medicine|5": { "acc": 0.5735294117647058, "acc_stderr": 0.03004261583271486, "acc_norm": 0.5735294117647058, "acc_norm_stderr": 0.03004261583271486 },
+        "harness|mmlu_security_studies|5": { "acc": 0.6816326530612244, "acc_stderr": 0.02982253379398204, "acc_norm": 0.6816326530612244, "acc_norm_stderr": 0.02982253379398204 },
+        "harness|mmlu_high_school_world_history|5": { "acc": 0.7468354430379747, "acc_stderr": 0.028304657943035293, "acc_norm": 0.7468354430379747, "acc_norm_stderr": 0.028304657943035293 },
+        "harness|mmlu_professional_law|5": { "acc": 0.455019556714472, "acc_stderr": 0.012718456618701789, "acc_norm": 0.455019556714472, "acc_norm_stderr": 0.012718456618701789 },
+        "harness|mmlu_high_school_us_history|5": { "acc": 0.6666666666666666, "acc_stderr": 0.033086111132364364, "acc_norm": 0.6666666666666666, "acc_norm_stderr": 0.033086111132364364 },
+        "harness|mmlu_high_school_european_history|5": { "acc": 0.6484848484848484, "acc_stderr": 0.037282069986826503, "acc_norm": 0.6484848484848484, "acc_norm_stderr": 0.037282069986826503 },
+        "harness|truthfulqa_mc|0": { "mc1": 0.605875152998776, "mc1_stderr": 0.017106588140700332, "mc2": 0.7254831072808595, "mc2_stderr": 0.014162522228042162 },
+        "harness|commongen_v2|2": { "acc": 0.5926800472255017, "acc_stderr": 0.01689245669519127, "acc_norm": 0.6269185360094451, "acc_norm_stderr": 0.016627318275137453 }
+    },
+    "versions": {
+        "all": 0, "harness|arc_challenge|25": 0, "harness|hellaswag|10": 0,
+        "harness|mmlu_world_religions|5": 1, "harness|mmlu_management|5": 1, "harness|mmlu_miscellaneous|5": 1,
+        "harness|mmlu_anatomy|5": 1, "harness|mmlu_abstract_algebra|5": 1, "harness|mmlu_conceptual_physics|5": 1,
+        "harness|mmlu_virology|5": 1, "harness|mmlu_philosophy|5": 1, "harness|mmlu_human_aging|5": 1,
+        "harness|mmlu_human_sexuality|5": 1, "harness|mmlu_medical_genetics|5": 1, "harness|mmlu_high_school_geography|5": 1,
+        "harness|mmlu_electrical_engineering|5": 1, "harness|mmlu_college_physics|5": 1, "harness|mmlu_high_school_microeconomics|5": 1,
+        "harness|mmlu_high_school_macroeconomics|5": 1, "harness|mmlu_computer_security|5": 1, "harness|mmlu_global_facts|5": 1,
+        "harness|mmlu_jurisprudence|5": 1, "harness|mmlu_high_school_chemistry|5": 1, "harness|mmlu_high_school_biology|5": 1,
+        "harness|mmlu_marketing|5": 1, "harness|mmlu_clinical_knowledge|5": 1, "harness|mmlu_public_relations|5": 1,
+        "harness|mmlu_high_school_mathematics|5": 1, "harness|mmlu_high_school_physics|5": 1, "harness|mmlu_sociology|5": 1,
+        "harness|mmlu_college_medicine|5": 1, "harness|mmlu_elementary_mathematics|5": 1, "harness|mmlu_college_biology|5": 1,
+        "harness|mmlu_college_chemistry|5": 1, "harness|mmlu_us_foreign_policy|5": 1, "harness|mmlu_moral_disputes|5": 1,
+        "harness|mmlu_logical_fallacies|5": 1, "harness|mmlu_prehistory|5": 1, "harness|mmlu_college_mathematics|5": 1,
+        "harness|mmlu_high_school_government_and_politics|5": 1, "harness|mmlu_econometrics|5": 1, "harness|mmlu_high_school_psychology|5": 1,
+        "harness|mmlu_formal_logic|5": 1, "harness|mmlu_nutrition|5": 1, "harness|mmlu_business_ethics|5": 1,
+        "harness|mmlu_international_law|5": 1, "harness|mmlu_astronomy|5": 1, "harness|mmlu_professional_psychology|5": 1,
+        "harness|mmlu_professional_accounting|5": 1, "harness|mmlu_machine_learning|5": 1, "harness|mmlu_high_school_statistics|5": 1,
+        "harness|mmlu_moral_scenarios|5": 1, "harness|mmlu_college_computer_science|5": 1, "harness|mmlu_high_school_computer_science|5": 1,
+        "harness|mmlu_professional_medicine|5": 1, "harness|mmlu_security_studies|5": 1, "harness|mmlu_high_school_world_history|5": 1,
+        "harness|mmlu_professional_law|5": 1, "harness|mmlu_high_school_us_history|5": 1, "harness|mmlu_high_school_european_history|5": 1,
+        "harness|truthfulqa_mc|0": 0, "harness|commongen_v2|2": 1
+    },
+    "config_general": {
+        "model_name": "x2bee/POLAR-14B-SON-SFT-v0.1",
+        "model_sha": "01286a13088332c1eda4279b5bcfa7a0a33e145f",
+        "model_dtype": "torch.float16",
+        "lighteval_sha": "",
+        "num_few_shot_default": 0,
+        "num_fewshot_seeds": 1,
+        "override_batch_size": 1,
+        "max_samples": null
+    }
+}
eval-results/x2bee/POLAR-14B-v0.2/result.json
ADDED
@@ -0,0 +1,450 @@
+{
+    "results": {
+        "daily": { "daily": 2 },
+        "quarterly": { "quarterly": 2 },
+        "harness|arc_challenge|25": { "acc": 0.7465870307167235, "acc_stderr": 0.012710896778378602, "acc_norm": 0.7687713310580204, "acc_norm_stderr": 0.012320858834772264 },
+        "harness|hellaswag|10": { "acc": 0.681736705835491, "acc_stderr": 0.004648503177353952, "acc_norm": 0.7999402509460267, "acc_norm_stderr": 0.003992272261659531 },
+        "harness|mmlu_world_religions|5": { "acc": 0.6549707602339181, "acc_stderr": 0.036459813773888065, "acc_norm": 0.6549707602339181, "acc_norm_stderr": 0.036459813773888065 },
+        "harness|mmlu_management|5": { "acc": 0.7378640776699029, "acc_stderr": 0.043546310772605956, "acc_norm": 0.7378640776699029, "acc_norm_stderr": 0.043546310772605956 },
+        "harness|mmlu_miscellaneous|5": { "acc": 0.6922094508301405, "acc_stderr": 0.016506045045155633, "acc_norm": 0.6922094508301405, "acc_norm_stderr": 0.016506045045155633 },
+        "harness|mmlu_anatomy|5": { "acc": 0.4666666666666667, "acc_stderr": 0.043097329010363554, "acc_norm": 0.4666666666666667, "acc_norm_stderr": 0.043097329010363554 },
+        "harness|mmlu_abstract_algebra|5": { "acc": 0.35, "acc_stderr": 0.047937248544110196, "acc_norm": 0.35, "acc_norm_stderr": 0.047937248544110196 },
+        "harness|mmlu_conceptual_physics|5": { "acc": 0.4595744680851064, "acc_stderr": 0.03257901482099836, "acc_norm": 0.4595744680851064, "acc_norm_stderr": 0.03257901482099836 },
+        "harness|mmlu_virology|5": { "acc": 0.4879518072289157, "acc_stderr": 0.03891364495835821, "acc_norm": 0.4879518072289157, "acc_norm_stderr": 0.03891364495835821 },
+        "harness|mmlu_philosophy|5": { "acc": 0.6045016077170418, "acc_stderr": 0.027770918531427834, "acc_norm": 0.6045016077170418, "acc_norm_stderr": 0.027770918531427834 },
+        "harness|mmlu_human_aging|5": { "acc": 0.6233183856502242, "acc_stderr": 0.03252113489929188, "acc_norm": 0.6233183856502242, "acc_norm_stderr": 0.03252113489929188 },
+        "harness|mmlu_human_sexuality|5": { "acc": 0.6412213740458015, "acc_stderr": 0.04206739313864908, "acc_norm": 0.6412213740458015, "acc_norm_stderr": 0.04206739313864908 },
+        "harness|mmlu_medical_genetics|5": { "acc": 0.51, "acc_stderr": 0.05024183937956911, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956911 },
+        "harness|mmlu_high_school_geography|5": { "acc": 0.7222222222222222, "acc_stderr": 0.03191178226713547, "acc_norm": 0.7222222222222222, "acc_norm_stderr": 0.03191178226713547 },
+        "harness|mmlu_electrical_engineering|5": { "acc": 0.5241379310344828, "acc_stderr": 0.0416180850350153, "acc_norm": 0.5241379310344828, "acc_norm_stderr": 0.0416180850350153 },
+        "harness|mmlu_college_physics|5": { "acc": 0.3235294117647059, "acc_stderr": 0.046550104113196177, "acc_norm": 0.3235294117647059, "acc_norm_stderr": 0.046550104113196177 },
+        "harness|mmlu_high_school_microeconomics|5": { "acc": 0.6764705882352942, "acc_stderr": 0.030388353551886793, "acc_norm": 0.6764705882352942, "acc_norm_stderr": 0.030388353551886793 },
+        "harness|mmlu_high_school_macroeconomics|5": { "acc": 0.6384615384615384, "acc_stderr": 0.024359581465397, "acc_norm": 0.6384615384615384, "acc_norm_stderr": 0.024359581465397 },
+        "harness|mmlu_computer_security|5": { "acc": 0.65, "acc_stderr": 0.0479372485441102, "acc_norm": 0.65, "acc_norm_stderr": 0.0479372485441102 },
+        "harness|mmlu_global_facts|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 },
+        "harness|mmlu_jurisprudence|5": { "acc": 0.6296296296296297, "acc_stderr": 0.04668408033024931, "acc_norm": 0.6296296296296297, "acc_norm_stderr": 0.04668408033024931 },
+        "harness|mmlu_high_school_chemistry|5": { "acc": 0.4729064039408867, "acc_stderr": 0.03512819077876105, "acc_norm": 0.4729064039408867, "acc_norm_stderr": 0.03512819077876105 },
+        "harness|mmlu_high_school_biology|5": { "acc": 0.5709677419354838, "acc_stderr": 0.028156036538233193, "acc_norm": 0.5709677419354838, "acc_norm_stderr": 0.028156036538233193 },
+        "harness|mmlu_marketing|5": { "acc": 0.8034188034188035, "acc_stderr": 0.026035386098951292, "acc_norm": 0.8034188034188035, "acc_norm_stderr": 0.026035386098951292 },
+        "harness|mmlu_clinical_knowledge|5": { "acc": 0.5547169811320755, "acc_stderr": 0.030588052974270655, "acc_norm": 0.5547169811320755, "acc_norm_stderr": 0.030588052974270655 },
+        "harness|mmlu_public_relations|5": { "acc": 0.6363636363636364, "acc_stderr": 0.04607582090719976, "acc_norm": 0.6363636363636364, "acc_norm_stderr": 0.04607582090719976 },
+        "harness|mmlu_high_school_mathematics|5": { "acc": 0.3592592592592593, "acc_stderr": 0.029252905927251976, "acc_norm": 0.3592592592592593, "acc_norm_stderr": 0.029252905927251976 },
+        "harness|mmlu_high_school_physics|5": { "acc": 0.3576158940397351, "acc_stderr": 0.03913453431177258, "acc_norm": 0.3576158940397351, "acc_norm_stderr": 0.03913453431177258 },
+        "harness|mmlu_sociology|5": { "acc": 0.6268656716417911, "acc_stderr": 0.034198326081760065, "acc_norm": 0.6268656716417911, "acc_norm_stderr": 0.034198326081760065 },
+        "harness|mmlu_college_medicine|5": { "acc": 0.48554913294797686, "acc_stderr": 0.03810871630454764, "acc_norm": 0.48554913294797686, "acc_norm_stderr": 0.03810871630454764 },
+        "harness|mmlu_elementary_mathematics|5": { "acc": 0.4497354497354497, "acc_stderr": 0.025620857042936648, "acc_norm": 0.4497354497354497, "acc_norm_stderr": 0.025620857042936648 },
+        "harness|mmlu_college_biology|5": { "acc": 0.6041666666666666, "acc_stderr": 0.04089465449325582, "acc_norm": 0.6041666666666666, "acc_norm_stderr": 0.04089465449325582 },
+        "harness|mmlu_college_chemistry|5": { "acc": 0.32, "acc_stderr": 0.046882617226215034, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 },
+        "harness|mmlu_us_foreign_policy|5": { "acc": 0.71, "acc_stderr": 0.045604802157206824, "acc_norm": 0.71, "acc_norm_stderr": 0.045604802157206824 },
+        "harness|mmlu_moral_disputes|5": { "acc": 0.5664739884393064, "acc_stderr": 0.026680134761679217, "acc_norm": 0.5664739884393064, "acc_norm_stderr": 0.026680134761679217 },
+        "harness|mmlu_logical_fallacies|5": { "acc": 0.6196319018404908, "acc_stderr": 0.038142698932618374, "acc_norm": 0.6196319018404908, "acc_norm_stderr": 0.038142698932618374 },
+        "harness|mmlu_prehistory|5": { "acc": 0.6574074074074074, "acc_stderr": 0.026406145973625686, "acc_norm": 0.6574074074074074, "acc_norm_stderr": 0.026406145973625686 },
+        "harness|mmlu_college_mathematics|5": { "acc": 0.37, "acc_stderr": 0.04852365870939098, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939098 },
+        "harness|mmlu_high_school_government_and_politics|5": { "acc": 0.7616580310880829, "acc_stderr": 0.030748905363909895, "acc_norm": 0.7616580310880829, "acc_norm_stderr": 0.030748905363909895 },
+        "harness|mmlu_econometrics|5": { "acc": 0.5, "acc_stderr": 0.047036043419179864, "acc_norm": 0.5, "acc_norm_stderr": 0.047036043419179864 },
+        "harness|mmlu_high_school_psychology|5": { "acc": 0.7211009174311926, "acc_stderr": 0.01922746887646353, "acc_norm": 0.7211009174311926, "acc_norm_stderr": 0.01922746887646353 },
+        "harness|mmlu_formal_logic|5": { "acc": 0.42857142857142855, "acc_stderr": 0.0442626668137991, "acc_norm": 0.42857142857142855, "acc_norm_stderr": 0.0442626668137991 },
+        "harness|mmlu_nutrition|5": { "acc": 0.5816993464052288, "acc_stderr": 0.0282451340243873, "acc_norm": 0.5816993464052288, "acc_norm_stderr": 0.0282451340243873 },
+        "harness|mmlu_business_ethics|5": { "acc": 0.73, "acc_stderr": 0.044619604333847394, "acc_norm": 0.73, "acc_norm_stderr": 0.044619604333847394 },
+        "harness|mmlu_international_law|5": { "acc": 0.7107438016528925, "acc_stderr": 0.041391127276354626, "acc_norm": 0.7107438016528925, "acc_norm_stderr": 0.041391127276354626 },
+        "harness|mmlu_astronomy|5": { "acc": 0.6513157894736842, "acc_stderr": 0.038781398887976104, "acc_norm": 0.6513157894736842, "acc_norm_stderr": 0.038781398887976104 },
+        "harness|mmlu_professional_psychology|5": { "acc": 0.5686274509803921, "acc_stderr": 0.020036393768352624, "acc_norm": 0.5686274509803921, "acc_norm_stderr": 0.020036393768352624 },
+        "harness|mmlu_professional_accounting|5": { "acc": 0.45390070921985815, "acc_stderr": 0.029700453247291477, "acc_norm": 0.45390070921985815, "acc_norm_stderr": 0.029700453247291477 },
+        "harness|mmlu_machine_learning|5": { "acc": 0.4642857142857143, "acc_stderr": 0.04733667890053756, "acc_norm": 0.4642857142857143, "acc_norm_stderr": 0.04733667890053756 },
+        "harness|mmlu_high_school_statistics|5": { "acc": 0.5092592592592593, "acc_stderr": 0.034093869469927006, "acc_norm": 0.5092592592592593, "acc_norm_stderr": 0.034093869469927006 },
+        "harness|mmlu_moral_scenarios|5": { "acc": 0.37206703910614525, "acc_stderr": 0.016165847583563295, "acc_norm": 0.37206703910614525, "acc_norm_stderr": 0.016165847583563295 },
+        "harness|mmlu_college_computer_science|5": { "acc": 0.43, "acc_stderr": 0.049756985195624284, "acc_norm": 0.43, "acc_norm_stderr": 0.049756985195624284 },
+        "harness|mmlu_high_school_computer_science|5": { "acc": 0.71, "acc_stderr": 0.045604802157206845, "acc_norm": 0.71, "acc_norm_stderr": 0.045604802157206845 },
+        "harness|mmlu_professional_medicine|5": { "acc": 0.5404411764705882, "acc_stderr": 0.030273325077345755, "acc_norm": 0.5404411764705882, "acc_norm_stderr": 0.030273325077345755 },
+        "harness|mmlu_security_studies|5": { "acc": 0.6122448979591837, "acc_stderr": 0.03119223072679566, "acc_norm": 0.6122448979591837, "acc_norm_stderr": 0.03119223072679566 },
+        "harness|mmlu_high_school_world_history|5": { "acc": 0.7257383966244726, "acc_stderr": 0.029041333510598025, "acc_norm": 0.7257383966244726, "acc_norm_stderr": 0.029041333510598025 },
+        "harness|mmlu_professional_law|5": { "acc": 0.4641460234680574, "acc_stderr": 0.01273736131873058, "acc_norm": 0.4641460234680574, "acc_norm_stderr": 0.01273736131873058 },
+        "harness|mmlu_high_school_us_history|5": { "acc": 0.6568627450980392, "acc_stderr": 0.03332139944668086, "acc_norm": 0.6568627450980392, "acc_norm_stderr": 0.03332139944668086 },
+        "harness|mmlu_high_school_european_history|5": { "acc": 0.6, "acc_stderr": 0.03825460278380025, "acc_norm": 0.6, "acc_norm_stderr": 0.03825460278380025 },
+        "harness|truthfulqa_mc|0": { "mc1": 0.7246022031823746, "mc1_stderr": 0.01563813566777552, "mc2": 0.8107575910195236, "mc2_stderr": 0.013335029489665237 },
+        "harness|commongen_v2|2": { "acc": 0.525383707201889, "acc_stderr": 0.017168187201429253, "acc_norm": 0.5442739079102715, "acc_norm_stderr": 0.017122829143292655 }
+    },
+    "versions": {
+        "all": 0, "harness|arc_challenge|25": 0, "harness|hellaswag|10": 0,
+        "harness|mmlu_world_religions|5": 1, "harness|mmlu_management|5": 1, "harness|mmlu_miscellaneous|5": 1,
+        "harness|mmlu_anatomy|5": 1, "harness|mmlu_abstract_algebra|5": 1, "harness|mmlu_conceptual_physics|5": 1,
+        "harness|mmlu_virology|5": 1, "harness|mmlu_philosophy|5": 1, "harness|mmlu_human_aging|5": 1,
+        "harness|mmlu_human_sexuality|5": 1, "harness|mmlu_medical_genetics|5": 1, "harness|mmlu_high_school_geography|5": 1,
+        "harness|mmlu_electrical_engineering|5": 1, "harness|mmlu_college_physics|5": 1, "harness|mmlu_high_school_microeconomics|5": 1,
+        "harness|mmlu_high_school_macroeconomics|5": 1, "harness|mmlu_computer_security|5": 1, "harness|mmlu_global_facts|5": 1,
+        "harness|mmlu_jurisprudence|5": 1, "harness|mmlu_high_school_chemistry|5": 1, "harness|mmlu_high_school_biology|5": 1,
+        "harness|mmlu_marketing|5": 1, "harness|mmlu_clinical_knowledge|5": 1, "harness|mmlu_public_relations|5": 1,
+        "harness|mmlu_high_school_mathematics|5": 1, "harness|mmlu_high_school_physics|5": 1, "harness|mmlu_sociology|5": 1,
+        "harness|mmlu_college_medicine|5": 1, "harness|mmlu_elementary_mathematics|5": 1, "harness|mmlu_college_biology|5": 1,
+        "harness|mmlu_college_chemistry|5": 1, "harness|mmlu_us_foreign_policy|5": 1, "harness|mmlu_moral_disputes|5": 1,
+        "harness|mmlu_logical_fallacies|5": 1, "harness|mmlu_prehistory|5": 1, "harness|mmlu_college_mathematics|5": 1,
+        "harness|mmlu_high_school_government_and_politics|5": 1, "harness|mmlu_econometrics|5": 1, "harness|mmlu_high_school_psychology|5": 1,
+        "harness|mmlu_formal_logic|5": 1, "harness|mmlu_nutrition|5": 1, "harness|mmlu_business_ethics|5": 1,
+        "harness|mmlu_international_law|5": 1, "harness|mmlu_astronomy|5": 1, "harness|mmlu_professional_psychology|5": 1,
+        "harness|mmlu_professional_accounting|5": 1, "harness|mmlu_machine_learning|5": 1, "harness|mmlu_high_school_statistics|5": 1,
+        "harness|mmlu_moral_scenarios|5": 1, "harness|mmlu_college_computer_science|5": 1, "harness|mmlu_high_school_computer_science|5": 1,
+        "harness|mmlu_professional_medicine|5": 1, "harness|mmlu_security_studies|5": 1, "harness|mmlu_high_school_world_history|5": 1,
+        "harness|mmlu_professional_law|5": 1, "harness|mmlu_high_school_us_history|5": 1, "harness|mmlu_high_school_european_history|5": 1,
+        "harness|truthfulqa_mc|0": 0, "harness|commongen_v2|2": 1
+    },
+    "config_general": {
+        "model_name": "x2bee/POLAR-14B-v0.2",
+        "model_sha": "8d905623a3972e11260420130039c62e115cbbaa",
+        "model_dtype": "torch.float16",
+        "lighteval_sha": "",
+        "num_few_shot_default": 0,
+        "num_fewshot_seeds": 1,
+        "override_batch_size": 1,
+        "max_samples": null
+    }
+}
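With several POLAR variants plus the earlier models now under eval-results/, the per-file scores can be compared directly. A minimal sketch of ranking every result file in the tree, reusing `average_score` from the sketch earlier in this commit; the glob pattern and the simple descending sort are assumptions here, not the Space's actual `src/populate.py` aggregation:

```python
import glob
import json

# Collect every result JSON under eval-results/ and rank models by the
# illustrative average defined earlier in this commit.
rows = []
for path in glob.glob("eval-results/**/*.json", recursive=True):
    with open(path, encoding="utf-8") as f:
        result = json.load(f)
    rows.append((result["config_general"]["model_name"], average_score(result)))

for name, score in sorted(rows, key=lambda item: item[1], reverse=True):
    print(f"{name:<40} {score:.4f}")
```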
eval-results/x2bee/POLAR-14B-v0.5/result.json
ADDED
@@ -0,0 +1,450 @@
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"daily": {
|
4 |
+
"daily": 1
|
5 |
+
},
|
6 |
+
"quarterly": {
|
7 |
+
"quarterly": 1
|
8 |
+
},
|
9 |
+
"harness|arc_challenge|25": {
|
10 |
+
"acc": 0.75,
|
11 |
+
"acc_stderr": 0.012653835621466646,
|
12 |
+
"acc_norm": 0.7798634812286689,
|
13 |
+
"acc_norm_stderr": 0.012108124883460988
|
14 |
+
},
|
15 |
+
"harness|hellaswag|10": {
|
16 |
+
"acc": 0.6500697072296355,
|
17 |
+
"acc_stderr": 0.004759729267943182,
|
18 |
+
"acc_norm": 0.775542720573591,
|
19 |
+
"acc_norm_stderr": 0.004163717220873764
|
20 |
+
},
|
21 |
+
"harness|mmlu_world_religions|5": {
|
22 |
+
"acc": 0.6374269005847953,
|
23 |
+
"acc_stderr": 0.036871306155620606,
|
24 |
+
"acc_norm": 0.6374269005847953,
|
25 |
+
"acc_norm_stderr": 0.036871306155620606
|
26 |
+
},
|
27 |
+
"harness|mmlu_management|5": {
|
28 |
+
"acc": 0.7087378640776699,
|
29 |
+
"acc_stderr": 0.044986763205729224,
|
30 |
+
"acc_norm": 0.7087378640776699,
|
31 |
+
"acc_norm_stderr": 0.044986763205729224
|
32 |
+
},
|
33 |
+
"harness|mmlu_miscellaneous|5": {
|
34 |
+
"acc": 0.6730523627075351,
|
35 |
+
"acc_stderr": 0.016774908180131484,
|
36 |
+
"acc_norm": 0.6730523627075351,
|
37 |
+
"acc_norm_stderr": 0.016774908180131484
|
38 |
+
},
|
39 |
+
"harness|mmlu_anatomy|5": {
|
40 |
+
"acc": 0.45185185185185184,
|
41 |
+
"acc_stderr": 0.04299268905480864,
|
42 |
+
"acc_norm": 0.45185185185185184,
|
43 |
+
"acc_norm_stderr": 0.04299268905480864
|
44 |
+
},
|
45 |
+
"harness|mmlu_abstract_algebra|5": {
|
46 |
+
"acc": 0.36,
|
47 |
+
"acc_stderr": 0.048241815132442176,
|
48 |
+
"acc_norm": 0.36,
|
49 |
+
"acc_norm_stderr": 0.048241815132442176
|
50 |
+
},
|
51 |
+
"harness|mmlu_conceptual_physics|5": {
|
52 |
+
"acc": 0.4723404255319149,
|
53 |
+
"acc_stderr": 0.03263597118409769,
|
54 |
+
"acc_norm": 0.4723404255319149,
|
55 |
+
"acc_norm_stderr": 0.03263597118409769
|
56 |
+
},
|
57 |
+
"harness|mmlu_virology|5": {
|
58 |
+
"acc": 0.46987951807228917,
|
59 |
+
"acc_stderr": 0.03885425420866766,
|
60 |
+
"acc_norm": 0.46987951807228917,
|
61 |
+
"acc_norm_stderr": 0.03885425420866766
|
62 |
+
},
|
63 |
+
"harness|mmlu_philosophy|5": {
|
64 |
+
"acc": 0.594855305466238,
|
65 |
+
"acc_stderr": 0.027882383791325963,
|
66 |
+
"acc_norm": 0.594855305466238,
|
67 |
+
"acc_norm_stderr": 0.027882383791325963
|
68 |
+
},
|
69 |
+
"harness|mmlu_human_aging|5": {
|
70 |
+
"acc": 0.6412556053811659,
|
71 |
+
"acc_stderr": 0.032190792004199956,
|
72 |
+
"acc_norm": 0.6412556053811659,
|
73 |
+
"acc_norm_stderr": 0.032190792004199956
|
74 |
+
},
|
75 |
+
"harness|mmlu_human_sexuality|5": {
|
76 |
+
"acc": 0.5954198473282443,
|
77 |
+
"acc_stderr": 0.043046937953806645,
|
78 |
+
"acc_norm": 0.5954198473282443,
|
79 |
+
"acc_norm_stderr": 0.043046937953806645
|
80 |
+
},
|
81 |
+
"harness|mmlu_medical_genetics|5": {
|
82 |
+
"acc": 0.5,
|
83 |
+
"acc_stderr": 0.050251890762960605,
|
84 |
+
"acc_norm": 0.5,
|
85 |
+
"acc_norm_stderr": 0.050251890762960605
|
86 |
+
},
|
87 |
+
"harness|mmlu_high_school_geography|5": {
|
88 |
+
"acc": 0.7272727272727273,
|
89 |
+
"acc_stderr": 0.03173071239071724,
|
90 |
+
"acc_norm": 0.7272727272727273,
|
91 |
+
"acc_norm_stderr": 0.03173071239071724
|
92 |
+
},
|
93 |
+
"harness|mmlu_electrical_engineering|5": {
|
94 |
+
"acc": 0.503448275862069,
|
95 |
+
"acc_stderr": 0.0416656757710158,
|
96 |
+
"acc_norm": 0.503448275862069,
|
97 |
+
"acc_norm_stderr": 0.0416656757710158
|
98 |
+
},
|
99 |
+
"harness|mmlu_college_physics|5": {
|
100 |
+
"acc": 0.3431372549019608,
|
101 |
+
"acc_stderr": 0.04724007352383888,
|
102 |
+
"acc_norm": 0.3431372549019608,
|
103 |
+
"acc_norm_stderr": 0.04724007352383888
|
104 |
+
},
|
105 |
+
"harness|mmlu_high_school_microeconomics|5": {
|
106 |
+
"acc": 0.6596638655462185,
|
107 |
+
"acc_stderr": 0.03077805742293167,
|
108 |
+
"acc_norm": 0.6596638655462185,
|
109 |
+
"acc_norm_stderr": 0.03077805742293167
|
110 |
+
},
|
111 |
+
"harness|mmlu_high_school_macroeconomics|5": {
|
112 |
+
"acc": 0.6102564102564103,
|
113 |
+
"acc_stderr": 0.024726967886647078,
|
114 |
+
"acc_norm": 0.6102564102564103,
|
115 |
+
"acc_norm_stderr": 0.024726967886647078
|
116 |
+
},
|
117 |
+
"harness|mmlu_computer_security|5": {
|
118 |
+
"acc": 0.67,
|
119 |
+
"acc_stderr": 0.047258156262526094,
|
120 |
+
"acc_norm": 0.67,
|
121 |
+
"acc_norm_stderr": 0.047258156262526094
|
122 |
+
},
|
123 |
+
"harness|mmlu_global_facts|5": {
|
124 |
+
"acc": 0.33,
|
125 |
+
"acc_stderr": 0.047258156262526045,
|
126 |
+
"acc_norm": 0.33,
|
127 |
+
"acc_norm_stderr": 0.047258156262526045
|
128 |
+
},
|
129 |
+
"harness|mmlu_jurisprudence|5": {
|
130 |
+
"acc": 0.6481481481481481,
|
131 |
+
"acc_stderr": 0.04616631111801714,
|
132 |
+
"acc_norm": 0.6481481481481481,
|
133 |
+
"acc_norm_stderr": 0.04616631111801714
|
134 |
+
},
|
135 |
+
"harness|mmlu_high_school_chemistry|5": {
|
136 |
+
"acc": 0.4729064039408867,
|
137 |
+
"acc_stderr": 0.03512819077876105,
|
138 |
+
"acc_norm": 0.4729064039408867,
|
139 |
+
"acc_norm_stderr": 0.03512819077876105
|
140 |
+
},
|
141 |
+
"harness|mmlu_high_school_biology|5": {
|
142 |
+
"acc": 0.5709677419354838,
|
143 |
+
"acc_stderr": 0.028156036538233193,
|
144 |
+
"acc_norm": 0.5709677419354838,
|
145 |
+
"acc_norm_stderr": 0.028156036538233193
|
146 |
+
},
|
147 |
+
"harness|mmlu_marketing|5": {
|
148 |
+
"acc": 0.7735042735042735,
|
149 |
+
"acc_stderr": 0.027421007295392943,
|
150 |
+
"acc_norm": 0.7735042735042735,
|
151 |
+
"acc_norm_stderr": 0.027421007295392943
|
152 |
+
},
|
153 |
+
"harness|mmlu_clinical_knowledge|5": {
|
154 |
+
"acc": 0.5660377358490566,
|
155 |
+
"acc_stderr": 0.030503292013342596,
|
156 |
+
"acc_norm": 0.5660377358490566,
|
157 |
+
"acc_norm_stderr": 0.030503292013342596
|
158 |
+
},
|
159 |
+
"harness|mmlu_public_relations|5": {
|
160 |
+
"acc": 0.6272727272727273,
|
161 |
+
"acc_stderr": 0.04631381319425465,
|
162 |
+
"acc_norm": 0.6272727272727273,
|
163 |
+
"acc_norm_stderr": 0.04631381319425465
|
164 |
+
},
|
165 |
+
"harness|mmlu_high_school_mathematics|5": {
|
166 |
+
"acc": 0.3333333333333333,
|
167 |
+
"acc_stderr": 0.0287420409039485,
|
168 |
+
"acc_norm": 0.3333333333333333,
|
169 |
+
"acc_norm_stderr": 0.0287420409039485
|
170 |
+
},
|
171 |
+
"harness|mmlu_high_school_physics|5": {
|
172 |
+
"acc": 0.39072847682119205,
|
173 |
+
"acc_stderr": 0.039837983066598075,
|
174 |
+
"acc_norm": 0.39072847682119205,
|
175 |
+
"acc_norm_stderr": 0.039837983066598075
|
176 |
+
},
|
177 |
+
"harness|mmlu_sociology|5": {
|
178 |
+
"acc": 0.6417910447761194,
|
179 |
+
"acc_stderr": 0.03390393042268814,
|
180 |
+
"acc_norm": 0.6417910447761194,
|
181 |
+
"acc_norm_stderr": 0.03390393042268814
|
182 |
+
},
|
183 |
+
"harness|mmlu_college_medicine|5": {
|
184 |
+
"acc": 0.5028901734104047,
|
185 |
+
"acc_stderr": 0.038124005659748335,
|
186 |
+
"acc_norm": 0.5028901734104047,
|
187 |
+
"acc_norm_stderr": 0.038124005659748335
|
188 |
+
},
|
189 |
+
"harness|mmlu_elementary_mathematics|5": {
|
190 |
+
"acc": 0.42857142857142855,
|
191 |
+
"acc_stderr": 0.025487187147859372,
|
192 |
+
"acc_norm": 0.42857142857142855,
|
193 |
+
"acc_norm_stderr": 0.025487187147859372
|
194 |
+
},
|
195 |
+
"harness|mmlu_college_biology|5": {
|
196 |
+
"acc": 0.6180555555555556,
|
197 |
+
"acc_stderr": 0.040629907841466674,
|
198 |
+
"acc_norm": 0.6180555555555556,
|
199 |
+
"acc_norm_stderr": 0.040629907841466674
|
200 |
+
},
|
201 |
+
"harness|mmlu_college_chemistry|5": {
|
202 |
+
"acc": 0.3,
|
203 |
+
"acc_stderr": 0.046056618647183814,
|
204 |
+
"acc_norm": 0.3,
|
205 |
+
"acc_norm_stderr": 0.046056618647183814
|
206 |
+
},
|
207 |
+
"harness|mmlu_us_foreign_policy|5": {
|
208 |
+
"acc": 0.72,
|
209 |
+
"acc_stderr": 0.04512608598542127,
|
210 |
+
"acc_norm": 0.72,
|
211 |
+
"acc_norm_stderr": 0.04512608598542127
|
212 |
+
},
|
213 |
+
"harness|mmlu_moral_disputes|5": {
|
214 |
+
"acc": 0.5809248554913294,
|
215 |
+
"acc_stderr": 0.026564178111422622,
|
216 |
+
"acc_norm": 0.5809248554913294,
|
217 |
+
"acc_norm_stderr": 0.026564178111422622
|
218 |
+
},
|
219 |
+
"harness|mmlu_logical_fallacies|5": {
|
220 |
+
"acc": 0.6257668711656442,
|
221 |
+
"acc_stderr": 0.03802068102899615,
|
222 |
+
"acc_norm": 0.6257668711656442,
|
223 |
+
"acc_norm_stderr": 0.03802068102899615
|
224 |
+
},
|
225 |
+
"harness|mmlu_prehistory|5": {
|
226 |
+
"acc": 0.5987654320987654,
|
227 |
+
"acc_stderr": 0.027272582849839803,
|
228 |
+
"acc_norm": 0.5987654320987654,
|
229 |
+
"acc_norm_stderr": 0.027272582849839803
|
230 |
+
},
|
231 |
+
"harness|mmlu_college_mathematics|5": {
|
232 |
+
"acc": 0.34,
|
233 |
+
"acc_stderr": 0.04760952285695235,
|
234 |
+
"acc_norm": 0.34,
|
235 |
+
"acc_norm_stderr": 0.04760952285695235
|
236 |
+
},
|
237 |
+
"harness|mmlu_high_school_government_and_politics|5": {
|
238 |
+
"acc": 0.7512953367875648,
|
239 |
+
"acc_stderr": 0.031195840877700304,
|
240 |
+
"acc_norm": 0.7512953367875648,
|
241 |
+
"acc_norm_stderr": 0.031195840877700304
|
242 |
+
},
|
243 |
+
"harness|mmlu_econometrics|5": {
|
244 |
+
"acc": 0.47368421052631576,
|
245 |
+
"acc_stderr": 0.046970851366478626,
|
246 |
+
"acc_norm": 0.47368421052631576,
|
247 |
+
"acc_norm_stderr": 0.046970851366478626
|
248 |
+
},
|
249 |
+
"harness|mmlu_high_school_psychology|5": {
|
250 |
+
"acc": 0.7229357798165138,
|
251 |
+
"acc_stderr": 0.019188482590169538,
|
252 |
+
"acc_norm": 0.7229357798165138,
|
253 |
+
"acc_norm_stderr": 0.019188482590169538
|
254 |
+
},
|
255 |
+
"harness|mmlu_formal_logic|5": {
|
256 |
+
"acc": 0.4523809523809524,
|
257 |
+
"acc_stderr": 0.044518079590553275,
|
258 |
+
"acc_norm": 0.4523809523809524,
|
259 |
+
"acc_norm_stderr": 0.044518079590553275
|
260 |
+
},
|
261 |
+
"harness|mmlu_nutrition|5": {
|
262 |
+
"acc": 0.5718954248366013,
|
263 |
+
"acc_stderr": 0.028332397483664278,
|
264 |
+
"acc_norm": 0.5718954248366013,
|
265 |
+
"acc_norm_stderr": 0.028332397483664278
|
266 |
+
},
|
267 |
+
"harness|mmlu_business_ethics|5": {
|
268 |
+
"acc": 0.68,
|
269 |
+
"acc_stderr": 0.04688261722621504,
|
270 |
+
"acc_norm": 0.68,
|
271 |
+
"acc_norm_stderr": 0.04688261722621504
|
272 |
+
},
|
273 |
+
"harness|mmlu_international_law|5": {
|
274 |
+
"acc": 0.7520661157024794,
|
275 |
+
"acc_stderr": 0.039418975265163025,
|
276 |
+
"acc_norm": 0.7520661157024794,
|
277 |
+
"acc_norm_stderr": 0.039418975265163025
|
278 |
+
},
|
279 |
+
"harness|mmlu_astronomy|5": {
|
280 |
+
"acc": 0.618421052631579,
|
281 |
+
"acc_stderr": 0.03953173377749194,
|
282 |
+
"acc_norm": 0.618421052631579,
|
283 |
+
"acc_norm_stderr": 0.03953173377749194
|
284 |
+
},
|
285 |
+
"harness|mmlu_professional_psychology|5": {
|
286 |
+
"acc": 0.5408496732026143,
|
287 |
+
"acc_stderr": 0.020160213617222516,
|
288 |
+
"acc_norm": 0.5408496732026143,
|
289 |
+
"acc_norm_stderr": 0.020160213617222516
|
290 |
+
},
|
291 |
+
"harness|mmlu_professional_accounting|5": {
|
292 |
+
"acc": 0.45390070921985815,
|
293 |
+
"acc_stderr": 0.029700453247291463,
|
294 |
+
"acc_norm": 0.45390070921985815,
|
295 |
+
"acc_norm_stderr": 0.029700453247291463
|
296 |
+
},
|
297 |
+
"harness|mmlu_machine_learning|5": {
|
298 |
+
"acc": 0.44642857142857145,
|
299 |
+
"acc_stderr": 0.04718471485219588,
|
300 |
+
"acc_norm": 0.44642857142857145,
|
301 |
+
"acc_norm_stderr": 0.04718471485219588
|
302 |
+
},
|
303 |
+
"harness|mmlu_high_school_statistics|5": {
|
304 |
+
"acc": 0.5416666666666666,
|
305 |
+
"acc_stderr": 0.03398110890294636,
|
306 |
+
"acc_norm": 0.5416666666666666,
|
307 |
+
"acc_norm_stderr": 0.03398110890294636
|
308 |
+
},
|
309 |
+
"harness|mmlu_moral_scenarios|5": {
|
310 |
+
"acc": 0.35195530726256985,
|
311 |
+
"acc_stderr": 0.01597266852368907,
|
312 |
+
"acc_norm": 0.35195530726256985,
|
313 |
+
"acc_norm_stderr": 0.01597266852368907
|
314 |
+
},
|
315 |
+
"harness|mmlu_college_computer_science|5": {
|
316 |
+
"acc": 0.44,
|
317 |
+
"acc_stderr": 0.0498887651569859,
|
318 |
+
"acc_norm": 0.44,
|
319 |
+
"acc_norm_stderr": 0.0498887651569859
|
320 |
+
},
|
321 |
+
"harness|mmlu_high_school_computer_science|5": {
|
322 |
+
"acc": 0.68,
|
323 |
+
"acc_stderr": 0.04688261722621503,
|
324 |
+
"acc_norm": 0.68,
|
325 |
+
"acc_norm_stderr": 0.04688261722621503
|
326 |
+
},
|
327 |
+
"harness|mmlu_professional_medicine|5": {
|
328 |
+
"acc": 0.5147058823529411,
|
329 |
+
"acc_stderr": 0.03035969707904612,
|
330 |
+
"acc_norm": 0.5147058823529411,
|
331 |
+
"acc_norm_stderr": 0.03035969707904612
|
332 |
+
},
|
333 |
+
"harness|mmlu_security_studies|5": {
|
334 |
+
"acc": 0.6122448979591837,
|
335 |
+
"acc_stderr": 0.031192230726795656,
|
336 |
+
"acc_norm": 0.6122448979591837,
|
337 |
+
"acc_norm_stderr": 0.031192230726795656
|
338 |
+
},
|
339 |
+
"harness|mmlu_high_school_world_history|5": {
|
340 |
+
"acc": 0.7215189873417721,
|
341 |
+
"acc_stderr": 0.029178682304842538,
|
342 |
+
"acc_norm": 0.7215189873417721,
|
343 |
+
"acc_norm_stderr": 0.029178682304842538
|
344 |
+
},
|
345 |
+
"harness|mmlu_professional_law|5": {
|
346 |
+
"acc": 0.4634941329856584,
|
347 |
+
"acc_stderr": 0.012736153390214963,
|
348 |
+
"acc_norm": 0.4634941329856584,
|
349 |
+
"acc_norm_stderr": 0.012736153390214963
|
350 |
+
},
|
351 |
+
"harness|mmlu_high_school_us_history|5": {
|
352 |
+
"acc": 0.6568627450980392,
|
353 |
+
"acc_stderr": 0.03332139944668086,
|
354 |
+
"acc_norm": 0.6568627450980392,
|
355 |
+
"acc_norm_stderr": 0.03332139944668086
|
356 |
+
},
|
357 |
+
"harness|mmlu_high_school_european_history|5": {
|
358 |
+
"acc": 0.5818181818181818,
|
359 |
+
"acc_stderr": 0.03851716319398393,
|
360 |
+
"acc_norm": 0.5818181818181818,
|
361 |
+
"acc_norm_stderr": 0.03851716319398393
|
362 |
+
},
|
363 |
+
"harness|truthfulqa_mc|0": {
|
364 |
+
"mc1": 0.7833537331701347,
|
365 |
+
"mc1_stderr": 0.014421468452506978,
|
366 |
+
"mc2": 0.8572574997405501,
|
367 |
+
"mc2_stderr": 0.01200311225898601
|
368 |
+
},
|
369 |
+
"harness|commongen_v2|2": {
|
370 |
+
"acc": 0.5159386068476978,
|
371 |
+
"acc_stderr": 0.017181617837190195,
|
372 |
+
"acc_norm": 0.5301062573789846,
|
373 |
+
"acc_norm_stderr": 0.01715916359017022
|
374 |
+
}
|
375 |
+
},
|
376 |
+
"versions": {
|
377 |
+
"all": 0,
|
378 |
+
"harness|arc_challenge|25": 0,
|
379 |
+
"harness|hellaswag|10": 0,
|
380 |
+
"harness|mmlu_world_religions|5": 1,
|
381 |
+
"harness|mmlu_management|5": 1,
|
382 |
+
"harness|mmlu_miscellaneous|5": 1,
|
383 |
+
"harness|mmlu_anatomy|5": 1,
|
384 |
+
"harness|mmlu_abstract_algebra|5": 1,
|
385 |
+
"harness|mmlu_conceptual_physics|5": 1,
|
386 |
+
"harness|mmlu_virology|5": 1,
|
387 |
+
"harness|mmlu_philosophy|5": 1,
|
388 |
+
"harness|mmlu_human_aging|5": 1,
|
389 |
+
"harness|mmlu_human_sexuality|5": 1,
|
390 |
+
"harness|mmlu_medical_genetics|5": 1,
|
391 |
+
"harness|mmlu_high_school_geography|5": 1,
|
392 |
+
"harness|mmlu_electrical_engineering|5": 1,
|
393 |
+
"harness|mmlu_college_physics|5": 1,
|
394 |
+
"harness|mmlu_high_school_microeconomics|5": 1,
|
395 |
+
"harness|mmlu_high_school_macroeconomics|5": 1,
|
396 |
+
"harness|mmlu_computer_security|5": 1,
|
397 |
+
"harness|mmlu_global_facts|5": 1,
|
398 |
+
"harness|mmlu_jurisprudence|5": 1,
|
399 |
+
"harness|mmlu_high_school_chemistry|5": 1,
|
400 |
+
"harness|mmlu_high_school_biology|5": 1,
|
401 |
+
"harness|mmlu_marketing|5": 1,
|
402 |
+
"harness|mmlu_clinical_knowledge|5": 1,
|
403 |
+
"harness|mmlu_public_relations|5": 1,
|
404 |
+
"harness|mmlu_high_school_mathematics|5": 1,
|
405 |
+
"harness|mmlu_high_school_physics|5": 1,
|
406 |
+
"harness|mmlu_sociology|5": 1,
|
407 |
+
"harness|mmlu_college_medicine|5": 1,
|
408 |
+
"harness|mmlu_elementary_mathematics|5": 1,
|
409 |
+
"harness|mmlu_college_biology|5": 1,
|
410 |
+
"harness|mmlu_college_chemistry|5": 1,
|
411 |
+
"harness|mmlu_us_foreign_policy|5": 1,
|
412 |
+
"harness|mmlu_moral_disputes|5": 1,
|
413 |
+
"harness|mmlu_logical_fallacies|5": 1,
|
414 |
+
"harness|mmlu_prehistory|5": 1,
|
415 |
+
"harness|mmlu_college_mathematics|5": 1,
|
416 |
+
"harness|mmlu_high_school_government_and_politics|5": 1,
|
417 |
+
"harness|mmlu_econometrics|5": 1,
|
418 |
+
"harness|mmlu_high_school_psychology|5": 1,
|
419 |
+
"harness|mmlu_formal_logic|5": 1,
|
420 |
+
"harness|mmlu_nutrition|5": 1,
|
421 |
+
"harness|mmlu_business_ethics|5": 1,
|
422 |
+
"harness|mmlu_international_law|5": 1,
|
423 |
+
"harness|mmlu_astronomy|5": 1,
|
424 |
+
"harness|mmlu_professional_psychology|5": 1,
|
425 |
+
"harness|mmlu_professional_accounting|5": 1,
|
426 |
+
"harness|mmlu_machine_learning|5": 1,
|
427 |
+
"harness|mmlu_high_school_statistics|5": 1,
|
428 |
+
"harness|mmlu_moral_scenarios|5": 1,
|
429 |
+
"harness|mmlu_college_computer_science|5": 1,
|
430 |
+
"harness|mmlu_high_school_computer_science|5": 1,
|
431 |
+
"harness|mmlu_professional_medicine|5": 1,
|
432 |
+
"harness|mmlu_security_studies|5": 1,
|
433 |
+
"harness|mmlu_high_school_world_history|5": 1,
|
434 |
+
"harness|mmlu_professional_law|5": 1,
|
435 |
+
"harness|mmlu_high_school_us_history|5": 1,
|
436 |
+
"harness|mmlu_high_school_european_history|5": 1,
|
437 |
+
"harness|truthfulqa_mc|0": 0,
|
438 |
+
"harness|commongen_v2|2": 1
|
439 |
+
},
|
440 |
+
"config_general": {
|
441 |
+
"model_name": "x2bee/POLAR-14B-v0.5",
|
442 |
+
"model_sha": "74a1ef65a8d650e5358be229def31688738d8c6a",
|
443 |
+
"model_dtype": "torch.float16",
|
444 |
+
"lighteval_sha": "",
|
445 |
+
"num_few_shot_default": 0,
|
446 |
+
"num_fewshot_seeds": 1,
|
447 |
+
"override_batch_size": 1,
|
448 |
+
"max_samples": null
|
449 |
+
}
|
450 |
+
}
|
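A result file like the one above stores one entry per MMLU subject; the leaderboard later collapses these into a single MMLU score by averaging the chosen metric over every result key that contains the benchmark name (see `src/leaderboard/read_evals.py` further down). A minimal sketch of that aggregation, assuming a `result.json` shaped like this file:

```python
import json

import numpy as np

# Minimal sketch, assuming a result.json laid out like the file above.
with open("eval-results/x2bee/POLAR-14B-v0.5/result.json") as fp:
    data = json.load(fp)

# Average "acc" over every "harness|mmlu_*" entry, mirroring what
# read_evals.py does for the mmlu benchmark.
accs = np.array([v["acc"] for k, v in data["results"].items() if "mmlu" in k])
print(round(float(np.mean(accs)) * 100.0, 2))  # single MMLU score, in percent
```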
requirements.txt
ADDED
@@ -0,0 +1,19 @@
+APScheduler==3.10.1
+black==23.11.0
+click==8.1.3
+datasets==2.14.5
+gradio==4.19.2
+gradio_client==0.10.1
+huggingface-hub>=0.18.0
+matplotlib==3.7.1
+numpy==1.24.2
+pandas==2.0.0
+plotly==5.14.1
+python-dateutil==2.8.2
+requests==2.28.2
+sentencepiece
+tqdm==4.65.0
+transformers==4.38.2
+tokenizers>=0.15.0
+gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/[email protected] # CI !!!
+torch
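Most of the pins above are exact, so a mismatched local environment is a common source of runtime errors. A quick check against the exact `==` pins, using only the standard library (the `requirements.txt` path is assumed to be the repo root; `>=` ranges and the VCS requirement are skipped):

```python
from importlib.metadata import PackageNotFoundError, version

# Compare installed versions against the exact "==" pins in requirements.txt.
for line in open("requirements.txt"):
    line = line.split("#")[0].strip()  # drop trailing comments like "# CI !!!"
    if "==" not in line:
        continue  # skips ">=" ranges, bare names, and VCS requirements
    name, wanted = line.split("==")
    try:
        installed = version(name)
    except PackageNotFoundError:
        installed = "missing"
    if installed != wanted:
        print(f"{name}: wanted {wanted}, found {installed}")
```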
scripts/create_request_file.py
ADDED
@@ -0,0 +1,107 @@
+import json
+import os
+import pprint
+import re
+from datetime import datetime, timezone
+
+import click
+from colorama import Fore
+from huggingface_hub import HfApi, snapshot_download
+
+EVAL_REQUESTS_PATH = "eval-queue"
+QUEUE_REPO = "open-ko-llm-leaderboard/requests"
+
+precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
+model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
+weight_types = ("Original", "Delta", "Adapter")
+
+
+def get_model_size(model_info, precision: str):
+    size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
+    try:
+        model_size = round(model_info.safetensors["total"] / 1e9, 3)
+    except (AttributeError, TypeError):
+        try:
+            size_match = re.search(size_pattern, model_info.modelId.lower())
+            model_size = size_match.group(0)
+            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
+        except AttributeError:
+            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
+
+    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
+    model_size = size_factor * model_size
+    return model_size
+
+
+def main():
+    api = HfApi()
+    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset")
+
+    model_name = click.prompt("Enter model name")
+    revision = click.prompt("Enter revision", default="main")
+    precision = click.prompt("Enter precision", default="float16", type=click.Choice(precisions))
+    model_type = click.prompt("Enter model type", type=click.Choice(model_types))
+    weight_type = click.prompt("Enter weight type", default="Original", type=click.Choice(weight_types))
+    base_model = click.prompt("Enter base model", default="")
+    status = click.prompt("Enter status", default="FINISHED")
+
+    try:
+        model_info = api.model_info(repo_id=model_name, revision=revision)
+    except Exception as e:
+        print(f"{Fore.RED}Could not find model info for {model_name} on the Hub\n{e}{Fore.RESET}")
+        return 1
+
+    model_size = get_model_size(model_info=model_info, precision=precision)
+
+    try:
+        license = model_info.cardData["license"]
+    except Exception:
+        license = "?"
+
+    eval_entry = {
+        "model": model_name,
+        "base_model": base_model,
+        "revision": revision,
+        "private": False,
+        "precision": precision,
+        "weight_type": weight_type,
+        "status": status,
+        "submitted_time": current_time,
+        "model_type": model_type,
+        "likes": model_info.likes,
+        "params": model_size,
+        "license": license,
+    }
+
+    user_name = ""
+    model_path = model_name
+    if "/" in model_name:
+        user_name = model_name.split("/")[0]
+        model_path = model_name.split("/")[1]
+
+    pprint.pprint(eval_entry)
+
+    if click.confirm("Do you want to continue? This request file will be pushed to the hub"):
+        click.echo("continuing...")
+
+        out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
+        os.makedirs(out_dir, exist_ok=True)
+        out_path = f"{out_dir}/{model_path}_eval_request_{False}_{precision}_{weight_type}.json"
+
+        with open(out_path, "w") as f:
+            f.write(json.dumps(eval_entry))
+
+        api.upload_file(
+            path_or_fileobj=out_path,
+            path_in_repo=out_path.split(f"{EVAL_REQUESTS_PATH}/")[1],
+            repo_id=QUEUE_REPO,
+            repo_type="dataset",
+            commit_message=f"Add {model_name} to eval queue",
+        )
+    else:
+        click.echo("aborting...")
+
+
+if __name__ == "__main__":
+    main()
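When the Hub does not report safetensors metadata, `get_model_size` falls back to parsing a parameter count out of the repo name. A worked example of that fallback, using a hypothetical model id rather than a real submission:

```python
import re

# Hypothetical repo name, for illustration only.
model_id = "some-org/awesome-13b-chat"

size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")   # same pattern as the script
size = re.search(size_pattern, model_id.lower()).group(0)  # "13b"

# A trailing "b" keeps the value in billions; a trailing "m" divides by 1e3,
# so "350m" would come out as 0.35.
params_b = float(size[:-1]) if size[-1] == "b" else float(size[:-1]) / 1e3
print(round(params_b, 3))  # 13.0 (billions of parameters)
```

The resulting request file would then land at `eval-queue/some-org/awesome-13b-chat_eval_request_False_float16_Original.json`, matching the naming pattern of the queue files committed above.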
scripts/update_request_files.py
ADDED
@@ -0,0 +1,82 @@
+import json
+import os
+import glob
+import pprint
+import re
+from datetime import datetime, timezone
+
+import click
+from colorama import Fore
+from huggingface_hub import HfApi, snapshot_download
+from huggingface_hub.hf_api import ModelInfo
+
+API = HfApi()
+
+
+def get_model_size(model_info: ModelInfo, precision: str):
+    size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
+    try:
+        model_size = round(model_info.safetensors["total"] / 1e9, 3)
+    except (AttributeError, TypeError):
+        try:
+            size_match = re.search(size_pattern, model_info.modelId.split("/")[-1].lower())
+            model_size = size_match.group(0)
+            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
+        except AttributeError:
+            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
+
+    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.split("/")[-1].lower()) else 1
+    model_size = size_factor * model_size
+    return model_size
+
+
+def update_request_files(requests_path):
+    request_files = os.path.join(
+        requests_path, "*/*.json"
+    )
+    request_files = glob.glob(request_files)
+
+    request_files = sorted(request_files, reverse=True)
+    for tmp_request_file in request_files:
+        with open(tmp_request_file, "r") as f:
+            req_content = json.load(f)
+            new_req_content = add_model_info(req_content)
+
+            # if new content is different, update the file
+            if new_req_content != req_content:
+                with open(tmp_request_file, "w") as f:
+                    f.write(json.dumps(new_req_content, indent=4))
+
+def add_model_info(entry):
+
+    model = entry["model"]
+    revision = entry["revision"]
+
+    try:
+        model_info = API.model_info(repo_id=model, revision=revision)
+    except Exception:
+        print(f"Could not get model information for {model} revision {revision}")
+        return entry
+
+    new_entry = entry.copy()
+
+    model_size = get_model_size(model_info=model_info, precision='float16')
+    new_entry["params"] = model_size
+
+    new_entry["likes"] = model_info.likes
+
+    # Were the model card and license filled?
+    try:
+        license = model_info.cardData["license"]
+        new_entry["license"] = license
+    except Exception:
+        print(f"No license for {model} revision {revision}")
+
+    print(json.dumps(new_entry, indent=4))
+    return new_entry
+
+
+if __name__ == "__main__":
+    # update_request_files("/Users/sean/workspace/leaderboard/leaderboard-test-requests")
+    update_request_files("/Volumes/Data-case-sensitive/requests")
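The entry point above hard-codes a local path to the requests clone. A small, hedged variant that takes the path on the command line instead; the `--requests-path` flag name is an assumption, not part of the original script, and `update_request_files` is the function defined above:

```python
import argparse

# Sketch only: expose the requests directory as a CLI flag instead of a
# hard-coded local path. "--requests-path" is a hypothetical flag name.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--requests-path", required=True,
                        help="Local clone of the requests dataset")
    args = parser.parse_args()
    update_request_files(args.requests_path)
```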
src/__pycache__/envs.cpython-310.pyc
ADDED
Binary file (1.07 kB)
src/__pycache__/populate.cpython-310.pyc
ADDED
Binary file (2.93 kB)
src/display/__pycache__/about.cpython-310.pyc
ADDED
Binary file (5.36 kB)
src/display/__pycache__/css_html_js.cpython-310.pyc
ADDED
Binary file (1.69 kB)
src/display/__pycache__/formatting.cpython-310.pyc
ADDED
Binary file (1.78 kB)
src/display/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (5.94 kB)
src/display/about.py
ADDED
@@ -0,0 +1,84 @@
+from src.display.utils import ModelType
+
+
+TITLE = """<img src="https://i.postimg.cc/250G53CJ/src-display-SIL-logo.png" style="width:30%;display:block;margin-left:auto;margin-right:auto">"""
+
+INTRODUCTION_TEXT = f"""
+Welcome to the Self-Improving Leaderboard (SIL) - A Revolutionary Platform for Evaluating Large Language Models
+The SIL offers a dynamic approach to assessing and ranking open-source LLMs and chatbots. Our innovative system continuously updates test datasets and recalculates rankings daily, ensuring evaluations reflect the rapid evolution of language processing capabilities.
+Key Features:
+• Daily-refreshed test datasets
+• Adaptive ranking system
+• Real-world language processing challenges
+• Comprehensive model performance insights
+Explore our cutting-edge evaluation process, gain deep insights into model capabilities, and see how different LLMs compare in this ever-changing landscape.
+Ready to participate? Submit your model for evaluation on the 'Submit' page and join the forefront of LLM advancement. For a detailed look at our methodology, visit the 'About' page.
+The SIL is proudly developed and maintained by [Your Organization/Team Name]. Together, let's push the boundaries of language AI!
+"""
+
+LLM_BENCHMARKS_TEXT = f"""
+# How it works
+🔄 The Self-Improving Leaderboard (SIL) operates on a dynamic evaluation system that continuously evolves to reflect real-world language processing challenges. Here's an overview of our process:
+Daily Dataset Refresh
+Our system generates new test data daily from diverse, reputable sources.
+Advanced Large Language Models (LLMs) are utilized to synthesize additional relevant content.
+The dataset is divided into two sections:
+A primary dataset maintaining the integrity of sourced data
+A noise-injected dataset simulating real-world data complexities
+Model Evaluation
+Participating models are rigorously evaluated against the refreshed dataset every 24 hours.
+We employ a comprehensive set of metrics aligned with industry-standard benchmarks.
+Our evaluation framework is built on the Eleuther AI Language Model Evaluation Harness, ensuring a robust and consistent assessment.
+Ranking System
+Model rankings are updated daily based on their performance across various tasks.
+The leaderboard reflects not only the latest scores but also tracks consistency and adaptability over time.
+Quarterly Comprehensive Evaluation
+Every three months, we conduct an in-depth analysis of model performance.
+This evaluation considers long-term trends, adaptability to evolving data, and overall efficacy.
+Special recognition (e.g., medals or badges) may be awarded based on sustained excellence.
+By continuously refreshing our test data and evaluation criteria, SIL aims to provide a more accurate representation of model performance in real-world scenarios, driving innovation in the field of Natural Language Processing.
+
+## Icons
+{ModelType.PT.to_str(" : ")} model
+{ModelType.IFT.to_str(" : ")} model
+{ModelType.RL.to_str(" : ")} model
+If there is no icon, it indicates that there is insufficient information about the model.
+Please provide information about the model through an issue! 🤩
+
+## Details and Logs
+- Detailed numerical results in the `results` dataset: https://huggingface.co/datasets/junkim100/SIL_results
+- Community queries and running status in the `requests` dataset: https://huggingface.co/datasets/junkim100/SIL_requests
+"""
+
+EVALUATION_QUEUE_TEXT = f"""
+# Evaluation Queue for the 🔄 Self-Improving Leaderboard
+
+## <Some good practices before submitting a model>
+
+### 1️⃣ Make sure you can load your model and tokenizer using AutoClasses
+```python
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+config = AutoConfig.from_pretrained("your model name", revision=revision)
+model = AutoModel.from_pretrained("your model name", revision=revision)
+tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
+```
+
+If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
+
+⚠️ Make sure your model is public!
+
+⚠️ Make sure your model runs with [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness)
+
+
+### 2️⃣ Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
+It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
+
+### 3️⃣ Make sure your model has an open license!
+We'd love for as many people as possible to know they can use your model
+
+### 4️⃣ Fill up your model card
+When we add extra information about models to the leaderboard, it will be automatically taken from the model card
+
+## In case of model failure
+If your model is displayed in the `FAILED` category, its execution stopped. Make sure you have followed the above steps first. If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+"""
src/display/css_html_js.py
ADDED
@@ -0,0 +1,84 @@
+custom_css = """
+/* Hides the final AutoEvalColumn */
+#llm-benchmark-tab-table table td:last-child,
+#llm-benchmark-tab-table table th:last-child {
+    display: none;
+}
+
+/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+table td:first-child,
+table th:first-child {
+    max-width: 400px;
+    overflow: auto;
+    white-space: nowrap;
+}
+
+/* Full width space */
+.gradio-container {
+    max-width: 95%!important;
+}
+
+/* Text style and margins */
+.markdown-text {
+    font-size: 16px !important;
+}
+
+#models-to-add-text {
+    font-size: 18px !important;
+}
+
+#search-bar-table-box > div:first-child {
+    background: none;
+    border: none;
+}
+
+#search-bar {
+    padding: 0px;
+}
+
+.tab-buttons button {
+    font-size: 20px;
+}
+
+/* Filters style */
+#filter_type{
+    border: 0;
+    padding-left: 0;
+    padding-top: 0;
+}
+#filter_type label {
+    display: flex;
+}
+#filter_type label > span{
+    margin-top: var(--spacing-lg);
+    margin-right: 0.5em;
+}
+#filter_type label > .wrap{
+    width: 103px;
+}
+#filter_type label > .wrap .wrap-inner{
+    padding: 2px;
+}
+#filter_type label > .wrap .wrap-inner input{
+    width: 1px
+}
+#filter-columns-type{
+    border:0;
+    padding:0.5;
+}
+#filter-columns-size{
+    border:0;
+    padding:0.5;
+}
+#box-filter > .form{
+    border: 0
+}
+"""
+
+get_window_url_params = """
+function(url_params) {
+    const params = new URLSearchParams(window.location.search);
+    url_params = Object.fromEntries(params);
+    return url_params;
+}
+"""
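These two strings are meant to be handed to Gradio when the app is built. A minimal sketch of how `custom_css` and `get_window_url_params` would typically be wired in; the actual wiring lives in `app.py` and may differ, so treat the component names below as assumptions:

```python
import gradio as gr

from src.display.css_html_js import custom_css, get_window_url_params

# Sketch: pass the CSS at Blocks construction time, and run the JS snippet on
# page load to read URL query parameters into a hidden state component.
with gr.Blocks(css=custom_css) as demo:
    url_params = gr.JSON(visible=False)  # hypothetical holder for the params
    demo.load(lambda x: x, inputs=[url_params], outputs=[url_params],
              js=get_window_url_params)

demo.launch()
```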
src/display/formatting.py
ADDED
@@ -0,0 +1,40 @@
+import os
+from datetime import datetime, timezone
+
+from huggingface_hub import HfApi
+from huggingface_hub.hf_api import ModelInfo
+
+
+API = HfApi()
+
+def model_hyperlink(link, model_name):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+
+def make_clickable_model(model_name):
+    link = f"https://huggingface.co/{model_name}"
+
+    details_model_name = model_name.replace("/", "__")
+    details_link = f"https://huggingface.co/datasets/open-ko-llm-leaderboard/details_{details_model_name}"
+
+    return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
+
+
+def styled_error(error):
+    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
+
+
+def styled_warning(warn):
+    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
+
+
+def styled_message(message):
+    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
+
+
+def has_no_nan_values(df, columns):
+    return df[columns].notna().all(axis=1)
+
+
+def has_nan_values(df, columns):
+    return df[columns].isna().any(axis=1)
ADDED
@@ -0,0 +1,164 @@
|
+from dataclasses import dataclass, make_dataclass
+from enum import Enum
+
+import pandas as pd
+
+def fields(raw_class):
+    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
+
+
+@dataclass
+class Task:
+    benchmark: str
+    metric: str
+    col_name: str
+
+class Tasks(Enum):
+    arc = Task("arc_challenge", "acc_norm", "ARC")
+    hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
+    mmlu = Task("mmlu", "acc", "MMLU")
+    truthfulqa = Task("truthfulqa_mc", "mc2", "TruthfulQA")
+    # winogrande = Task("winogrande", "acc_norm", "Winogrande")
+    # gsm8k = Task("gsm8k", "acc_norm", "GSM8k")
+    commongen_v2 = Task("commongen_v2", "acc_norm", "CommonGen V2")
+    # eqBench = Task("eq_bench", "acc_norm", "EQ Bench")
+    # instFollow = Task("inst_follow", "acc_norm", "InstFollow")
+    # harmlessness = Task("harmlessness", "acc_norm", "Harmlessness")
+    # helpfulness = Task("helpfulness", "acc_norm", "Helpfulness")
+
+class Ranks(Enum):
+    daily = Task("daily", "daily", "Daily Rank")
+    quarterly = Task("quarterly", "quarterly", "Quarterly Rank")
+
+
+# These classes are for user facing column names,
+# to avoid having to change them all around the code
+# when a modif is needed
+@dataclass
+class ColumnContent:
+    name: str
+    type: str
+    displayed_by_default: bool
+    hidden: bool = False
+    never_hidden: bool = False
+    dummy: bool = False
+
+auto_eval_column_dict = []
+# Init
+auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+# Ranks
+auto_eval_column_dict.append(["daily", ColumnContent, ColumnContent("Daily Rank", "number", True)])
+auto_eval_column_dict.append(["quarterly", ColumnContent, ColumnContent("Quarterly Rank", "number", True)])
+# Scores
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+for task in Tasks:
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# Model information
+auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+auto_eval_column_dict.append(["merged", ColumnContent, ColumnContent("Merged", "bool", False)])
+auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, False)])
+# Dummy column for the search bar (hidden by the custom CSS)
+auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
+
+# We use make dataclass to dynamically fill the scores from Tasks
+AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+
+
+@dataclass(frozen=True)
+class EvalQueueColumn:  # Queue column
+    model = ColumnContent("model", "markdown", True)
+    revision = ColumnContent("revision", "str", True)
+    private = ColumnContent("private", "bool", True)
+    precision = ColumnContent("precision", "str", True)
+    weight_type = ColumnContent("weight_type", "str", "Original")
+    status = ColumnContent("status", "str", True)
+
+# Define the human baselines
+human_baseline_row = {
+    AutoEvalColumn.model.name: "<p>Human performance</p>",
+}
+
+@dataclass
+class ModelDetails:
+    name: str
+    symbol: str = ""  # emoji, only for the model type
+
+
+class ModelType(Enum):
+    PT = ModelDetails(name="pretrained", symbol="🟢")
+    # FT = ModelDetails(name="fine-tuned", symbol="🔶")
+    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
+    RL = ModelDetails(name="RL-tuned", symbol="🟦")
+    Unknown = ModelDetails(name="", symbol="?")
+
+    def to_str(self, separator=" "):
+        return f"{self.value.symbol}{separator}{self.value.name}"
+
+    @staticmethod
+    def from_str(type):
+        # if "fine-tuned" in type or "🔶" in type:
+        #     return ModelType.FT
+        if "pretrained" in type or "🟢" in type:
+            return ModelType.PT
+        if "RL-tuned" in type or "🟦" in type:
+            return ModelType.RL
+        if "instruction-tuned" in type or "⭕" in type:
+            return ModelType.IFT
+        return ModelType.Unknown
+
+class WeightType(Enum):
+    Adapter = ModelDetails("Adapter")
+    Original = ModelDetails("Original")
+    Delta = ModelDetails("Delta")
+
+class Precision(Enum):
+    float16 = ModelDetails("float16")
+    # bfloat16 = ModelDetails("bfloat16")
+    # qt_8bit = ModelDetails("8bit")
+    # qt_4bit = ModelDetails("4bit")
+    # qt_GPTQ = ModelDetails("GPTQ")
+    Unknown = ModelDetails("?")
+
+    def from_str(precision):
+        if precision in ["torch.float16", "float16"]:
+            return Precision.float16
+        if precision in ["8bit"]:
+            return Precision.qt_8bit
+        if precision in ["4bit"]:
+            return Precision.qt_4bit
+        if precision in ["GPTQ", "None"]:
+            return Precision.qt_GPTQ
+        return Precision.Unknown
+
+
+
+
+# Column selection
+COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
+COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+
+EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
+EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
+
+BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+
+NUMERIC_INTERVALS = {
+    "Unknown": pd.Interval(-1, 0, closed="right"),
+    "0~3B": pd.Interval(0, 3, closed="right"),
+    "3~7B": pd.Interval(3, 7.3, closed="right"),
+    "7~13B": pd.Interval(7.3, 13, closed="right"),
+    "13~35B": pd.Interval(13, 35, closed="right"),
+    "35~60B": pd.Interval(35, 60, closed="right"),
+    "60B+": pd.Interval(60, 10000, closed="right"),
+}
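`NUMERIC_INTERVALS` buckets models by parameter count for the size filter, and membership is a plain `in` test against `pd.Interval`. A quick sketch with an arbitrary 10.7B example size:

```python
import pandas as pd

# Subset of NUMERIC_INTERVALS, reproduced from utils.py above.
intervals = {
    "3~7B": pd.Interval(3, 7.3, closed="right"),
    "7~13B": pd.Interval(7.3, 13, closed="right"),
}

params = 10.7  # arbitrary example size, in billions
bucket = next(name for name, rng in intervals.items() if params in rng)
print(bucket)  # "7~13B"
```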
src/envs.py
ADDED
@@ -0,0 +1,32 @@
+import os
+
+from huggingface_hub import HfApi
+
+# clone / pull the lmeh eval data
+H4_TOKEN = os.environ.get("H4_TOKEN", None)
+
+REPO_ID = "junkim100/self-improving-leaderboard"
+QUEUE_REPO = "junkim100/SIL_requests"
+RESULTS_REPO = "junkim100/SIL_results"
+
+PRIVATE_QUEUE_REPO = "open-ko-llm-leaderboard/private-requests"
+PRIVATE_RESULTS_REPO = "open-ko-llm-leaderboard/private-results"
+
+IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
+
+CACHE_PATH = os.getenv("HF_HOME", ".")
+
+EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
+EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+
+EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
+EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
+
+PATH_TO_COLLECTION = "open-ko-llm-leaderboard/ko-llm-leaderboard-best-models-659c7e45a481ceea4c883506"
+
+# Rate limit variables
+RATE_LIMIT_PERIOD = 7
+RATE_LIMIT_QUOTA = 5
+HAS_HIGHER_RATE_LIMIT = []
+
+API = HfApi(token=H4_TOKEN)
src/leaderboard/__pycache__/filter_models.cpython-310.pyc
ADDED
Binary file (1.47 kB)
src/leaderboard/__pycache__/read_evals.cpython-310.pyc
ADDED
Binary file (7.78 kB)
src/leaderboard/filter_models.py
ADDED
@@ -0,0 +1,51 @@
+from src.display.formatting import model_hyperlink
+from src.display.utils import AutoEvalColumn
+
+# Models which have been flagged by users as being problematic for a reason or another
+# (Model name to forum discussion link)
+FLAGGED_MODELS = {
+    "merged": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
+    "TeamUNIVA/Komodo_7B_v0.1.0": "https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/44",
+}
+
+# Models which have been requested by orgs to not be submitted on the leaderboard
+DO_NOT_SUBMIT_MODELS = [
+]
+
+
+def flag_models(leaderboard_data: list[dict]):
+    for model_data in leaderboard_data:
+        # Merges are flagged automatically
+        if model_data[AutoEvalColumn.flagged.name] == True:
+            flag_key = "merged"
+        else:
+            flag_key = model_data["model_name_for_query"]
+
+        if flag_key in FLAGGED_MODELS:
+            issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
+            issue_link = model_hyperlink(
+                FLAGGED_MODELS[flag_key],
+                f"See discussion #{issue_num}",
+            )
+            model_data[
+                AutoEvalColumn.model.name
+            ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
+            model_data[AutoEvalColumn.flagged.name] = True
+        else:
+            model_data[AutoEvalColumn.flagged.name] = False
+
+
+def remove_forbidden_models(leaderboard_data: list[dict]):
+    indices_to_remove = []
+    for ix, model in enumerate(leaderboard_data):
+        if model["model_name_for_query"] in DO_NOT_SUBMIT_MODELS:
+            indices_to_remove.append(ix)
+
+    for ix in reversed(indices_to_remove):
+        leaderboard_data.pop(ix)
+    return leaderboard_data
+
+
+def filter_models(leaderboard_data: list[dict]):
+    leaderboard_data = remove_forbidden_models(leaderboard_data)
+    flag_models(leaderboard_data)
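A small illustration of the flagging pass, assuming `flag_models` and the `src` modules above are importable. The row below is fabricated for illustration, but the flagged model name is the one actually listed in `FLAGGED_MODELS`, and the dict keys mirror `AutoEvalColumn.model.name` ("Model") and `AutoEvalColumn.flagged.name` ("Flagged"):

```python
from src.leaderboard.filter_models import flag_models

# Fabricated leaderboard row for illustration only.
row = {
    "Model": "TeamUNIVA/Komodo_7B_v0.1.0",
    "model_name_for_query": "TeamUNIVA/Komodo_7B_v0.1.0",  # in FLAGGED_MODELS
    "Flagged": False,
}
flag_models([row])
print(row["Flagged"])  # True; the "Model" cell now carries "See discussion #44"
```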
src/leaderboard/read_evals.py
ADDED
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import glob
|
2 |
+
import json
|
3 |
+
import math
|
4 |
+
import os
|
5 |
+
from dataclasses import dataclass
|
6 |
+
|
7 |
+
import dateutil
|
8 |
+
import numpy as np
|
9 |
+
|
10 |
+
from huggingface_hub import ModelCard
|
11 |
+
|
12 |
+
from src.display.formatting import make_clickable_model
|
13 |
+
from src.display.utils import AutoEvalColumn, ModelType, Tasks, Ranks, Precision, WeightType
|
14 |
+
from src.submission.check_validity import is_model_on_hub, check_model_card
|
15 |
+
|
16 |
+
|
17 |
+
@dataclass
|
18 |
+
class EvalResult:
|
19 |
+
# Also see src.display.utils.AutoEvalColumn for what will be displayed.
|
20 |
+
eval_name: str # org_model_precision (uid)
|
21 |
+
full_model: str # org/model (path on hub)
|
22 |
+
org: str
|
23 |
+
model: str
|
24 |
+
revision: str # commit hash, "" if main
|
25 |
+
results: dict
|
26 |
+
precision: Precision = Precision.Unknown
|
27 |
+
model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
|
28 |
+
weight_type: WeightType = WeightType.Original # Original or Adapter
|
29 |
+
architecture: str = "Unknown" # From config file
|
30 |
+
license: str = "?"
|
31 |
+
likes: int = 0
|
32 |
+
num_params: int = 0
|
33 |
+
date: str = "" # submission date of request file
|
34 |
+
still_on_hub: bool = False
|
35 |
+
is_merge: bool = False
|
36 |
+
flagged: bool = False
|
37 |
+
|
38 |
+
@classmethod
|
    def init_from_json_file(self, json_filepath):
        """Inits the result from the specific model result file"""
        with open(json_filepath) as fp:
            data = json.load(fp)

        # We handle the legacy config format
        config = data.get("config", data.get("config_general", None))

        # Precision
        precision = Precision.from_str(config.get("model_dtype"))

        # Get model and org
        org_and_model = config.get("model_name", config.get("model_args", None))
        org_and_model = org_and_model.split("/", 1)

        if len(org_and_model) == 1:
            org = None
            model = org_and_model[0]
            result_key = f"{model}_{precision.value.name}"
        else:
            org = org_and_model[0]
            model = org_and_model[1]
            result_key = f"{org}_{model}_{precision.value.name}"
        full_model = "/".join(org_and_model)

        still_on_hub, error, model_config = is_model_on_hub(
            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
        )
        architecture = "?"
        if model_config is not None:
            architectures = getattr(model_config, "architectures", None)
            if architectures:
                architecture = ";".join(architectures)

        # If the model doesn't have a model card or a license, we consider it deleted
        if still_on_hub:
            try:
                if check_model_card(full_model)[0] is False:
                    still_on_hub = False
            except Exception:
                still_on_hub = False

        # Check whether the model is a merge
        is_merge_from_metadata = False
        flagged = False
        if still_on_hub:
            model_card = ModelCard.load(full_model)

            if model_card.data.tags:
                is_merge_from_metadata = "merge" in model_card.data.tags
            merge_keywords = ["mergekit", "merged model", "merge model", "merging", "Carbon"]
            # If the model is a merge but does not say so in its metadata, we flag it
            is_merge_from_model_card = any(keyword in model_card.text.lower() for keyword in merge_keywords)
            flagged = is_merge_from_model_card and not is_merge_from_metadata

        # Extract the results available in this file (some results are split across several files)
        results = {}
        for rank in Ranks:
            rank = rank.value
            if rank.benchmark in data["results"]:
                results[rank.benchmark] = data["results"][rank.benchmark][rank.metric]
        for task in Tasks:
            task = task.value

            # Some truthfulQA values are NaNs
            if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
                if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
                    results[task.benchmark] = 0.0
                    continue

            # New tasks have been added; skip them when their results are absent
            if task.benchmark in ["winogrande", "gsm8k", "eq_bench", "inst_follow", "harmlessness", "helpfulness"]:
                accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
                if accs.size == 0 or any(acc is None for acc in accs):
                    results[task.benchmark] = 0.0
                    continue

            # We average all scores for a given metric (mostly for mmlu)
            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
            if accs.size == 0 or any(acc is None for acc in accs):
                continue

            mean_acc = np.mean(accs) * 100.0
            results[task.benchmark] = mean_acc

        return self(
            eval_name=result_key,
            full_model=full_model,
            org=org,
            model=model,
            results=results,
            precision=precision,
            revision=config.get("model_sha", ""),
            still_on_hub=still_on_hub,
            architecture=architecture,
            is_merge=is_merge_from_metadata,
            flagged=flagged,
        )
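
A minimal sketch of the result file this parser consumes; the field names mirror the accesses above, while the benchmark/metric keys and all values are invented for illustration:

    example_result = {
        "config": {                               # or "config_general" in the legacy format
            "model_dtype": "torch.float16",       # parsed by Precision.from_str
            "model_name": "some-org/some-model",  # falls back to "model_args"
            "model_sha": "main",
        },
        "results": {
            "harness|truthfulqa:mc|0": {"mc2": 0.41},  # per-benchmark dict of metrics
        },
    }
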
    def update_with_request_file(self, requests_path):
        """Finds the relevant request file for the current model and updates info with it"""
        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)

        try:
            with open(request_file, "r") as f:
                request = json.load(f)
            self.model_type = ModelType.from_str(request.get("model_type", ""))
            self.weight_type = WeightType[request.get("weight_type", "Original")]
            self.license = request.get("license", "?")
            self.likes = request.get("likes", 0)
            self.num_params = request.get("params", 0)
            self.date = request.get("submitted_time", "")
        except Exception:
            print(f"Could not find request file for {self.org}/{self.model}")
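
A sketch of the eval-request JSON read above; the keys come from the `get()`/`[]` accesses in this method, in get_request_file_for_model, and in populate.py, while every value is invented:

    example_request = {
        "model": "some-org/some-model",
        "status": "FINISHED",        # get_request_file_for_model keeps only FINISHED runs
        "precision": "float16",
        "model_type": "pretrained",  # parsed by ModelType.from_str; the label is a guess
        "weight_type": "Original",
        "license": "apache-2.0",
        "likes": 0,
        "params": 7,
        "submitted_time": "2024-01-01T00:00:00Z",
        "revision": "main",
    }
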
    def to_dict(self):
        """Converts the Eval Result to a dict compatible with our dataframe display"""

        # Skip the new tasks for now
        # TODO: safely remove this code when the task results are all added
        skip_avg_len = 0
        # if self.results['winogrande'] == 0.0:
        #     skip_avg_len += 1
        # if self.results['gsm8k'] == 0.0:
        #     skip_avg_len += 1
        # if self.results['eq_bench'] == 0.0:
        #     skip_avg_len += 1
        # if self.results['inst_follow'] == 0.0:
        #     skip_avg_len += 1
        # if self.results['harmlessness'] == 0.0:
        #     skip_avg_len += 1
        # if self.results['helpfulness'] == 0.0:
        #     skip_avg_len += 1

        average = sum(v for v in self.results.values() if v is not None) / (len(Tasks) - skip_avg_len)

        data_dict = {
            "eval_name": self.eval_name,  # not a column, just a save name
            AutoEvalColumn.precision.name: self.precision.value.name,
            AutoEvalColumn.model_type.name: self.model_type.value.name,
            AutoEvalColumn.merged.name: self.is_merge,
            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,  # + "🥦" if self.is_merge
            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
            AutoEvalColumn.architecture.name: self.architecture,
            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
            AutoEvalColumn.dummy.name: self.full_model,
            AutoEvalColumn.revision.name: self.revision,
            AutoEvalColumn.average.name: average,
            AutoEvalColumn.license.name: self.license,
            AutoEvalColumn.likes.name: self.likes,
            AutoEvalColumn.params.name: self.num_params,
            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
            AutoEvalColumn.flagged.name: self.flagged,
        }

        all_columns = [task.value for task in Tasks] + [rank.value for rank in Ranks]

        for a in all_columns:
            data_dict[a.col_name] = self.results[a.benchmark]
            if a.benchmark in ["daily", "quarterly"]:
                print(a.benchmark, self.results[a.benchmark], a.col_name)

        return data_dict
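
A worked example of the average above, with invented scores and len(Tasks) == 3: while the skip logic stays commented out, skip_avg_len remains 0, so zero-filled benchmarks still count toward the denominator:

    example_scores = {"task_a": 50.0, "task_b": 60.0, "task_c": 70.0}
    example_average = sum(example_scores.values()) / (3 - 0)  # == 60.0
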
def get_request_file_for_model(requests_path, model_name, precision):
    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
    request_files = os.path.join(
        requests_path,
        f"{model_name}_eval_request_*.json",
    )
    request_files = glob.glob(request_files)

    # Select the correct request file (precision)
    request_file = ""
    request_files = sorted(request_files, reverse=True)
    for tmp_request_file in request_files:
        with open(tmp_request_file, "r") as f:
            req_content = json.load(f)
            if (
                req_content["status"] in ["FINISHED"]
                and req_content["precision"] == precision.split(".")[-1]
            ):
                request_file = tmp_request_file
    return request_file
+
def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
|
234 |
+
"""From the path of the results folder root, extract all needed info for results"""
|
235 |
+
model_result_filepaths = []
|
236 |
+
|
237 |
+
for root, _, files in os.walk(results_path):
|
238 |
+
# We should only have json files in model results
|
239 |
+
if len(files) == 0 or any([not f.endswith(".json") for f in files]):
|
240 |
+
continue
|
241 |
+
|
242 |
+
# Sort the files by date
|
243 |
+
try:
|
244 |
+
files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
|
245 |
+
except dateutil.parser._parser.ParserError:
|
246 |
+
files = [files[-1]]
|
247 |
+
|
248 |
+
for file in files:
|
249 |
+
model_result_filepaths.append(os.path.join(root, file))
|
250 |
+
|
251 |
+
eval_results = {}
|
252 |
+
for model_result_filepath in model_result_filepaths:
|
253 |
+
# Creation of result
|
254 |
+
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
255 |
+
eval_result.update_with_request_file(requests_path)
|
256 |
+
|
257 |
+
# Store results of same eval together
|
258 |
+
eval_name = eval_result.eval_name
|
259 |
+
if eval_name in eval_results.keys():
|
260 |
+
eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
|
261 |
+
else:
|
262 |
+
eval_results[eval_name] = eval_result
|
263 |
+
|
264 |
+
results = []
|
265 |
+
for v in eval_results.values():
|
266 |
+
try:
|
267 |
+
v.to_dict() # we test if the dict version is complete
|
268 |
+
results.append(v)
|
269 |
+
except KeyError: # not all eval values present
|
270 |
+
continue
|
271 |
+
|
272 |
+
return results
|
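
A hedged usage sketch for this module as a whole, assuming the Space's eval-results and eval-queue folders (in the app these paths are passed in rather than hard-coded):

    from src.leaderboard.read_evals import get_raw_eval_results

    # Walk eval-results/, merge per-model JSON files, and attach request metadata
    eval_results = get_raw_eval_results("eval-results", "eval-queue")
    for res in eval_results:
        print(res.eval_name, res.results)
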
src/populate.py
ADDED
@@ -0,0 +1,70 @@
import json
import os

import pandas as pd

from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import AutoEvalColumn, EvalQueueColumn
from src.leaderboard.filter_models import filter_models
from src.leaderboard.read_evals import get_raw_eval_results


def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> tuple[list, pd.DataFrame]:
    raw_data = get_raw_eval_results(results_path, requests_path)
    all_data_json = [v.to_dict() for v in raw_data]
    # all_data_json.append(baseline_row)
    filter_models(all_data_json)

    df = pd.DataFrame.from_records(all_data_json)
    # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
    df = df.sort_values(by=["Daily Rank"], ascending=True)

    # print(df[AutoEvalColumn.average.name])
    try:
        df = df[cols].round(decimals=2)
    except Exception:
        pass

    # Filter out rows for which any of the benchmarks has not been produced
    try:
        df = df[has_no_nan_values(df, benchmark_cols)]
    except Exception:
        pass
    return raw_data, df

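A hedged usage sketch of get_leaderboard_df; COLS and BENCHMARK_COLS are assumed to be the column lists defined in this repo's src/display/utils.py, and the paths are the Space's data folders:

    from src.display.utils import COLS, BENCHMARK_COLS
    from src.populate import get_leaderboard_df

    raw_data, df = get_leaderboard_df("eval-results", "eval-queue", COLS, BENCHMARK_COLS)
    print(df.head())
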
def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, ...]:
    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
    all_evals = []

    for entry in entries:
        if ".json" in entry:
            file_path = os.path.join(save_path, entry)
            with open(file_path) as fp:
                data = json.load(fp)

            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
            data[EvalQueueColumn.revision.name] = data.get("revision", "main")

            all_evals.append(data)
        elif ".md" not in entry:
            # This is a folder of per-model request files
            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
            for sub_entry in sub_entries:
                file_path = os.path.join(save_path, entry, sub_entry)
                with open(file_path) as fp:
                    data = json.load(fp)

                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                all_evals.append(data)

    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
    failed_list = [e for e in all_evals if e["status"] == "FAILED"]
    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    df_failed = pd.DataFrame.from_records(failed_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols], df_failed[cols]
src/submission/__pycache__/check_validity.cpython-310.pyc
ADDED
Binary file (4.64 kB)

src/submission/__pycache__/submit.cpython-310.pyc
ADDED
Binary file (3.57 kB)