junkim100 committed
Commit 57dfc04 · 1 Parent(s): aebe308

Initial Commit

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. README.md +9 -5
  2. app.py +400 -0
  3. eval-queue/.gitattributes +55 -0
  4. eval-queue/01-ai/Yi-1.5-9B-32K_eval_request_False_float16_Original.json +14 -0
  5. eval-queue/BioMistral/BioMistral-7B_eval_request_False_float16_Original.json +15 -0
  6. eval-queue/EleutherAI/polyglot-ko-1.3b_eval_request_False_float16_Original.json +14 -0
  7. eval-queue/HuggingFaceH4/zephyr-7b-beta_eval_request_False_float16_Original.json +15 -0
  8. eval-queue/README.md +3 -0
  9. eval-queue/nlpai-lab/KULLM3_eval_request_False_float16_Original.json +15 -0
  10. eval-queue/x2bee/POLAR-14B-DPO-v1.3_eval_request_False_float16_Original.json +15 -0
  11. eval-queue/x2bee/POLAR-14B-DPO-v1.4_eval_request_False_float16_Original.json +15 -0
  12. eval-queue/x2bee/POLAR-14B-HES-DPO-v1.5_eval_request_False_float16_Original.json +15 -0
  13. eval-queue/x2bee/POLAR-14B-SON-SFT-v0.1_eval_request_False_float16_Original.json +15 -0
  14. eval-queue/x2bee/POLAR-14B-v0.2_eval_request_False_float16_Original.json +15 -0
  15. eval-queue/x2bee/POLAR-14B-v0.5_eval_request_False_float16_Original.json +15 -0
  16. eval-results/.gitattributes +55 -0
  17. eval-results/01-ai/Yi-1.5-9B-32K/result_2024_07_30 20:36:30.json +450 -0
  18. eval-results/BioMistral/BioMistral-7B/BioMistral_BioMistral-7B_result_2024-05-30 01_33_58.json +450 -0
  19. eval-results/EleutherAI/polyglot-ko-1.3b/EleutherAI_polyglot-ko-1.3b_result_2023-09-24 15_21_38.json +450 -0
  20. eval-results/HuggingFaceH4/.DS_Store +0 -0
  21. eval-results/HuggingFaceH4/zephyr-7b-beta/result.json +450 -0
  22. eval-results/README.md +3 -0
  23. eval-results/nlpai-lab/KULLM3/result.json +450 -0
  24. eval-results/x2bee/POLAR-14B-DPO-v1.3/result.json +450 -0
  25. eval-results/x2bee/POLAR-14B-DPO-v1.4/result.json +450 -0
  26. eval-results/x2bee/POLAR-14B-HES-DPO-v1.5/result.json +450 -0
  27. eval-results/x2bee/POLAR-14B-SON-SFT-v0.1/result.json +450 -0
  28. eval-results/x2bee/POLAR-14B-v0.2/result.json +450 -0
  29. eval-results/x2bee/POLAR-14B-v0.5/result.json +450 -0
  30. requirements.txt +19 -0
  31. scripts/create_request_file.py +107 -0
  32. scripts/update_request_files.py +82 -0
  33. src/__pycache__/envs.cpython-310.pyc +0 -0
  34. src/__pycache__/populate.cpython-310.pyc +0 -0
  35. src/display/__pycache__/about.cpython-310.pyc +0 -0
  36. src/display/__pycache__/css_html_js.cpython-310.pyc +0 -0
  37. src/display/__pycache__/formatting.cpython-310.pyc +0 -0
  38. src/display/__pycache__/utils.cpython-310.pyc +0 -0
  39. src/display/about.py +84 -0
  40. src/display/css_html_js.py +84 -0
  41. src/display/formatting.py +40 -0
  42. src/display/utils.py +164 -0
  43. src/envs.py +32 -0
  44. src/leaderboard/__pycache__/filter_models.cpython-310.pyc +0 -0
  45. src/leaderboard/__pycache__/read_evals.cpython-310.pyc +0 -0
  46. src/leaderboard/filter_models.py +51 -0
  47. src/leaderboard/read_evals.py +272 -0
  48. src/populate.py +70 -0
  49. src/submission/__pycache__/check_validity.cpython-310.pyc +0 -0
  50. src/submission/__pycache__/submit.cpython-310.pyc +0 -0
README.md CHANGED
@@ -1,13 +1,17 @@
  ---
  title: Self Improving Leaderboard
- emoji: 🦀
+ emoji: 🔄
- colorFrom: purple
+ colorFrom: green
- colorTo: green
+ colorTo: indigo
  sdk: gradio
- sdk_version: 4.40.0
+ sdk_version: 4.36.0
  app_file: app.py
- pinned: false
+ pinned: true
  license: apache-2.0
+ duplicated_from: upstage/open-ko-llm-leaderboard
+ fullWidth: true
+ tags:
+ - leaderboard
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,400 @@
+ import gradio as gr
+ import pandas as pd
+ from apscheduler.schedulers.background import BackgroundScheduler
+ from huggingface_hub import snapshot_download
+ from gradio_space_ci import configure_space_ci # FOR CI
+
+ from src.display.about import (
+     EVALUATION_QUEUE_TEXT,
+     INTRODUCTION_TEXT,
+     LLM_BENCHMARKS_TEXT,
+     TITLE,
+ )
+ from src.display.css_html_js import custom_css
+ from src.display.utils import (
+     BENCHMARK_COLS,
+     COLS,
+     EVAL_COLS,
+     EVAL_TYPES,
+     NUMERIC_INTERVALS,
+     TYPES,
+     AutoEvalColumn,
+     ModelType,
+     fields,
+     WeightType,
+     Precision
+ )
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
+ from src.submission.submit import add_new_eval
+ from src.tools.collections import update_collections
+ from src.tools.plots import (
+     create_metric_plot_obj,
+     create_plot_df,
+     create_scores_df,
+ )
+
+
+ def restart_space():
+     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
+
+ try:
+     print(EVAL_REQUESTS_PATH)
+     snapshot_download(
+         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+     )
+ except Exception:
+     restart_space()
+ try:
+     print(EVAL_RESULTS_PATH)
+     snapshot_download(
+         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+     )
+ except Exception:
+     restart_space()
+
+
+ _, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+ leaderboard_df = original_df.copy()
+
+ (
+     finished_eval_queue_df,
+     running_eval_queue_df,
+     pending_eval_queue_df,
+     failed_eval_queue_df,
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+
+ # Searching and filtering
+ def update_table(
+     hidden_df: pd.DataFrame,
+     columns: list,
+     type_query: list,
+     precision_query: str,
+     size_query: list,
+     show_deleted: bool,
+     show_merges: bool,
+     show_flagged: bool,
+     query: str,
+ ):
+     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query)
+     filtered_df = filter_queries(query, filtered_df)
+     df = select_columns(filtered_df, columns)
+     return df
+
+ def quarter_update_table(
+     hidden_df: pd.DataFrame,
+     columns: list,
+     type_query: list,
+     precision_query: str,
+     size_query: list,
+     show_deleted: bool,
+     show_merges: bool,
+     show_flagged: bool,
+     query: str,
+ ):
+     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query)
+     filtered_df = filter_queries(query, filtered_df)
+     df = quarter_select_columns(filtered_df, columns)
+     return df
+
+
+ def load_query(request: gr.Request):  # triggered only once at startup => read query parameter if it exists
+     query = request.query_params.get("query") or ""
+     return query, query  # return one for the "search_bar", one for a hidden component that triggers a reload only if value has changed
+
+
+ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
+     return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
+
+
+ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
+     always_here_cols = [
+         AutoEvalColumn.model_type_symbol.name,
+         AutoEvalColumn.model.name,
+     ]
+     # We use COLS to maintain sorting
+     filtered_df = df[
+         always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
+     ]
+     return filtered_df
+
+
+ def filter_queries(query: str, filtered_df: pd.DataFrame):
+     """Added by Abishek"""
+     final_df = []
+     if query != "":
+         queries = [q.strip() for q in query.split(";")]
+         for _q in queries:
+             _q = _q.strip()
+             if _q != "":
+                 temp_filtered_df = search_table(filtered_df, _q)
+                 if len(temp_filtered_df) > 0:
+                     final_df.append(temp_filtered_df)
+         if len(final_df) > 0:
+             filtered_df = pd.concat(final_df)
+             filtered_df = filtered_df.drop_duplicates(
+                 subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
+             )
+
+     return filtered_df
+
+
+ def filter_models(
+     df: pd.DataFrame, type_query: list, size_query: list, precision_query: list
+ ) -> pd.DataFrame:
+
+     type_emoji = [t[0] for t in type_query]
+     df = df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
+     df = df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
+     numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
+     params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
+     mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
+     df = df.loc[mask]
+
+     return df
+
+
+ leaderboard_df = filter_models(leaderboard_df, [t.to_str(" : ") for t in ModelType], list(NUMERIC_INTERVALS.keys()), [i.value.name for i in Precision])
+
+ print(leaderboard_df)
+
+ demo = gr.Blocks(css=custom_css)
+ with demo:
+     gr.HTML(TITLE)
+     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("🔄 Self-Improving Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+             with gr.Row():
+                 with gr.Column():
+                     with gr.Row():
+                         search_bar = gr.Textbox(
+                             placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                             show_label=False,
+                             elem_id="search-bar",
+                         )
+                     with gr.Row():
+                         shown_columns = gr.CheckboxGroup(
+                             choices=[
+                                 c.name
+                                 for c in fields(AutoEvalColumn)
+                                 if not c.hidden and not c.never_hidden and not c.dummy
+                             ],
+                             value=[
+                                 c.name
+                                 for c in fields(AutoEvalColumn)
+                                 if c.displayed_by_default and not c.hidden and not c.never_hidden
+                             ],
+                             label="Select columns to show",
+                             elem_id="column-select",
+                             interactive=True,
+                         )
+
+                 with gr.Column(min_width=320):
+                     #with gr.Box(elem_id="box-filter"):
+                     filter_columns_type = gr.CheckboxGroup(
+                         label="Model types",
+                         choices=[t.to_str() for t in ModelType],
+                         value=[t.to_str() for t in ModelType],
+                         interactive=True,
+                         elem_id="filter-columns-type",
+                     )
+                     filter_columns_precision = gr.CheckboxGroup(
+                         label="Precision",
+                         choices=[i.value.name for i in Precision],
+                         value=[i.value.name for i in Precision],
+                         interactive=True,
+                         elem_id="filter-columns-precision",
+                     )
+                     filter_columns_size = gr.CheckboxGroup(
+                         label="Model sizes (in billions of parameters)",
+                         choices=list(NUMERIC_INTERVALS.keys()),
+                         value=list(NUMERIC_INTERVALS.keys()),
+                         interactive=True,
+                         elem_id="filter-columns-size",
+                     )
+
+             leaderboard_table = gr.components.Dataframe(
+                 value=leaderboard_df[
+                     [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+                     + shown_columns.value
+                     + [AutoEvalColumn.dummy.name]
+                 ],
+                 headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                 datatype=TYPES,
+                 elem_id="leaderboard-table",
+                 interactive=False,
+                 visible=True,
+                 #column_widths=["2%", "33%"]
+             )
+
+             # Dummy leaderboard for handling the case when the user uses backspace key
+             hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                 value=original_df[COLS],
+                 headers=COLS,
+                 datatype=TYPES,
+                 visible=False,
+             )
+             search_bar.submit(
+                 update_table,
+                 [
+                     hidden_leaderboard_table_for_search,
+                     shown_columns,
+                     filter_columns_type,
+                     filter_columns_precision,
+                     filter_columns_size,
+                     search_bar,
+                 ],
+                 leaderboard_table,
+             )
+
+             # Define a hidden component that will trigger a reload only if a query parameter has been set
+             hidden_search_bar = gr.Textbox(value="", visible=False)
+             hidden_search_bar.change(
+                 update_table,
+                 [
+                     hidden_leaderboard_table_for_search,
+                     shown_columns,
+                     filter_columns_type,
+                     filter_columns_precision,
+                     filter_columns_size,
+                     search_bar,
+                 ],
+                 leaderboard_table,
+             )
+             # Check query parameter once at startup and update search bar + hidden component
+             demo.load(load_query, inputs=[], outputs=[search_bar, hidden_search_bar])
+
+             for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size]:
+                 selector.change(
+                     update_table,
+                     [
+                         hidden_leaderboard_table_for_search,
+                         shown_columns,
+                         filter_columns_type,
+                         filter_columns_precision,
+                         filter_columns_size,
+                         search_bar,
+                     ],
+                     leaderboard_table,
+                     queue=True,
+                 )
+
+         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
+             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=2):
+             with gr.Column():
+                 with gr.Row():
+                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                 with gr.Column():
+                     with gr.Accordion(
+                         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             finished_eval_table = gr.components.Dataframe(
+                                 value=finished_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+                     with gr.Accordion(
+                         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             running_eval_table = gr.components.Dataframe(
+                                 value=running_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+
+                     with gr.Accordion(
+                         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             pending_eval_table = gr.components.Dataframe(
+                                 value=pending_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+                     with gr.Accordion(
+                         f"❌ Failed Evaluations ({len(failed_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             pending_eval_table = gr.components.Dataframe(
+                                 value=failed_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+             with gr.Row():
+                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+             with gr.Row():
+                 with gr.Column():
+                     model_name_textbox = gr.Textbox(label="Model name")
+                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                     private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
+                     model_type = gr.Dropdown(
+                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                         label="Model type",
+                         multiselect=False,
+                         value=ModelType.IFT.to_str(" : "),
+                         interactive=True,
+                     )
+
+                 with gr.Column():
+                     precision = gr.Dropdown(
+                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                         label="Precision",
+                         multiselect=False,
+                         value="float16",
+                         interactive=True,
+                     )
+                     weight_type = gr.Dropdown(
+                         choices=[i.value.name for i in WeightType],
+                         label="Weights type",
+                         multiselect=False,
+                         value="Original",
+                         interactive=True,
+                     )
+                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+             submit_button = gr.Button("Submit Evaluation!")
+             submission_result = gr.Markdown()
+             submit_button.click(
+                 add_new_eval,
+                 [
+                     model_name_textbox,
+                     base_model_name_textbox,
+                     revision_name_textbox,
+                     precision,
+                     private,
+                     weight_type,
+                     model_type,
+                 ],
+                 submission_result,
+             )
+
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, "interval", seconds=1800)
+ scheduler.start()
+
+ # Launches both the space and its CI
+ configure_space_ci(
+     demo.queue(default_concurrency_limit=40),
+     trusted_authors=[],  # add manually trusted authors
+     private="True",  # ephemeral spaces will have same visibility as the main space. Otherwise, set to `True` or `False` explicitly.
+     variables={},  # We overwrite HF_HOME as tmp CI spaces will have no cache
+     secrets=["HF_TOKEN", "H4_TOKEN"],  # which secret do I want to copy from the main space? Can be a `List[str]`.
+     hardware=None,  # "cpu-basic" by default. Otherwise set to "auto" to have same hardware as the main space or any valid string value.
+     storage=None,  # no storage by default. Otherwise set to "auto" to have same storage as the main space or any valid string value.
+ ).launch()
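Editor's note: the size filter in `filter_models` above keeps a row only when its parameter count falls inside one of the `NUMERIC_INTERVALS` buckets selected in the UI, via `pd.IntervalIndex.contains`. Below is a minimal, self-contained sketch of that pattern; the bucket labels, interval bounds, and column names are illustrative assumptions, not the leaderboard's actual definitions (which live in `src/display/utils.py`).

```python
import pandas as pd

# Hypothetical size buckets keyed by the label shown in the checkbox group.
# The real NUMERIC_INTERVALS is defined in src/display/utils.py; these values are assumptions.
NUMERIC_INTERVALS = {
    "~1.5": pd.Interval(0, 3, closed="right"),
    "~7": pd.Interval(3, 10, closed="right"),
    "~14": pd.Interval(10, 20, closed="right"),
}

def filter_by_size(df: pd.DataFrame, size_query: list) -> pd.DataFrame:
    """Keep rows whose parameter count falls inside any selected interval (same idea as filter_models)."""
    intervals = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
    params = pd.to_numeric(df["#Params (B)"], errors="coerce")  # non-numeric values become NaN and never match
    mask = params.apply(lambda x: any(intervals.contains(x)))
    return df.loc[mask]

toy = pd.DataFrame({"model": ["a", "b", "c"], "#Params (B)": [1.4, 7.2, 14.2]})
print(filter_by_size(toy, ["~7", "~14"]))  # keeps b and c
```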
eval-queue/.gitattributes ADDED
@@ -0,0 +1,55 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ # Audio files - uncompressed
+ *.pcm filter=lfs diff=lfs merge=lfs -text
+ *.sam filter=lfs diff=lfs merge=lfs -text
+ *.raw filter=lfs diff=lfs merge=lfs -text
+ # Audio files - compressed
+ *.aac filter=lfs diff=lfs merge=lfs -text
+ *.flac filter=lfs diff=lfs merge=lfs -text
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
+ *.ogg filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
+ # Image files - uncompressed
+ *.bmp filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.tiff filter=lfs diff=lfs merge=lfs -text
+ # Image files - compressed
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
+ *.webp filter=lfs diff=lfs merge=lfs -text
eval-queue/01-ai/Yi-1.5-9B-32K_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
+ {
+     "model": "01-ai/Yi-1.5-9B-32K",
+     "base_model": "",
+     "revision": "c0239dbc923b8a2b5ca849763bdd592d39c60850",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2024-07-29T13:10:13Z",
+     "model_type": "\ud83d\udfe2 : pretrained",
+     "likes": 18,
+     "params": 8.829,
+     "license": "apache-2.0"
+ }
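Editor's note: request files like the one above are what `get_evaluation_queue_df` walks to build the finished/running/pending/failed tables in `app.py`. The sketch below shows one way to read such files into a dataframe, assuming only the fields visible in this commit; it is an illustration, not the actual `src/populate.py` implementation.

```python
import glob
import json
import os

import pandas as pd

def load_eval_requests(requests_dir: str) -> pd.DataFrame:
    """Collect every eval-request JSON under the queue directory into one dataframe (illustrative sketch)."""
    rows = []
    for path in glob.glob(os.path.join(requests_dir, "**", "*.json"), recursive=True):
        with open(path, encoding="utf-8") as f:
            entry = json.load(f)
        rows.append(
            {
                "model": entry.get("model"),
                "revision": entry.get("revision", "main"),
                "precision": entry.get("precision"),
                "weight_type": entry.get("weight_type"),
                "status": entry.get("status", "PENDING"),
                "submitted_time": entry.get("submitted_time"),
            }
        )
    return pd.DataFrame(rows)

# Example usage: df = load_eval_requests("eval-queue"); finished = df[df["status"] == "FINISHED"]
```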
eval-queue/BioMistral/BioMistral-7B_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "model": "BioMistral/BioMistral-7B",
+     "base_model": "",
+     "revision": "main",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2024-05-30 01:33:58",
+     "model_type": "\u2b55 : instruction-tuned",
+     "job_id": "2031",
+     "params": 7.0,
+     "likes": 354,
+     "license": "apache-2.0"
+ }
eval-queue/EleutherAI/polyglot-ko-1.3b_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
+ {
+     "model": "EleutherAI/polyglot-ko-1.3b",
+     "base_model": "",
+     "revision": "main",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2024-07-25T11:04:40Z",
+     "model_type": "\ud83d\udfe2 : pretrained",
+     "likes": 71,
+     "params": 1.432,
+     "license": "apache-2.0"
+ }
eval-queue/HuggingFaceH4/zephyr-7b-beta_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "model": "HuggingFaceH4/zephyr-7b-beta",
+     "base_model": "",
+     "revision": "main",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2023-11-01 04:21:47",
+     "model_type": "\u2b55 : instruction-tuned",
+     "job_id": "401",
+     "params": 7.242,
+     "likes": 1162,
+     "license": "mit"
+ }
eval-queue/README.md ADDED
@@ -0,0 +1,3 @@
+ ---
+ license: apache-2.0
+ ---
eval-queue/nlpai-lab/KULLM3_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "model": "nlpai-lab/KULLM3",
+     "base_model": "",
+     "revision": "main",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2024-04-08 05:16:47",
+     "model_type": "\u2b55 : instruction-tuned",
+     "job_id": "1751",
+     "params": 10.732000350952148,
+     "likes": 13,
+     "license": "cc-by-nc-4.0"
+ }
eval-queue/x2bee/POLAR-14B-DPO-v1.3_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "model": "x2bee/POLAR-14B-DPO-v1.3",
+     "base_model": "",
+     "revision": "main",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2024-05-23 11:59:50",
+     "model_type": "\u2b55 : instruction-tuned",
+     "job_id": "1987",
+     "params": 14.220999717712402,
+     "likes": 0,
+     "license": "apache-2.0"
+ }
eval-queue/x2bee/POLAR-14B-DPO-v1.4_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "model": "x2bee/POLAR-14B-DPO-v1.4",
+     "base_model": "",
+     "revision": "main",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2024-05-27 15:02:47",
+     "model_type": "\u2b55 : instruction-tuned",
+     "job_id": "2004",
+     "params": 14.220999717712402,
+     "likes": 0,
+     "license": "apache-2.0"
+ }
eval-queue/x2bee/POLAR-14B-HES-DPO-v1.5_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "model": "x2bee/POLAR-14B-HES-DPO-v1.5",
+     "base_model": "",
+     "revision": "main",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2024-05-29 23:53:33",
+     "model_type": "\u2b55 : instruction-tuned",
+     "job_id": "2029",
+     "params": 14.220999717712402,
+     "likes": 0,
+     "license": "apache-2.0"
+ }
eval-queue/x2bee/POLAR-14B-SON-SFT-v0.1_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "model": "x2bee/POLAR-14B-SON-SFT-v0.1",
+     "base_model": "x2bee/POLAR-14B-v0.2",
+     "revision": "main",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2024-05-27 13:52:58",
+     "model_type": "\u2b55 : instruction-tuned",
+     "job_id": "2003",
+     "params": 14.220999717712402,
+     "likes": 0,
+     "license": "apache-2.0"
+ }
eval-queue/x2bee/POLAR-14B-v0.2_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "model": "x2bee/POLAR-14B-v0.2",
+     "base_model": "",
+     "revision": "main",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2024-05-02 00:34:33",
+     "model_type": "\ud83d\udfe2 : pretrained",
+     "job_id": "1874",
+     "params": 14.220999717712402,
+     "likes": 0,
+     "license": "apache-2.0"
+ }
eval-queue/x2bee/POLAR-14B-v0.5_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "model": "x2bee/POLAR-14B-v0.5",
+     "base_model": "",
+     "revision": "main",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2024-06-05 00:49:59",
+     "model_type": "\ud83d\udfe2 : pretrained",
+     "job_id": "2041",
+     "params": 14.220999717712402,
+     "likes": 0,
+     "license": "apache-2.0"
+ }
eval-results/.gitattributes ADDED
@@ -0,0 +1,55 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ # Audio files - uncompressed
+ *.pcm filter=lfs diff=lfs merge=lfs -text
+ *.sam filter=lfs diff=lfs merge=lfs -text
+ *.raw filter=lfs diff=lfs merge=lfs -text
+ # Audio files - compressed
+ *.aac filter=lfs diff=lfs merge=lfs -text
+ *.flac filter=lfs diff=lfs merge=lfs -text
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
+ *.ogg filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
+ # Image files - uncompressed
+ *.bmp filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.tiff filter=lfs diff=lfs merge=lfs -text
+ # Image files - compressed
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
+ *.webp filter=lfs diff=lfs merge=lfs -text
eval-results/01-ai/Yi-1.5-9B-32K/result_2024_07_30 20:36:30.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 7
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 7
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.29948805460750855,
11
+ "acc_stderr": 0.013385021637313567,
12
+ "acc_norm": 0.3506825938566553,
13
+ "acc_norm_stderr": 0.013944635930726089
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.3333001394144593,
17
+ "acc_stderr": 0.004704293898729902,
18
+ "acc_norm": 0.4137621987651862,
19
+ "acc_norm_stderr": 0.004915003499517831
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.47953216374269003,
23
+ "acc_stderr": 0.0383161053282193,
24
+ "acc_norm": 0.47953216374269003,
25
+ "acc_norm_stderr": 0.0383161053282193
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.5631067961165048,
29
+ "acc_stderr": 0.049111471073657764,
30
+ "acc_norm": 0.5631067961165048,
31
+ "acc_norm_stderr": 0.049111471073657764
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.47509578544061304,
35
+ "acc_stderr": 0.01785777070490102,
36
+ "acc_norm": 0.47509578544061304,
37
+ "acc_norm_stderr": 0.01785777070490102
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.28888888888888886,
41
+ "acc_stderr": 0.0391545063041425,
42
+ "acc_norm": 0.28888888888888886,
43
+ "acc_norm_stderr": 0.0391545063041425
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.31,
47
+ "acc_stderr": 0.04648231987117316,
48
+ "acc_norm": 0.31,
49
+ "acc_norm_stderr": 0.04648231987117316
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.46808510638297873,
53
+ "acc_stderr": 0.03261936918467382,
54
+ "acc_norm": 0.46808510638297873,
55
+ "acc_norm_stderr": 0.03261936918467382
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.45180722891566266,
59
+ "acc_stderr": 0.03874371556587953,
60
+ "acc_norm": 0.45180722891566266,
61
+ "acc_norm_stderr": 0.03874371556587953
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.47266881028938906,
65
+ "acc_stderr": 0.028355633568328188,
66
+ "acc_norm": 0.47266881028938906,
67
+ "acc_norm_stderr": 0.028355633568328188
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.45739910313901344,
71
+ "acc_stderr": 0.033435777055830646,
72
+ "acc_norm": 0.45739910313901344,
73
+ "acc_norm_stderr": 0.033435777055830646
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.5267175572519084,
77
+ "acc_stderr": 0.04379024936553894,
78
+ "acc_norm": 0.5267175572519084,
79
+ "acc_norm_stderr": 0.04379024936553894
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.39,
83
+ "acc_stderr": 0.04902071300001975,
84
+ "acc_norm": 0.39,
85
+ "acc_norm_stderr": 0.04902071300001975
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.5555555555555556,
89
+ "acc_stderr": 0.035402943770953675,
90
+ "acc_norm": 0.5555555555555556,
91
+ "acc_norm_stderr": 0.035402943770953675
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.5724137931034483,
95
+ "acc_stderr": 0.04122737111370332,
96
+ "acc_norm": 0.5724137931034483,
97
+ "acc_norm_stderr": 0.04122737111370332
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.3137254901960784,
101
+ "acc_stderr": 0.04617034827006716,
102
+ "acc_norm": 0.3137254901960784,
103
+ "acc_norm_stderr": 0.04617034827006716
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.5,
107
+ "acc_stderr": 0.032478490123081544,
108
+ "acc_norm": 0.5,
109
+ "acc_norm_stderr": 0.032478490123081544
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.47692307692307695,
113
+ "acc_stderr": 0.025323990861736125,
114
+ "acc_norm": 0.47692307692307695,
115
+ "acc_norm_stderr": 0.025323990861736125
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.53,
119
+ "acc_stderr": 0.05016135580465919,
120
+ "acc_norm": 0.53,
121
+ "acc_norm_stderr": 0.05016135580465919
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.37,
125
+ "acc_stderr": 0.048523658709391,
126
+ "acc_norm": 0.37,
127
+ "acc_norm_stderr": 0.048523658709391
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.5740740740740741,
131
+ "acc_stderr": 0.047803436269367894,
132
+ "acc_norm": 0.5740740740740741,
133
+ "acc_norm_stderr": 0.047803436269367894
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.4187192118226601,
137
+ "acc_stderr": 0.03471192860518468,
138
+ "acc_norm": 0.4187192118226601,
139
+ "acc_norm_stderr": 0.03471192860518468
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.47419354838709676,
143
+ "acc_stderr": 0.02840609505765332,
144
+ "acc_norm": 0.47419354838709676,
145
+ "acc_norm_stderr": 0.02840609505765332
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.6752136752136753,
149
+ "acc_stderr": 0.03067902276549883,
150
+ "acc_norm": 0.6752136752136753,
151
+ "acc_norm_stderr": 0.03067902276549883
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.44150943396226416,
155
+ "acc_stderr": 0.030561590426731833,
156
+ "acc_norm": 0.44150943396226416,
157
+ "acc_norm_stderr": 0.030561590426731833
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.4727272727272727,
161
+ "acc_stderr": 0.04782001791380063,
162
+ "acc_norm": 0.4727272727272727,
163
+ "acc_norm_stderr": 0.04782001791380063
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.4185185185185185,
167
+ "acc_stderr": 0.030078013075022066,
168
+ "acc_norm": 0.4185185185185185,
169
+ "acc_norm_stderr": 0.030078013075022066
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.304635761589404,
173
+ "acc_stderr": 0.03757949922943343,
174
+ "acc_norm": 0.304635761589404,
175
+ "acc_norm_stderr": 0.03757949922943343
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.6069651741293532,
179
+ "acc_stderr": 0.0345368246603156,
180
+ "acc_norm": 0.6069651741293532,
181
+ "acc_norm_stderr": 0.0345368246603156
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.4046242774566474,
185
+ "acc_stderr": 0.03742461193887248,
186
+ "acc_norm": 0.4046242774566474,
187
+ "acc_norm_stderr": 0.03742461193887248
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.5476190476190477,
191
+ "acc_stderr": 0.02563425811555495,
192
+ "acc_norm": 0.5476190476190477,
193
+ "acc_norm_stderr": 0.02563425811555495
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.3472222222222222,
197
+ "acc_stderr": 0.039812405437178615,
198
+ "acc_norm": 0.3472222222222222,
199
+ "acc_norm_stderr": 0.039812405437178615
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.33,
203
+ "acc_stderr": 0.04725815626252605,
204
+ "acc_norm": 0.33,
205
+ "acc_norm_stderr": 0.04725815626252605
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.57,
209
+ "acc_stderr": 0.04975698519562426,
210
+ "acc_norm": 0.57,
211
+ "acc_norm_stderr": 0.04975698519562426
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.49710982658959535,
215
+ "acc_stderr": 0.026918645383239015,
216
+ "acc_norm": 0.49710982658959535,
217
+ "acc_norm_stderr": 0.026918645383239015
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.5276073619631901,
221
+ "acc_stderr": 0.03922378290610991,
222
+ "acc_norm": 0.5276073619631901,
223
+ "acc_norm_stderr": 0.03922378290610991
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.49691358024691357,
227
+ "acc_stderr": 0.027820214158594377,
228
+ "acc_norm": 0.49691358024691357,
229
+ "acc_norm_stderr": 0.027820214158594377
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.45,
233
+ "acc_stderr": 0.05,
234
+ "acc_norm": 0.45,
235
+ "acc_norm_stderr": 0.05
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.49222797927461137,
239
+ "acc_stderr": 0.03608003225569654,
240
+ "acc_norm": 0.49222797927461137,
241
+ "acc_norm_stderr": 0.03608003225569654
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.41228070175438597,
245
+ "acc_stderr": 0.046306532033665956,
246
+ "acc_norm": 0.41228070175438597,
247
+ "acc_norm_stderr": 0.046306532033665956
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.5027522935779817,
251
+ "acc_stderr": 0.02143699835976532,
252
+ "acc_norm": 0.5027522935779817,
253
+ "acc_norm_stderr": 0.02143699835976532
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.40476190476190477,
257
+ "acc_stderr": 0.04390259265377561,
258
+ "acc_norm": 0.40476190476190477,
259
+ "acc_norm_stderr": 0.04390259265377561
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.49019607843137253,
263
+ "acc_stderr": 0.028624412550167958,
264
+ "acc_norm": 0.49019607843137253,
265
+ "acc_norm_stderr": 0.028624412550167958
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.5,
269
+ "acc_stderr": 0.050251890762960605,
270
+ "acc_norm": 0.5,
271
+ "acc_norm_stderr": 0.050251890762960605
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.7355371900826446,
275
+ "acc_stderr": 0.04026187527591205,
276
+ "acc_norm": 0.7355371900826446,
277
+ "acc_norm_stderr": 0.04026187527591205
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.45394736842105265,
281
+ "acc_stderr": 0.04051646342874142,
282
+ "acc_norm": 0.45394736842105265,
283
+ "acc_norm_stderr": 0.04051646342874142
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.39705882352941174,
287
+ "acc_stderr": 0.019794488900024113,
288
+ "acc_norm": 0.39705882352941174,
289
+ "acc_norm_stderr": 0.019794488900024113
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.40070921985815605,
293
+ "acc_stderr": 0.029233465745573086,
294
+ "acc_norm": 0.40070921985815605,
295
+ "acc_norm_stderr": 0.029233465745573086
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.39285714285714285,
299
+ "acc_stderr": 0.04635550135609976,
300
+ "acc_norm": 0.39285714285714285,
301
+ "acc_norm_stderr": 0.04635550135609976
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.4675925925925926,
305
+ "acc_stderr": 0.034028015813589656,
306
+ "acc_norm": 0.4675925925925926,
307
+ "acc_norm_stderr": 0.034028015813589656
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.3329608938547486,
311
+ "acc_stderr": 0.015761716178397552,
312
+ "acc_norm": 0.3329608938547486,
313
+ "acc_norm_stderr": 0.015761716178397552
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.43,
317
+ "acc_stderr": 0.049756985195624284,
318
+ "acc_norm": 0.43,
319
+ "acc_norm_stderr": 0.049756985195624284
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.76,
323
+ "acc_stderr": 0.042923469599092816,
324
+ "acc_norm": 0.76,
325
+ "acc_norm_stderr": 0.042923469599092816
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.35294117647058826,
329
+ "acc_stderr": 0.029029422815681404,
330
+ "acc_norm": 0.35294117647058826,
331
+ "acc_norm_stderr": 0.029029422815681404
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.6163265306122448,
335
+ "acc_stderr": 0.031130880396235943,
336
+ "acc_norm": 0.6163265306122448,
337
+ "acc_norm_stderr": 0.031130880396235943
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.5654008438818565,
341
+ "acc_stderr": 0.03226759995510145,
342
+ "acc_norm": 0.5654008438818565,
343
+ "acc_norm_stderr": 0.03226759995510145
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.36571056062581486,
347
+ "acc_stderr": 0.012301028188840567,
348
+ "acc_norm": 0.36571056062581486,
349
+ "acc_norm_stderr": 0.012301028188840567
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.4852941176470588,
353
+ "acc_stderr": 0.03507793834791324,
354
+ "acc_norm": 0.4852941176470588,
355
+ "acc_norm_stderr": 0.03507793834791324
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.5151515151515151,
359
+ "acc_stderr": 0.03902551007374448,
360
+ "acc_norm": 0.5151515151515151,
361
+ "acc_norm_stderr": 0.03902551007374448
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.2937576499388005,
365
+ "mc1_stderr": 0.015945068581236614,
366
+ "mc2": 0.4670848140389129,
367
+ "mc2_stderr": 0.01585178282587417
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.47107438016528924,
371
+ "acc_stderr": 0.017161563949916348,
372
+ "acc_norm": 0.5171192443919717,
373
+ "acc_norm_stderr": 0.017180275246085626
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "01-ai/Yi-1.5-9B-32K",
442
+ "model_sha": "c0239dbc923b8a2b5ca849763bdd592d39c60850",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
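Editor's note: result files like the one above store one `acc`/`acc_norm` (or `mc1`/`mc2`) block per harness task, which the leaderboard rolls up into its score columns. The sketch below computes a simple macro-average over the `harness|...` entries of a single result file; the exact aggregation performed by `get_leaderboard_df` may differ, so treat this as an assumption-laden illustration rather than the leaderboard's formula.

```python
import json

def average_accuracy(result_path: str) -> float:
    """Macro-average acc_norm (falling back to acc, then mc2) over the harness|... tasks of one result file."""
    with open(result_path, encoding="utf-8") as f:
        results = json.load(f)["results"]
    scores = []
    for task, metrics in results.items():
        if not task.startswith("harness|"):
            continue  # skip the daily/quarterly bookkeeping entries
        value = metrics.get("acc_norm", metrics.get("acc", metrics.get("mc2")))
        if value is not None:
            scores.append(value)
    return sum(scores) / len(scores) if scores else float("nan")

# Example usage (path taken from this commit):
# average_accuracy("eval-results/01-ai/Yi-1.5-9B-32K/result_2024_07_30 20:36:30.json")
```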
eval-results/BioMistral/BioMistral-7B/BioMistral_BioMistral-7B_result_2024-05-30 01_33_58.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 10
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 10
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.257679180887372,
11
+ "acc_stderr": 0.012780770562768416,
12
+ "acc_norm": 0.3122866894197952,
13
+ "acc_norm_stderr": 0.013542598541688065
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.3229436367257518,
17
+ "acc_stderr": 0.004666457279979418,
18
+ "acc_norm": 0.39255128460466043,
19
+ "acc_norm_stderr": 0.004873203269366306
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.34502923976608185,
23
+ "acc_stderr": 0.036459813773888065,
24
+ "acc_norm": 0.34502923976608185,
25
+ "acc_norm_stderr": 0.036459813773888065
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.4368932038834951,
29
+ "acc_stderr": 0.04911147107365778,
30
+ "acc_norm": 0.4368932038834951,
31
+ "acc_norm_stderr": 0.04911147107365778
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.3780332056194125,
35
+ "acc_stderr": 0.017339844462104625,
36
+ "acc_norm": 0.3780332056194125,
37
+ "acc_norm_stderr": 0.017339844462104625
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.3037037037037037,
41
+ "acc_stderr": 0.039725528847851355,
42
+ "acc_norm": 0.3037037037037037,
43
+ "acc_norm_stderr": 0.039725528847851355
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.37,
47
+ "acc_stderr": 0.04852365870939099,
48
+ "acc_norm": 0.37,
49
+ "acc_norm_stderr": 0.04852365870939099
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.28085106382978725,
53
+ "acc_stderr": 0.02937917046412482,
54
+ "acc_norm": 0.28085106382978725,
55
+ "acc_norm_stderr": 0.02937917046412482
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.3373493975903614,
59
+ "acc_stderr": 0.03680783690727581,
60
+ "acc_norm": 0.3373493975903614,
61
+ "acc_norm_stderr": 0.03680783690727581
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.3954983922829582,
65
+ "acc_stderr": 0.027770918531427838,
66
+ "acc_norm": 0.3954983922829582,
67
+ "acc_norm_stderr": 0.027770918531427838
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.34977578475336324,
71
+ "acc_stderr": 0.03200736719484503,
72
+ "acc_norm": 0.34977578475336324,
73
+ "acc_norm_stderr": 0.03200736719484503
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.3969465648854962,
77
+ "acc_stderr": 0.04291135671009224,
78
+ "acc_norm": 0.3969465648854962,
79
+ "acc_norm_stderr": 0.04291135671009224
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.42,
83
+ "acc_stderr": 0.049604496374885836,
84
+ "acc_norm": 0.42,
85
+ "acc_norm_stderr": 0.049604496374885836
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.4292929292929293,
89
+ "acc_stderr": 0.03526552724601199,
90
+ "acc_norm": 0.4292929292929293,
91
+ "acc_norm_stderr": 0.03526552724601199
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.4,
95
+ "acc_stderr": 0.04082482904638628,
96
+ "acc_norm": 0.4,
97
+ "acc_norm_stderr": 0.04082482904638628
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.30392156862745096,
101
+ "acc_stderr": 0.045766654032077636,
102
+ "acc_norm": 0.30392156862745096,
103
+ "acc_norm_stderr": 0.045766654032077636
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.40336134453781514,
107
+ "acc_stderr": 0.031866081214088314,
108
+ "acc_norm": 0.40336134453781514,
109
+ "acc_norm_stderr": 0.031866081214088314
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.40512820512820513,
113
+ "acc_stderr": 0.024890471769938145,
114
+ "acc_norm": 0.40512820512820513,
115
+ "acc_norm_stderr": 0.024890471769938145
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.48,
119
+ "acc_stderr": 0.050211673156867795,
120
+ "acc_norm": 0.48,
121
+ "acc_norm_stderr": 0.050211673156867795
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.32,
125
+ "acc_stderr": 0.04688261722621505,
126
+ "acc_norm": 0.32,
127
+ "acc_norm_stderr": 0.04688261722621505
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.49074074074074076,
131
+ "acc_stderr": 0.04832853553437055,
132
+ "acc_norm": 0.49074074074074076,
133
+ "acc_norm_stderr": 0.04832853553437055
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.37438423645320196,
137
+ "acc_stderr": 0.03405155380561952,
138
+ "acc_norm": 0.37438423645320196,
139
+ "acc_norm_stderr": 0.03405155380561952
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.36774193548387096,
143
+ "acc_stderr": 0.027430866579973474,
144
+ "acc_norm": 0.36774193548387096,
145
+ "acc_norm_stderr": 0.027430866579973474
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.5598290598290598,
149
+ "acc_stderr": 0.0325207417206305,
150
+ "acc_norm": 0.5598290598290598,
151
+ "acc_norm_stderr": 0.0325207417206305
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.3886792452830189,
155
+ "acc_stderr": 0.030000485448675986,
156
+ "acc_norm": 0.3886792452830189,
157
+ "acc_norm_stderr": 0.030000485448675986
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.44545454545454544,
161
+ "acc_stderr": 0.047605488214603246,
162
+ "acc_norm": 0.44545454545454544,
163
+ "acc_norm_stderr": 0.047605488214603246
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.34444444444444444,
167
+ "acc_stderr": 0.028972648884844267,
168
+ "acc_norm": 0.34444444444444444,
169
+ "acc_norm_stderr": 0.028972648884844267
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.3443708609271523,
173
+ "acc_stderr": 0.038796870240733264,
174
+ "acc_norm": 0.3443708609271523,
175
+ "acc_norm_stderr": 0.038796870240733264
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.4577114427860697,
179
+ "acc_stderr": 0.035228658640995975,
180
+ "acc_norm": 0.4577114427860697,
181
+ "acc_norm_stderr": 0.035228658640995975
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.3815028901734104,
185
+ "acc_stderr": 0.03703851193099521,
186
+ "acc_norm": 0.3815028901734104,
187
+ "acc_norm_stderr": 0.03703851193099521
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.35714285714285715,
191
+ "acc_stderr": 0.02467786284133278,
192
+ "acc_norm": 0.35714285714285715,
193
+ "acc_norm_stderr": 0.02467786284133278
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.3333333333333333,
197
+ "acc_stderr": 0.03942082639927213,
198
+ "acc_norm": 0.3333333333333333,
199
+ "acc_norm_stderr": 0.03942082639927213
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.47,
203
+ "acc_stderr": 0.05016135580465919,
204
+ "acc_norm": 0.47,
205
+ "acc_norm_stderr": 0.05016135580465919
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.54,
209
+ "acc_stderr": 0.05009082659620333,
210
+ "acc_norm": 0.54,
211
+ "acc_norm_stderr": 0.05009082659620333
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.44508670520231214,
215
+ "acc_stderr": 0.02675625512966377,
216
+ "acc_norm": 0.44508670520231214,
217
+ "acc_norm_stderr": 0.02675625512966377
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.34355828220858897,
221
+ "acc_stderr": 0.03731133519673893,
222
+ "acc_norm": 0.34355828220858897,
223
+ "acc_norm_stderr": 0.03731133519673893
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.37037037037037035,
227
+ "acc_stderr": 0.02686949074481525,
228
+ "acc_norm": 0.37037037037037035,
229
+ "acc_norm_stderr": 0.02686949074481525
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.33,
233
+ "acc_stderr": 0.04725815626252605,
234
+ "acc_norm": 0.33,
235
+ "acc_norm_stderr": 0.04725815626252605
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.44559585492227977,
239
+ "acc_stderr": 0.0358701498607566,
240
+ "acc_norm": 0.44559585492227977,
241
+ "acc_norm_stderr": 0.0358701498607566
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.2719298245614035,
245
+ "acc_stderr": 0.041857744240220575,
246
+ "acc_norm": 0.2719298245614035,
247
+ "acc_norm_stderr": 0.041857744240220575
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.3798165137614679,
251
+ "acc_stderr": 0.020808825617866244,
252
+ "acc_norm": 0.3798165137614679,
253
+ "acc_norm_stderr": 0.020808825617866244
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.3492063492063492,
257
+ "acc_stderr": 0.04263906892795132,
258
+ "acc_norm": 0.3492063492063492,
259
+ "acc_norm_stderr": 0.04263906892795132
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.4117647058823529,
263
+ "acc_stderr": 0.02818059632825929,
264
+ "acc_norm": 0.4117647058823529,
265
+ "acc_norm_stderr": 0.02818059632825929
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.42,
269
+ "acc_stderr": 0.049604496374885836,
270
+ "acc_norm": 0.42,
271
+ "acc_norm_stderr": 0.049604496374885836
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.5619834710743802,
275
+ "acc_stderr": 0.045291468044357915,
276
+ "acc_norm": 0.5619834710743802,
277
+ "acc_norm_stderr": 0.045291468044357915
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.34868421052631576,
281
+ "acc_stderr": 0.038781398887976125,
282
+ "acc_norm": 0.34868421052631576,
283
+ "acc_norm_stderr": 0.038781398887976125
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.3284313725490196,
287
+ "acc_stderr": 0.018999707383162666,
288
+ "acc_norm": 0.3284313725490196,
289
+ "acc_norm_stderr": 0.018999707383162666
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.2730496453900709,
293
+ "acc_stderr": 0.026577860943307857,
294
+ "acc_norm": 0.2730496453900709,
295
+ "acc_norm_stderr": 0.026577860943307857
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.2767857142857143,
299
+ "acc_stderr": 0.04246624336697627,
300
+ "acc_norm": 0.2767857142857143,
301
+ "acc_norm_stderr": 0.04246624336697627
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.4074074074074074,
305
+ "acc_stderr": 0.03350991604696043,
306
+ "acc_norm": 0.4074074074074074,
307
+ "acc_norm_stderr": 0.03350991604696043
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.23910614525139665,
311
+ "acc_stderr": 0.014265554192331149,
312
+ "acc_norm": 0.23910614525139665,
313
+ "acc_norm_stderr": 0.014265554192331149
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.33,
317
+ "acc_stderr": 0.047258156262526045,
318
+ "acc_norm": 0.33,
319
+ "acc_norm_stderr": 0.047258156262526045
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.4,
323
+ "acc_stderr": 0.04923659639173309,
324
+ "acc_norm": 0.4,
325
+ "acc_norm_stderr": 0.04923659639173309
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.4227941176470588,
329
+ "acc_stderr": 0.030008562845003483,
330
+ "acc_norm": 0.4227941176470588,
331
+ "acc_norm_stderr": 0.030008562845003483
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.3469387755102041,
335
+ "acc_stderr": 0.030472526026726492,
336
+ "acc_norm": 0.3469387755102041,
337
+ "acc_norm_stderr": 0.030472526026726492
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.4177215189873418,
341
+ "acc_stderr": 0.032103530322412685,
342
+ "acc_norm": 0.4177215189873418,
343
+ "acc_norm_stderr": 0.032103530322412685
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.3005215123859192,
347
+ "acc_stderr": 0.011709918883039124,
348
+ "acc_norm": 0.3005215123859192,
349
+ "acc_norm_stderr": 0.011709918883039124
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.3872549019607843,
353
+ "acc_stderr": 0.03418931233833344,
354
+ "acc_norm": 0.3872549019607843,
355
+ "acc_norm_stderr": 0.03418931233833344
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.43636363636363634,
359
+ "acc_stderr": 0.03872592983524753,
360
+ "acc_norm": 0.43636363636363634,
361
+ "acc_norm_stderr": 0.03872592983524753
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.3072215422276622,
365
+ "mc1_stderr": 0.016150201321323002,
366
+ "mc2": 0.4721418472000992,
367
+ "mc2_stderr": 0.01626625866283201
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.27863046044864226,
371
+ "acc_stderr": 0.01541373949434568,
372
+ "acc_norm": 0.3825265643447462,
373
+ "acc_norm_stderr": 0.016709165387228803
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "BioMistral/BioMistral-7B",
442
+ "model_sha": "9a11e1ffa817c211cbb52ee1fb312dc6b61b40a5",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
eval-results/EleutherAI/polyglot-ko-1.3b/EleutherAI_polyglot-ko-1.3b_result_2023-09-24 15_21_38.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 11
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 11
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.2235494880546075,
11
+ "acc_stderr": 0.012174896631202605,
12
+ "acc_norm": 0.2815699658703072,
13
+ "acc_norm_stderr": 0.013143376735009015
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.3345947022505477,
17
+ "acc_stderr": 0.004708842600177431,
18
+ "acc_norm": 0.4135630352519418,
19
+ "acc_norm_stderr": 0.0049146550633294974
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.27485380116959063,
23
+ "acc_stderr": 0.03424042924691585,
24
+ "acc_norm": 0.27485380116959063,
25
+ "acc_norm_stderr": 0.03424042924691585
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.27184466019417475,
29
+ "acc_stderr": 0.044052680241409216,
30
+ "acc_norm": 0.27184466019417475,
31
+ "acc_norm_stderr": 0.044052680241409216
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.26947637292464877,
35
+ "acc_stderr": 0.015866243073215065,
36
+ "acc_norm": 0.26947637292464877,
37
+ "acc_norm_stderr": 0.015866243073215065
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.26666666666666666,
41
+ "acc_stderr": 0.038201699145179055,
42
+ "acc_norm": 0.26666666666666666,
43
+ "acc_norm_stderr": 0.038201699145179055
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.3,
47
+ "acc_stderr": 0.046056618647183814,
48
+ "acc_norm": 0.3,
49
+ "acc_norm_stderr": 0.046056618647183814
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.2127659574468085,
53
+ "acc_stderr": 0.026754391348039783,
54
+ "acc_norm": 0.2127659574468085,
55
+ "acc_norm_stderr": 0.026754391348039783
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.24096385542168675,
59
+ "acc_stderr": 0.033293941190735296,
60
+ "acc_norm": 0.24096385542168675,
61
+ "acc_norm_stderr": 0.033293941190735296
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.2379421221864952,
65
+ "acc_stderr": 0.024185150647818707,
66
+ "acc_norm": 0.2379421221864952,
67
+ "acc_norm_stderr": 0.024185150647818707
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.2825112107623318,
71
+ "acc_stderr": 0.030216831011508766,
72
+ "acc_norm": 0.2825112107623318,
73
+ "acc_norm_stderr": 0.030216831011508766
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.21374045801526717,
77
+ "acc_stderr": 0.0359546161177469,
78
+ "acc_norm": 0.21374045801526717,
79
+ "acc_norm_stderr": 0.0359546161177469
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.24,
83
+ "acc_stderr": 0.042923469599092816,
84
+ "acc_norm": 0.24,
85
+ "acc_norm_stderr": 0.042923469599092816
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.2474747474747475,
89
+ "acc_stderr": 0.03074630074212451,
90
+ "acc_norm": 0.2474747474747475,
91
+ "acc_norm_stderr": 0.03074630074212451
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.22758620689655173,
95
+ "acc_stderr": 0.03493950380131184,
96
+ "acc_norm": 0.22758620689655173,
97
+ "acc_norm_stderr": 0.03493950380131184
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.22549019607843138,
101
+ "acc_stderr": 0.041583075330832865,
102
+ "acc_norm": 0.22549019607843138,
103
+ "acc_norm_stderr": 0.041583075330832865
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.31512605042016806,
107
+ "acc_stderr": 0.030176808288974337,
108
+ "acc_norm": 0.31512605042016806,
109
+ "acc_norm_stderr": 0.030176808288974337
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.2205128205128205,
113
+ "acc_stderr": 0.02102067268082791,
114
+ "acc_norm": 0.2205128205128205,
115
+ "acc_norm_stderr": 0.02102067268082791
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.18,
119
+ "acc_stderr": 0.038612291966536955,
120
+ "acc_norm": 0.18,
121
+ "acc_norm_stderr": 0.038612291966536955
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.31,
125
+ "acc_stderr": 0.04648231987117316,
126
+ "acc_norm": 0.31,
127
+ "acc_norm_stderr": 0.04648231987117316
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.25,
131
+ "acc_stderr": 0.04186091791394607,
132
+ "acc_norm": 0.25,
133
+ "acc_norm_stderr": 0.04186091791394607
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.2660098522167488,
137
+ "acc_stderr": 0.03108982600293752,
138
+ "acc_norm": 0.2660098522167488,
139
+ "acc_norm_stderr": 0.03108982600293752
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.3,
143
+ "acc_stderr": 0.02606936229533513,
144
+ "acc_norm": 0.3,
145
+ "acc_norm_stderr": 0.02606936229533513
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.23076923076923078,
149
+ "acc_stderr": 0.027601921381417607,
150
+ "acc_norm": 0.23076923076923078,
151
+ "acc_norm_stderr": 0.027601921381417607
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.25660377358490566,
155
+ "acc_stderr": 0.026880647889051968,
156
+ "acc_norm": 0.25660377358490566,
157
+ "acc_norm_stderr": 0.026880647889051968
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.2545454545454545,
161
+ "acc_stderr": 0.04172343038705383,
162
+ "acc_norm": 0.2545454545454545,
163
+ "acc_norm_stderr": 0.04172343038705383
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.2962962962962963,
167
+ "acc_stderr": 0.02784081149587194,
168
+ "acc_norm": 0.2962962962962963,
169
+ "acc_norm_stderr": 0.02784081149587194
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.304635761589404,
173
+ "acc_stderr": 0.03757949922943342,
174
+ "acc_norm": 0.304635761589404,
175
+ "acc_norm_stderr": 0.03757949922943342
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.25870646766169153,
179
+ "acc_stderr": 0.03096590312357303,
180
+ "acc_norm": 0.25870646766169153,
181
+ "acc_norm_stderr": 0.03096590312357303
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.2254335260115607,
185
+ "acc_stderr": 0.03186209851641144,
186
+ "acc_norm": 0.2254335260115607,
187
+ "acc_norm_stderr": 0.03186209851641144
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.2566137566137566,
191
+ "acc_stderr": 0.022494510767503154,
192
+ "acc_norm": 0.2566137566137566,
193
+ "acc_norm_stderr": 0.022494510767503154
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.2638888888888889,
197
+ "acc_stderr": 0.03685651095897532,
198
+ "acc_norm": 0.2638888888888889,
199
+ "acc_norm_stderr": 0.03685651095897532
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.23,
203
+ "acc_stderr": 0.04229525846816505,
204
+ "acc_norm": 0.23,
205
+ "acc_norm_stderr": 0.04229525846816505
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.22,
209
+ "acc_stderr": 0.04163331998932269,
210
+ "acc_norm": 0.22,
211
+ "acc_norm_stderr": 0.04163331998932269
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.24855491329479767,
215
+ "acc_stderr": 0.023267528432100174,
216
+ "acc_norm": 0.24855491329479767,
217
+ "acc_norm_stderr": 0.023267528432100174
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.31901840490797545,
221
+ "acc_stderr": 0.03661997551073836,
222
+ "acc_norm": 0.31901840490797545,
223
+ "acc_norm_stderr": 0.03661997551073836
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.2623456790123457,
227
+ "acc_stderr": 0.024477222856135114,
228
+ "acc_norm": 0.2623456790123457,
229
+ "acc_norm_stderr": 0.024477222856135114
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.25,
233
+ "acc_stderr": 0.04351941398892446,
234
+ "acc_norm": 0.25,
235
+ "acc_norm_stderr": 0.04351941398892446
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.33678756476683935,
239
+ "acc_stderr": 0.03410780251836184,
240
+ "acc_norm": 0.33678756476683935,
241
+ "acc_norm_stderr": 0.03410780251836184
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.20175438596491227,
245
+ "acc_stderr": 0.037752050135836386,
246
+ "acc_norm": 0.20175438596491227,
247
+ "acc_norm_stderr": 0.037752050135836386
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.24220183486238533,
251
+ "acc_stderr": 0.01836817630659862,
252
+ "acc_norm": 0.24220183486238533,
253
+ "acc_norm_stderr": 0.01836817630659862
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.23015873015873015,
257
+ "acc_stderr": 0.03764950879790606,
258
+ "acc_norm": 0.23015873015873015,
259
+ "acc_norm_stderr": 0.03764950879790606
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.23529411764705882,
263
+ "acc_stderr": 0.024288619466046102,
264
+ "acc_norm": 0.23529411764705882,
265
+ "acc_norm_stderr": 0.024288619466046102
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.18,
269
+ "acc_stderr": 0.03861229196653695,
270
+ "acc_norm": 0.18,
271
+ "acc_norm_stderr": 0.03861229196653695
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.256198347107438,
275
+ "acc_stderr": 0.039849796533028704,
276
+ "acc_norm": 0.256198347107438,
277
+ "acc_norm_stderr": 0.039849796533028704
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.21710526315789475,
281
+ "acc_stderr": 0.033550453048829226,
282
+ "acc_norm": 0.21710526315789475,
283
+ "acc_norm_stderr": 0.033550453048829226
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.24019607843137256,
287
+ "acc_stderr": 0.01728276069516743,
288
+ "acc_norm": 0.24019607843137256,
289
+ "acc_norm_stderr": 0.01728276069516743
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.2553191489361702,
293
+ "acc_stderr": 0.02601199293090201,
294
+ "acc_norm": 0.2553191489361702,
295
+ "acc_norm_stderr": 0.02601199293090201
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.21428571428571427,
299
+ "acc_stderr": 0.03894641120044793,
300
+ "acc_norm": 0.21428571428571427,
301
+ "acc_norm_stderr": 0.03894641120044793
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.46296296296296297,
305
+ "acc_stderr": 0.03400603625538272,
306
+ "acc_norm": 0.46296296296296297,
307
+ "acc_norm_stderr": 0.03400603625538272
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.24692737430167597,
311
+ "acc_stderr": 0.014422292204808852,
312
+ "acc_norm": 0.24692737430167597,
313
+ "acc_norm_stderr": 0.014422292204808852
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.25,
317
+ "acc_stderr": 0.04351941398892446,
318
+ "acc_norm": 0.25,
319
+ "acc_norm_stderr": 0.04351941398892446
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.3,
323
+ "acc_stderr": 0.046056618647183814,
324
+ "acc_norm": 0.3,
325
+ "acc_norm_stderr": 0.046056618647183814
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.4411764705882353,
329
+ "acc_stderr": 0.030161911930767102,
330
+ "acc_norm": 0.4411764705882353,
331
+ "acc_norm_stderr": 0.030161911930767102
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.3795918367346939,
335
+ "acc_stderr": 0.03106721126287249,
336
+ "acc_norm": 0.3795918367346939,
337
+ "acc_norm_stderr": 0.03106721126287249
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.2109704641350211,
341
+ "acc_stderr": 0.02655837250266192,
342
+ "acc_norm": 0.2109704641350211,
343
+ "acc_norm_stderr": 0.02655837250266192
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.23468057366362452,
347
+ "acc_stderr": 0.010824026872449344,
348
+ "acc_norm": 0.23468057366362452,
349
+ "acc_norm_stderr": 0.010824026872449344
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.25,
353
+ "acc_stderr": 0.03039153369274154,
354
+ "acc_norm": 0.25,
355
+ "acc_norm_stderr": 0.03039153369274154
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.22424242424242424,
359
+ "acc_stderr": 0.03256866661681102,
360
+ "acc_norm": 0.22424242424242424,
361
+ "acc_norm_stderr": 0.03256866661681102
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.25091799265605874,
365
+ "mc1_stderr": 0.015176985027707682,
366
+ "mc2": 0.4116568832959107,
367
+ "mc2_stderr": 0.015044504977529799
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.27744982290436837,
371
+ "acc_stderr": 0.015393630236605975,
372
+ "acc_norm": 0.3400236127508855,
373
+ "acc_norm_stderr": 0.016286717220737674
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "EleutherAI/polyglot-ko-1.3b",
442
+ "model_sha": "557e162cf6e944fdbae05bab2e45d066a125eacb",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
eval-results/HuggingFaceH4/.DS_Store ADDED
Binary file (6.15 kB)
eval-results/HuggingFaceH4/zephyr-7b-beta/result.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 8
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 8
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.33532423208191126,
11
+ "acc_stderr": 0.01379618294778556,
12
+ "acc_norm": 0.3848122866894198,
13
+ "acc_norm_stderr": 0.014218371065251112
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.35480979884485164,
17
+ "acc_stderr": 0.004774778180345192,
18
+ "acc_norm": 0.44911372236606256,
19
+ "acc_norm_stderr": 0.00496387293685794
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.45614035087719296,
23
+ "acc_stderr": 0.03820042586602966,
24
+ "acc_norm": 0.45614035087719296,
25
+ "acc_norm_stderr": 0.03820042586602966
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.6019417475728155,
29
+ "acc_stderr": 0.04846748253977238,
30
+ "acc_norm": 0.6019417475728155,
31
+ "acc_norm_stderr": 0.04846748253977238
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.41762452107279696,
35
+ "acc_stderr": 0.017635637326951534,
36
+ "acc_norm": 0.41762452107279696,
37
+ "acc_norm_stderr": 0.017635637326951534
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.34074074074074073,
41
+ "acc_stderr": 0.040943762699967946,
42
+ "acc_norm": 0.34074074074074073,
43
+ "acc_norm_stderr": 0.040943762699967946
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.19,
47
+ "acc_stderr": 0.03942772444036623,
48
+ "acc_norm": 0.19,
49
+ "acc_norm_stderr": 0.03942772444036623
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.2978723404255319,
53
+ "acc_stderr": 0.029896145682095462,
54
+ "acc_norm": 0.2978723404255319,
55
+ "acc_norm_stderr": 0.029896145682095462
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.3614457831325301,
59
+ "acc_stderr": 0.0374005938202932,
60
+ "acc_norm": 0.3614457831325301,
61
+ "acc_norm_stderr": 0.0374005938202932
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.4758842443729904,
65
+ "acc_stderr": 0.028365041542564584,
66
+ "acc_norm": 0.4758842443729904,
67
+ "acc_norm_stderr": 0.028365041542564584
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.3811659192825112,
71
+ "acc_stderr": 0.032596251184168284,
72
+ "acc_norm": 0.3811659192825112,
73
+ "acc_norm_stderr": 0.032596251184168284
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.3511450381679389,
77
+ "acc_stderr": 0.04186445163013751,
78
+ "acc_norm": 0.3511450381679389,
79
+ "acc_norm_stderr": 0.04186445163013751
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.27,
83
+ "acc_stderr": 0.0446196043338474,
84
+ "acc_norm": 0.27,
85
+ "acc_norm_stderr": 0.0446196043338474
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.494949494949495,
89
+ "acc_stderr": 0.035621707606254015,
90
+ "acc_norm": 0.494949494949495,
91
+ "acc_norm_stderr": 0.035621707606254015
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.4,
95
+ "acc_stderr": 0.04082482904638628,
96
+ "acc_norm": 0.4,
97
+ "acc_norm_stderr": 0.04082482904638628
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.3137254901960784,
101
+ "acc_stderr": 0.04617034827006717,
102
+ "acc_norm": 0.3137254901960784,
103
+ "acc_norm_stderr": 0.04617034827006717
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.4957983193277311,
107
+ "acc_stderr": 0.0324773433444811,
108
+ "acc_norm": 0.4957983193277311,
109
+ "acc_norm_stderr": 0.0324773433444811
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.4256410256410256,
113
+ "acc_stderr": 0.025069094387296546,
114
+ "acc_norm": 0.4256410256410256,
115
+ "acc_norm_stderr": 0.025069094387296546
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.59,
119
+ "acc_stderr": 0.049431107042371025,
120
+ "acc_norm": 0.59,
121
+ "acc_norm_stderr": 0.049431107042371025
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.29,
125
+ "acc_stderr": 0.045604802157206845,
126
+ "acc_norm": 0.29,
127
+ "acc_norm_stderr": 0.045604802157206845
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.4537037037037037,
131
+ "acc_stderr": 0.04812917324536821,
132
+ "acc_norm": 0.4537037037037037,
133
+ "acc_norm_stderr": 0.04812917324536821
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.35467980295566504,
137
+ "acc_stderr": 0.03366124489051449,
138
+ "acc_norm": 0.35467980295566504,
139
+ "acc_norm_stderr": 0.03366124489051449
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.4290322580645161,
143
+ "acc_stderr": 0.02815603653823321,
144
+ "acc_norm": 0.4290322580645161,
145
+ "acc_norm_stderr": 0.02815603653823321
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.6666666666666666,
149
+ "acc_stderr": 0.03088273697413865,
150
+ "acc_norm": 0.6666666666666666,
151
+ "acc_norm_stderr": 0.03088273697413865
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.4188679245283019,
155
+ "acc_stderr": 0.03036505082911521,
156
+ "acc_norm": 0.4188679245283019,
157
+ "acc_norm_stderr": 0.03036505082911521
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.42727272727272725,
161
+ "acc_stderr": 0.04738198703545483,
162
+ "acc_norm": 0.42727272727272725,
163
+ "acc_norm_stderr": 0.04738198703545483
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.34814814814814815,
167
+ "acc_stderr": 0.029045600290616258,
168
+ "acc_norm": 0.34814814814814815,
169
+ "acc_norm_stderr": 0.029045600290616258
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.2913907284768212,
173
+ "acc_stderr": 0.037101857261199946,
174
+ "acc_norm": 0.2913907284768212,
175
+ "acc_norm_stderr": 0.037101857261199946
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.5174129353233831,
179
+ "acc_stderr": 0.03533389234739245,
180
+ "acc_norm": 0.5174129353233831,
181
+ "acc_norm_stderr": 0.03533389234739245
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.37572254335260113,
185
+ "acc_stderr": 0.03692820767264867,
186
+ "acc_norm": 0.37572254335260113,
187
+ "acc_norm_stderr": 0.03692820767264867
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.3492063492063492,
191
+ "acc_stderr": 0.024552292209342658,
192
+ "acc_norm": 0.3492063492063492,
193
+ "acc_norm_stderr": 0.024552292209342658
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.3333333333333333,
197
+ "acc_stderr": 0.039420826399272135,
198
+ "acc_norm": 0.3333333333333333,
199
+ "acc_norm_stderr": 0.039420826399272135
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.35,
203
+ "acc_stderr": 0.04793724854411019,
204
+ "acc_norm": 0.35,
205
+ "acc_norm_stderr": 0.04793724854411019
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.49,
209
+ "acc_stderr": 0.05024183937956913,
210
+ "acc_norm": 0.49,
211
+ "acc_norm_stderr": 0.05024183937956913
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.47398843930635837,
215
+ "acc_stderr": 0.026882643434022885,
216
+ "acc_norm": 0.47398843930635837,
217
+ "acc_norm_stderr": 0.026882643434022885
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.44171779141104295,
221
+ "acc_stderr": 0.039015918258361836,
222
+ "acc_norm": 0.44171779141104295,
223
+ "acc_norm_stderr": 0.039015918258361836
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.42592592592592593,
227
+ "acc_stderr": 0.027513747284379424,
228
+ "acc_norm": 0.42592592592592593,
229
+ "acc_norm_stderr": 0.027513747284379424
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.33,
233
+ "acc_stderr": 0.04725815626252606,
234
+ "acc_norm": 0.33,
235
+ "acc_norm_stderr": 0.04725815626252606
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.5129533678756477,
239
+ "acc_stderr": 0.0360722806104775,
240
+ "acc_norm": 0.5129533678756477,
241
+ "acc_norm_stderr": 0.0360722806104775
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.24561403508771928,
245
+ "acc_stderr": 0.0404933929774814,
246
+ "acc_norm": 0.24561403508771928,
247
+ "acc_norm_stderr": 0.0404933929774814
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.47155963302752296,
251
+ "acc_stderr": 0.02140261569734804,
252
+ "acc_norm": 0.47155963302752296,
253
+ "acc_norm_stderr": 0.02140261569734804
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.36507936507936506,
257
+ "acc_stderr": 0.04306241259127152,
258
+ "acc_norm": 0.36507936507936506,
259
+ "acc_norm_stderr": 0.04306241259127152
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.4117647058823529,
263
+ "acc_stderr": 0.028180596328259297,
264
+ "acc_norm": 0.4117647058823529,
265
+ "acc_norm_stderr": 0.028180596328259297
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.44,
269
+ "acc_stderr": 0.04988876515698589,
270
+ "acc_norm": 0.44,
271
+ "acc_norm_stderr": 0.04988876515698589
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.5867768595041323,
275
+ "acc_stderr": 0.04495087843548408,
276
+ "acc_norm": 0.5867768595041323,
277
+ "acc_norm_stderr": 0.04495087843548408
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.40131578947368424,
281
+ "acc_stderr": 0.03988903703336284,
282
+ "acc_norm": 0.40131578947368424,
283
+ "acc_norm_stderr": 0.03988903703336284
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.32679738562091504,
287
+ "acc_stderr": 0.018975427920507215,
288
+ "acc_norm": 0.32679738562091504,
289
+ "acc_norm_stderr": 0.018975427920507215
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.3333333333333333,
293
+ "acc_stderr": 0.02812163604063988,
294
+ "acc_norm": 0.3333333333333333,
295
+ "acc_norm_stderr": 0.02812163604063988
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.3392857142857143,
299
+ "acc_stderr": 0.04493949068613539,
300
+ "acc_norm": 0.3392857142857143,
301
+ "acc_norm_stderr": 0.04493949068613539
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.41203703703703703,
305
+ "acc_stderr": 0.03356787758160835,
306
+ "acc_norm": 0.41203703703703703,
307
+ "acc_norm_stderr": 0.03356787758160835
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.329608938547486,
311
+ "acc_stderr": 0.015721531075183884,
312
+ "acc_norm": 0.329608938547486,
313
+ "acc_norm_stderr": 0.015721531075183884
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.39,
317
+ "acc_stderr": 0.04902071300001975,
318
+ "acc_norm": 0.39,
319
+ "acc_norm_stderr": 0.04902071300001975
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.61,
323
+ "acc_stderr": 0.04902071300001975,
324
+ "acc_norm": 0.61,
325
+ "acc_norm_stderr": 0.04902071300001975
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.375,
329
+ "acc_stderr": 0.029408372932278746,
330
+ "acc_norm": 0.375,
331
+ "acc_norm_stderr": 0.029408372932278746
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.43673469387755104,
335
+ "acc_stderr": 0.03175195237583322,
336
+ "acc_norm": 0.43673469387755104,
337
+ "acc_norm_stderr": 0.03175195237583322
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.4810126582278481,
341
+ "acc_stderr": 0.03252375148090448,
342
+ "acc_norm": 0.4810126582278481,
343
+ "acc_norm_stderr": 0.03252375148090448
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.29791395045632335,
347
+ "acc_stderr": 0.011680717340400059,
348
+ "acc_norm": 0.29791395045632335,
349
+ "acc_norm_stderr": 0.011680717340400059
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.29411764705882354,
353
+ "acc_stderr": 0.03198001660115072,
354
+ "acc_norm": 0.29411764705882354,
355
+ "acc_norm_stderr": 0.03198001660115072
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.30303030303030304,
359
+ "acc_stderr": 0.03588624800091707,
360
+ "acc_norm": 0.30303030303030304,
361
+ "acc_norm_stderr": 0.03588624800091707
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.3317013463892289,
365
+ "mc1_stderr": 0.01648214881024147,
366
+ "mc2": 0.5171680571717291,
367
+ "mc2_stderr": 0.01606077987901482
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.39787485242030696,
371
+ "acc_stderr": 0.01682795905473339,
372
+ "acc_norm": 0.4014167650531287,
373
+ "acc_norm_stderr": 0.01685290785872906
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "HuggingFaceH4/zephyr-7b-beta",
442
+ "model_sha": "3bac358730f8806e5c3dc7c7e19eb36e045bf720",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
eval-results/README.md ADDED
@@ -0,0 +1,3 @@
+ ---
+ license: apache-2.0
+ ---
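Each result file added in this commit shares the same top-level schema: a `results` map of per-task scores, a `versions` map, and a `config_general` block with the model name, revision, and dtype. As a minimal sketch of how such a file could be consumed (the chosen path and the simple averaging are illustrative assumptions, not the Space's actual aggregation logic in `src/leaderboard/read_evals.py`), one might do:

```python
import json
from pathlib import Path

# Illustrative path: any of the result files added in this commit shares the schema.
result_path = Path("eval-results/HuggingFaceH4/zephyr-7b-beta/result.json")

with result_path.open() as f:
    data = json.load(f)

# Collect per-task accuracies; this skips the daily/quarterly counters and
# truthfulqa_mc, which reports mc1/mc2 rather than acc.
accs = {
    task: scores["acc"]
    for task, scores in data["results"].items()
    if task.startswith("harness|") and "acc" in scores
}

mean_acc = sum(accs.values()) / len(accs)
print(data["config_general"]["model_name"],
      f"mean acc over {len(accs)} tasks: {mean_acc:.4f}")
```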
eval-results/nlpai-lab/KULLM3/result.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 6
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 6
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.42918088737201365,
11
+ "acc_stderr": 0.014464085894870651,
12
+ "acc_norm": 0.46501706484641636,
13
+ "acc_norm_stderr": 0.014575583922019672
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.445628360884286,
17
+ "acc_stderr": 0.004960191341430244,
18
+ "acc_norm": 0.589523999203346,
19
+ "acc_norm_stderr": 0.004909148239488273
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.6432748538011696,
23
+ "acc_stderr": 0.03674013002860954,
24
+ "acc_norm": 0.6432748538011696,
25
+ "acc_norm_stderr": 0.03674013002860954
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.6116504854368932,
29
+ "acc_stderr": 0.04825729337356389,
30
+ "acc_norm": 0.6116504854368932,
31
+ "acc_norm_stderr": 0.04825729337356389
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.6155810983397191,
35
+ "acc_stderr": 0.01739568874281962,
36
+ "acc_norm": 0.6155810983397191,
37
+ "acc_norm_stderr": 0.01739568874281962
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.4962962962962963,
41
+ "acc_stderr": 0.04319223625811331,
42
+ "acc_norm": 0.4962962962962963,
43
+ "acc_norm_stderr": 0.04319223625811331
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.26,
47
+ "acc_stderr": 0.04408440022768077,
48
+ "acc_norm": 0.26,
49
+ "acc_norm_stderr": 0.04408440022768077
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.4553191489361702,
53
+ "acc_stderr": 0.03255525359340354,
54
+ "acc_norm": 0.4553191489361702,
55
+ "acc_norm_stderr": 0.03255525359340354
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.5180722891566265,
59
+ "acc_stderr": 0.038899512528272166,
60
+ "acc_norm": 0.5180722891566265,
61
+ "acc_norm_stderr": 0.038899512528272166
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.5755627009646302,
65
+ "acc_stderr": 0.028071928247946205,
66
+ "acc_norm": 0.5755627009646302,
67
+ "acc_norm_stderr": 0.028071928247946205
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.5650224215246636,
71
+ "acc_stderr": 0.033272833702713445,
72
+ "acc_norm": 0.5650224215246636,
73
+ "acc_norm_stderr": 0.033272833702713445
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.5877862595419847,
77
+ "acc_stderr": 0.04317171194870255,
78
+ "acc_norm": 0.5877862595419847,
79
+ "acc_norm_stderr": 0.04317171194870255
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.5,
83
+ "acc_stderr": 0.050251890762960605,
84
+ "acc_norm": 0.5,
85
+ "acc_norm_stderr": 0.050251890762960605
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.6515151515151515,
89
+ "acc_stderr": 0.033948539651564025,
90
+ "acc_norm": 0.6515151515151515,
91
+ "acc_norm_stderr": 0.033948539651564025
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.503448275862069,
95
+ "acc_stderr": 0.04166567577101579,
96
+ "acc_norm": 0.503448275862069,
97
+ "acc_norm_stderr": 0.04166567577101579
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.2549019607843137,
101
+ "acc_stderr": 0.043364327079931785,
102
+ "acc_norm": 0.2549019607843137,
103
+ "acc_norm_stderr": 0.043364327079931785
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.5756302521008403,
107
+ "acc_stderr": 0.03210479051015776,
108
+ "acc_norm": 0.5756302521008403,
109
+ "acc_norm_stderr": 0.03210479051015776
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.541025641025641,
113
+ "acc_stderr": 0.025265525491284295,
114
+ "acc_norm": 0.541025641025641,
115
+ "acc_norm_stderr": 0.025265525491284295
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.54,
119
+ "acc_stderr": 0.05009082659620332,
120
+ "acc_norm": 0.54,
121
+ "acc_norm_stderr": 0.05009082659620332
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.31,
125
+ "acc_stderr": 0.04648231987117316,
126
+ "acc_norm": 0.31,
127
+ "acc_norm_stderr": 0.04648231987117316
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.5555555555555556,
131
+ "acc_stderr": 0.04803752235190192,
132
+ "acc_norm": 0.5555555555555556,
133
+ "acc_norm_stderr": 0.04803752235190192
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.3842364532019704,
137
+ "acc_stderr": 0.0342239856565755,
138
+ "acc_norm": 0.3842364532019704,
139
+ "acc_norm_stderr": 0.0342239856565755
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.5774193548387097,
143
+ "acc_stderr": 0.02810096472427264,
144
+ "acc_norm": 0.5774193548387097,
145
+ "acc_norm_stderr": 0.02810096472427264
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.7777777777777778,
149
+ "acc_stderr": 0.027236013946196673,
150
+ "acc_norm": 0.7777777777777778,
151
+ "acc_norm_stderr": 0.027236013946196673
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.4981132075471698,
155
+ "acc_stderr": 0.030772653642075657,
156
+ "acc_norm": 0.4981132075471698,
157
+ "acc_norm_stderr": 0.030772653642075657
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.5272727272727272,
161
+ "acc_stderr": 0.04782001791380061,
162
+ "acc_norm": 0.5272727272727272,
163
+ "acc_norm_stderr": 0.04782001791380061
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.25555555555555554,
167
+ "acc_stderr": 0.026593939101844082,
168
+ "acc_norm": 0.25555555555555554,
169
+ "acc_norm_stderr": 0.026593939101844082
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.33774834437086093,
173
+ "acc_stderr": 0.038615575462551684,
174
+ "acc_norm": 0.33774834437086093,
175
+ "acc_norm_stderr": 0.038615575462551684
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.7064676616915423,
179
+ "acc_stderr": 0.032200241045342054,
180
+ "acc_norm": 0.7064676616915423,
181
+ "acc_norm_stderr": 0.032200241045342054
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.4797687861271676,
185
+ "acc_stderr": 0.03809342081273958,
186
+ "acc_norm": 0.4797687861271676,
187
+ "acc_norm_stderr": 0.03809342081273958
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.38095238095238093,
191
+ "acc_stderr": 0.025010749116137602,
192
+ "acc_norm": 0.38095238095238093,
193
+ "acc_norm_stderr": 0.025010749116137602
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.4236111111111111,
197
+ "acc_stderr": 0.041321250197233685,
198
+ "acc_norm": 0.4236111111111111,
199
+ "acc_norm_stderr": 0.041321250197233685
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.31,
203
+ "acc_stderr": 0.04648231987117316,
204
+ "acc_norm": 0.31,
205
+ "acc_norm_stderr": 0.04648231987117316
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.71,
209
+ "acc_stderr": 0.04560480215720683,
210
+ "acc_norm": 0.71,
211
+ "acc_norm_stderr": 0.04560480215720683
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.5751445086705202,
215
+ "acc_stderr": 0.026613350840261733,
216
+ "acc_norm": 0.5751445086705202,
217
+ "acc_norm_stderr": 0.026613350840261733
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.5030674846625767,
221
+ "acc_stderr": 0.03928297078179662,
222
+ "acc_norm": 0.5030674846625767,
223
+ "acc_norm_stderr": 0.03928297078179662
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.5370370370370371,
227
+ "acc_stderr": 0.027744313443376536,
228
+ "acc_norm": 0.5370370370370371,
229
+ "acc_norm_stderr": 0.027744313443376536
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.33,
233
+ "acc_stderr": 0.04725815626252606,
234
+ "acc_norm": 0.33,
235
+ "acc_norm_stderr": 0.04725815626252606
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.6217616580310881,
239
+ "acc_stderr": 0.034998072761933376,
240
+ "acc_norm": 0.6217616580310881,
241
+ "acc_norm_stderr": 0.034998072761933376
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.37719298245614036,
245
+ "acc_stderr": 0.04559522141958216,
246
+ "acc_norm": 0.37719298245614036,
247
+ "acc_norm_stderr": 0.04559522141958216
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.6385321100917432,
251
+ "acc_stderr": 0.02059808200993736,
252
+ "acc_norm": 0.6385321100917432,
253
+ "acc_norm_stderr": 0.02059808200993736
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.4126984126984127,
257
+ "acc_stderr": 0.04403438954768177,
258
+ "acc_norm": 0.4126984126984127,
259
+ "acc_norm_stderr": 0.04403438954768177
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.5261437908496732,
263
+ "acc_stderr": 0.028590752958852387,
264
+ "acc_norm": 0.5261437908496732,
265
+ "acc_norm_stderr": 0.028590752958852387
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.57,
269
+ "acc_stderr": 0.049756985195624284,
270
+ "acc_norm": 0.57,
271
+ "acc_norm_stderr": 0.049756985195624284
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.7520661157024794,
275
+ "acc_stderr": 0.03941897526516304,
276
+ "acc_norm": 0.7520661157024794,
277
+ "acc_norm_stderr": 0.03941897526516304
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.5789473684210527,
281
+ "acc_stderr": 0.040179012759817494,
282
+ "acc_norm": 0.5789473684210527,
283
+ "acc_norm_stderr": 0.040179012759817494
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.4738562091503268,
287
+ "acc_stderr": 0.020200164564804588,
288
+ "acc_norm": 0.4738562091503268,
289
+ "acc_norm_stderr": 0.020200164564804588
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.3404255319148936,
293
+ "acc_stderr": 0.02826765748265013,
294
+ "acc_norm": 0.3404255319148936,
295
+ "acc_norm_stderr": 0.02826765748265013
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.38392857142857145,
299
+ "acc_stderr": 0.046161430750285455,
300
+ "acc_norm": 0.38392857142857145,
301
+ "acc_norm_stderr": 0.046161430750285455
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.4675925925925926,
305
+ "acc_stderr": 0.03402801581358966,
306
+ "acc_norm": 0.4675925925925926,
307
+ "acc_norm_stderr": 0.03402801581358966
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.21675977653631284,
311
+ "acc_stderr": 0.013780598486443363,
312
+ "acc_norm": 0.21675977653631284,
313
+ "acc_norm_stderr": 0.013780598486443363
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.39,
317
+ "acc_stderr": 0.04902071300001975,
318
+ "acc_norm": 0.39,
319
+ "acc_norm_stderr": 0.04902071300001975
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.71,
323
+ "acc_stderr": 0.04560480215720684,
324
+ "acc_norm": 0.71,
325
+ "acc_norm_stderr": 0.04560480215720684
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.4411764705882353,
329
+ "acc_stderr": 0.0301619119307671,
330
+ "acc_norm": 0.4411764705882353,
331
+ "acc_norm_stderr": 0.0301619119307671
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.6285714285714286,
335
+ "acc_stderr": 0.03093285879278986,
336
+ "acc_norm": 0.6285714285714286,
337
+ "acc_norm_stderr": 0.03093285879278986
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.70042194092827,
341
+ "acc_stderr": 0.029818024749753095,
342
+ "acc_norm": 0.70042194092827,
343
+ "acc_norm_stderr": 0.029818024749753095
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.378748370273794,
347
+ "acc_stderr": 0.012389052105003741,
348
+ "acc_norm": 0.378748370273794,
349
+ "acc_norm_stderr": 0.012389052105003741
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.6225490196078431,
353
+ "acc_stderr": 0.03402272044340703,
354
+ "acc_norm": 0.6225490196078431,
355
+ "acc_norm_stderr": 0.03402272044340703
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.6666666666666666,
359
+ "acc_stderr": 0.03681050869161549,
360
+ "acc_norm": 0.6666666666666666,
361
+ "acc_norm_stderr": 0.03681050869161549
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.33659730722154224,
365
+ "mc1_stderr": 0.016542412809494877,
366
+ "mc2": 0.49995145184296846,
367
+ "mc2_stderr": 0.015887726098900913
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.564344746162928,
371
+ "acc_stderr": 0.017047415229476316,
372
+ "acc_norm": 0.6068476977567887,
373
+ "acc_norm_stderr": 0.016793262801287068
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "nlpai-lab/KULLM3",
442
+ "model_sha": "5a6bcd0fc7f240460eb6d57016f7b4060bc1f43b",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
eval-results/x2bee/POLAR-14B-DPO-v1.3/result.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 4
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 4
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.7465870307167235,
11
+ "acc_stderr": 0.012710896778378604,
12
+ "acc_norm": 0.7807167235494881,
13
+ "acc_norm_stderr": 0.012091245787615728
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.6385182234614618,
17
+ "acc_stderr": 0.004794478426382617,
18
+ "acc_norm": 0.7561242780322645,
19
+ "acc_norm_stderr": 0.004285410130466119
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.6900584795321637,
23
+ "acc_stderr": 0.035469769593931624,
24
+ "acc_norm": 0.6900584795321637,
25
+ "acc_norm_stderr": 0.035469769593931624
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.6601941747572816,
29
+ "acc_stderr": 0.046897659372781335,
30
+ "acc_norm": 0.6601941747572816,
31
+ "acc_norm_stderr": 0.046897659372781335
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.6845466155810983,
35
+ "acc_stderr": 0.016617501738763408,
36
+ "acc_norm": 0.6845466155810983,
37
+ "acc_norm_stderr": 0.016617501738763408
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.48148148148148145,
41
+ "acc_stderr": 0.04316378599511324,
42
+ "acc_norm": 0.48148148148148145,
43
+ "acc_norm_stderr": 0.04316378599511324
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.33,
47
+ "acc_stderr": 0.047258156262526045,
48
+ "acc_norm": 0.33,
49
+ "acc_norm_stderr": 0.047258156262526045
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.46808510638297873,
53
+ "acc_stderr": 0.03261936918467383,
54
+ "acc_norm": 0.46808510638297873,
55
+ "acc_norm_stderr": 0.03261936918467383
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.4759036144578313,
59
+ "acc_stderr": 0.03887971849597264,
60
+ "acc_norm": 0.4759036144578313,
61
+ "acc_norm_stderr": 0.03887971849597264
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.6334405144694534,
65
+ "acc_stderr": 0.02736807824397163,
66
+ "acc_norm": 0.6334405144694534,
67
+ "acc_norm_stderr": 0.02736807824397163
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.6681614349775785,
71
+ "acc_stderr": 0.03160295143776679,
72
+ "acc_norm": 0.6681614349775785,
73
+ "acc_norm_stderr": 0.03160295143776679
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.6030534351145038,
77
+ "acc_stderr": 0.04291135671009224,
78
+ "acc_norm": 0.6030534351145038,
79
+ "acc_norm_stderr": 0.04291135671009224
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.51,
83
+ "acc_stderr": 0.05024183937956911,
84
+ "acc_norm": 0.51,
85
+ "acc_norm_stderr": 0.05024183937956911
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.7222222222222222,
89
+ "acc_stderr": 0.03191178226713547,
90
+ "acc_norm": 0.7222222222222222,
91
+ "acc_norm_stderr": 0.03191178226713547
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.47586206896551725,
95
+ "acc_stderr": 0.041618085035015295,
96
+ "acc_norm": 0.47586206896551725,
97
+ "acc_norm_stderr": 0.041618085035015295
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.2549019607843137,
101
+ "acc_stderr": 0.04336432707993178,
102
+ "acc_norm": 0.2549019607843137,
103
+ "acc_norm_stderr": 0.04336432707993178
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.592436974789916,
107
+ "acc_stderr": 0.031918633744784666,
108
+ "acc_norm": 0.592436974789916,
109
+ "acc_norm_stderr": 0.031918633744784666
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.5948717948717949,
113
+ "acc_stderr": 0.024890471769938142,
114
+ "acc_norm": 0.5948717948717949,
115
+ "acc_norm_stderr": 0.024890471769938142
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.66,
119
+ "acc_stderr": 0.04760952285695237,
120
+ "acc_norm": 0.66,
121
+ "acc_norm_stderr": 0.04760952285695237
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.27,
125
+ "acc_stderr": 0.04461960433384739,
126
+ "acc_norm": 0.27,
127
+ "acc_norm_stderr": 0.04461960433384739
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.6388888888888888,
131
+ "acc_stderr": 0.04643454608906275,
132
+ "acc_norm": 0.6388888888888888,
133
+ "acc_norm_stderr": 0.04643454608906275
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.4433497536945813,
137
+ "acc_stderr": 0.034953345821629345,
138
+ "acc_norm": 0.4433497536945813,
139
+ "acc_norm_stderr": 0.034953345821629345
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.5806451612903226,
143
+ "acc_stderr": 0.028071588901091838,
144
+ "acc_norm": 0.5806451612903226,
145
+ "acc_norm_stderr": 0.028071588901091838
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.811965811965812,
149
+ "acc_stderr": 0.025598193686652254,
150
+ "acc_norm": 0.811965811965812,
151
+ "acc_norm_stderr": 0.025598193686652254
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.5169811320754717,
155
+ "acc_stderr": 0.030755120364119898,
156
+ "acc_norm": 0.5169811320754717,
157
+ "acc_norm_stderr": 0.030755120364119898
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.5818181818181818,
161
+ "acc_stderr": 0.04724577405731573,
162
+ "acc_norm": 0.5818181818181818,
163
+ "acc_norm_stderr": 0.04724577405731573
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.3888888888888889,
167
+ "acc_stderr": 0.029723278961476664,
168
+ "acc_norm": 0.3888888888888889,
169
+ "acc_norm_stderr": 0.029723278961476664
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.3708609271523179,
173
+ "acc_stderr": 0.03943966699183629,
174
+ "acc_norm": 0.3708609271523179,
175
+ "acc_norm_stderr": 0.03943966699183629
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.6666666666666666,
179
+ "acc_stderr": 0.033333333333333326,
180
+ "acc_norm": 0.6666666666666666,
181
+ "acc_norm_stderr": 0.033333333333333326
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.47398843930635837,
185
+ "acc_stderr": 0.038073017265045125,
186
+ "acc_norm": 0.47398843930635837,
187
+ "acc_norm_stderr": 0.038073017265045125
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.42328042328042326,
191
+ "acc_stderr": 0.025446365634406793,
192
+ "acc_norm": 0.42328042328042326,
193
+ "acc_norm_stderr": 0.025446365634406793
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.5625,
197
+ "acc_stderr": 0.04148415739394154,
198
+ "acc_norm": 0.5625,
199
+ "acc_norm_stderr": 0.04148415739394154
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.39,
203
+ "acc_stderr": 0.04902071300001975,
204
+ "acc_norm": 0.39,
205
+ "acc_norm_stderr": 0.04902071300001975
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.78,
209
+ "acc_stderr": 0.04163331998932263,
210
+ "acc_norm": 0.78,
211
+ "acc_norm_stderr": 0.04163331998932263
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.5491329479768786,
215
+ "acc_stderr": 0.026788811931562767,
216
+ "acc_norm": 0.5491329479768786,
217
+ "acc_norm_stderr": 0.026788811931562767
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.6319018404907976,
221
+ "acc_stderr": 0.03789213935838396,
222
+ "acc_norm": 0.6319018404907976,
223
+ "acc_norm_stderr": 0.03789213935838396
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.5925925925925926,
227
+ "acc_stderr": 0.02733954664066273,
228
+ "acc_norm": 0.5925925925925926,
229
+ "acc_norm_stderr": 0.02733954664066273
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.4,
233
+ "acc_stderr": 0.049236596391733084,
234
+ "acc_norm": 0.4,
235
+ "acc_norm_stderr": 0.049236596391733084
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.7668393782383419,
239
+ "acc_stderr": 0.03051611137147601,
240
+ "acc_norm": 0.7668393782383419,
241
+ "acc_norm_stderr": 0.03051611137147601
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.4473684210526316,
245
+ "acc_stderr": 0.046774730044912,
246
+ "acc_norm": 0.4473684210526316,
247
+ "acc_norm_stderr": 0.046774730044912
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.726605504587156,
251
+ "acc_stderr": 0.01910929984609827,
252
+ "acc_norm": 0.726605504587156,
253
+ "acc_norm_stderr": 0.01910929984609827
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.3968253968253968,
257
+ "acc_stderr": 0.04375888492727061,
258
+ "acc_norm": 0.3968253968253968,
259
+ "acc_norm_stderr": 0.04375888492727061
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.6078431372549019,
263
+ "acc_stderr": 0.027956046165424516,
264
+ "acc_norm": 0.6078431372549019,
265
+ "acc_norm_stderr": 0.027956046165424516
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.55,
269
+ "acc_stderr": 0.05,
270
+ "acc_norm": 0.55,
271
+ "acc_norm_stderr": 0.05
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.6942148760330579,
275
+ "acc_stderr": 0.04205953933884122,
276
+ "acc_norm": 0.6942148760330579,
277
+ "acc_norm_stderr": 0.04205953933884122
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.618421052631579,
281
+ "acc_stderr": 0.03953173377749194,
282
+ "acc_norm": 0.618421052631579,
283
+ "acc_norm_stderr": 0.03953173377749194
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.5669934640522876,
287
+ "acc_stderr": 0.02004544247332422,
288
+ "acc_norm": 0.5669934640522876,
289
+ "acc_norm_stderr": 0.02004544247332422
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.4219858156028369,
293
+ "acc_stderr": 0.029462189233370586,
294
+ "acc_norm": 0.4219858156028369,
295
+ "acc_norm_stderr": 0.029462189233370586
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.5089285714285714,
299
+ "acc_stderr": 0.04745033255489123,
300
+ "acc_norm": 0.5089285714285714,
301
+ "acc_norm_stderr": 0.04745033255489123
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.4351851851851852,
305
+ "acc_stderr": 0.03381200005643526,
306
+ "acc_norm": 0.4351851851851852,
307
+ "acc_norm_stderr": 0.03381200005643526
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.3787709497206704,
311
+ "acc_stderr": 0.016223533510365117,
312
+ "acc_norm": 0.3787709497206704,
313
+ "acc_norm_stderr": 0.016223533510365117
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.47,
317
+ "acc_stderr": 0.05016135580465919,
318
+ "acc_norm": 0.47,
319
+ "acc_norm_stderr": 0.05016135580465919
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.66,
323
+ "acc_stderr": 0.04760952285695238,
324
+ "acc_norm": 0.66,
325
+ "acc_norm_stderr": 0.04760952285695238
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.48161764705882354,
329
+ "acc_stderr": 0.03035230339535196,
330
+ "acc_norm": 0.48161764705882354,
331
+ "acc_norm_stderr": 0.03035230339535196
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.6448979591836734,
335
+ "acc_stderr": 0.030635655150387634,
336
+ "acc_norm": 0.6448979591836734,
337
+ "acc_norm_stderr": 0.030635655150387634
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.729957805907173,
341
+ "acc_stderr": 0.028900721906293426,
342
+ "acc_norm": 0.729957805907173,
343
+ "acc_norm_stderr": 0.028900721906293426
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.41460234680573665,
347
+ "acc_stderr": 0.012582597058908284,
348
+ "acc_norm": 0.41460234680573665,
349
+ "acc_norm_stderr": 0.012582597058908284
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.6421568627450981,
353
+ "acc_stderr": 0.03364487286088298,
354
+ "acc_norm": 0.6421568627450981,
355
+ "acc_norm_stderr": 0.03364487286088298
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.6181818181818182,
359
+ "acc_stderr": 0.03793713171165635,
360
+ "acc_norm": 0.6181818181818182,
361
+ "acc_norm_stderr": 0.03793713171165635
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.6328029375764994,
365
+ "mc1_stderr": 0.01687480500145318,
366
+ "mc2": 0.7522925779273922,
367
+ "mc2_stderr": 0.014568927682929578
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.45218417945690675,
371
+ "acc_stderr": 0.017111567130916785,
372
+ "acc_norm": 0.45454545454545453,
373
+ "acc_norm_stderr": 0.017119172208061504
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "x2bee/POLAR-14B-DPO-v1.3",
442
+ "model_sha": "337edbed4c86db2da27e3b0e07086134f8d27a09",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
eval-results/x2bee/POLAR-14B-DPO-v1.4/result.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 7
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 7
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.7363481228668942,
11
+ "acc_stderr": 0.012875929151297058,
12
+ "acc_norm": 0.7491467576791809,
13
+ "acc_norm_stderr": 0.012668198621315433
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.7228639713204541,
17
+ "acc_stderr": 0.004466695023677848,
18
+ "acc_norm": 0.7422824138617805,
19
+ "acc_norm_stderr": 0.004364838000335614
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.6140350877192983,
23
+ "acc_stderr": 0.03733756969066164,
24
+ "acc_norm": 0.6140350877192983,
25
+ "acc_norm_stderr": 0.03733756969066164
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.6893203883495146,
29
+ "acc_stderr": 0.045821241601615506,
30
+ "acc_norm": 0.6893203883495146,
31
+ "acc_norm_stderr": 0.045821241601615506
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.6526181353767561,
35
+ "acc_stderr": 0.017026671748655728,
36
+ "acc_norm": 0.6526181353767561,
37
+ "acc_norm_stderr": 0.017026671748655728
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.5037037037037037,
41
+ "acc_stderr": 0.043192236258113324,
42
+ "acc_norm": 0.5037037037037037,
43
+ "acc_norm_stderr": 0.043192236258113324
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.37,
47
+ "acc_stderr": 0.048523658709391,
48
+ "acc_norm": 0.37,
49
+ "acc_norm_stderr": 0.048523658709391
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.451063829787234,
53
+ "acc_stderr": 0.032529096196131965,
54
+ "acc_norm": 0.451063829787234,
55
+ "acc_norm_stderr": 0.032529096196131965
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.4939759036144578,
59
+ "acc_stderr": 0.03892212195333045,
60
+ "acc_norm": 0.4939759036144578,
61
+ "acc_norm_stderr": 0.03892212195333045
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.5852090032154341,
65
+ "acc_stderr": 0.02798268045975956,
66
+ "acc_norm": 0.5852090032154341,
67
+ "acc_norm_stderr": 0.02798268045975956
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.6412556053811659,
71
+ "acc_stderr": 0.032190792004199956,
72
+ "acc_norm": 0.6412556053811659,
73
+ "acc_norm_stderr": 0.032190792004199956
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.5954198473282443,
77
+ "acc_stderr": 0.043046937953806645,
78
+ "acc_norm": 0.5954198473282443,
79
+ "acc_norm_stderr": 0.043046937953806645
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.47,
83
+ "acc_stderr": 0.05016135580465919,
84
+ "acc_norm": 0.47,
85
+ "acc_norm_stderr": 0.05016135580465919
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.6616161616161617,
89
+ "acc_stderr": 0.033711241426263014,
90
+ "acc_norm": 0.6616161616161617,
91
+ "acc_norm_stderr": 0.033711241426263014
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.4827586206896552,
95
+ "acc_stderr": 0.041641887201693775,
96
+ "acc_norm": 0.4827586206896552,
97
+ "acc_norm_stderr": 0.041641887201693775
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.2549019607843137,
101
+ "acc_stderr": 0.04336432707993178,
102
+ "acc_norm": 0.2549019607843137,
103
+ "acc_norm_stderr": 0.04336432707993178
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.5882352941176471,
107
+ "acc_stderr": 0.031968769891957786,
108
+ "acc_norm": 0.5882352941176471,
109
+ "acc_norm_stderr": 0.031968769891957786
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.6025641025641025,
113
+ "acc_stderr": 0.024811920017903836,
114
+ "acc_norm": 0.6025641025641025,
115
+ "acc_norm_stderr": 0.024811920017903836
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.66,
119
+ "acc_stderr": 0.04760952285695237,
120
+ "acc_norm": 0.66,
121
+ "acc_norm_stderr": 0.04760952285695237
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.35,
125
+ "acc_stderr": 0.047937248544110196,
126
+ "acc_norm": 0.35,
127
+ "acc_norm_stderr": 0.047937248544110196
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.5925925925925926,
131
+ "acc_stderr": 0.04750077341199984,
132
+ "acc_norm": 0.5925925925925926,
133
+ "acc_norm_stderr": 0.04750077341199984
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.43842364532019706,
137
+ "acc_stderr": 0.03491207857486518,
138
+ "acc_norm": 0.43842364532019706,
139
+ "acc_norm_stderr": 0.03491207857486518
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.567741935483871,
143
+ "acc_stderr": 0.028181739720019413,
144
+ "acc_norm": 0.567741935483871,
145
+ "acc_norm_stderr": 0.028181739720019413
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.7948717948717948,
149
+ "acc_stderr": 0.026453508054040356,
150
+ "acc_norm": 0.7948717948717948,
151
+ "acc_norm_stderr": 0.026453508054040356
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.5169811320754717,
155
+ "acc_stderr": 0.030755120364119905,
156
+ "acc_norm": 0.5169811320754717,
157
+ "acc_norm_stderr": 0.030755120364119905
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.5727272727272728,
161
+ "acc_stderr": 0.047381987035454834,
162
+ "acc_norm": 0.5727272727272728,
163
+ "acc_norm_stderr": 0.047381987035454834
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.3962962962962963,
167
+ "acc_stderr": 0.029822619458533997,
168
+ "acc_norm": 0.3962962962962963,
169
+ "acc_norm_stderr": 0.029822619458533997
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.3708609271523179,
173
+ "acc_stderr": 0.03943966699183629,
174
+ "acc_norm": 0.3708609271523179,
175
+ "acc_norm_stderr": 0.03943966699183629
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.6766169154228856,
179
+ "acc_stderr": 0.03307615947979035,
180
+ "acc_norm": 0.6766169154228856,
181
+ "acc_norm_stderr": 0.03307615947979035
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.49710982658959535,
185
+ "acc_stderr": 0.038124005659748335,
186
+ "acc_norm": 0.49710982658959535,
187
+ "acc_norm_stderr": 0.038124005659748335
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.42592592592592593,
191
+ "acc_stderr": 0.02546714904546955,
192
+ "acc_norm": 0.42592592592592593,
193
+ "acc_norm_stderr": 0.02546714904546955
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.5555555555555556,
197
+ "acc_stderr": 0.04155319955593146,
198
+ "acc_norm": 0.5555555555555556,
199
+ "acc_norm_stderr": 0.04155319955593146
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.4,
203
+ "acc_stderr": 0.04923659639173309,
204
+ "acc_norm": 0.4,
205
+ "acc_norm_stderr": 0.04923659639173309
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.73,
209
+ "acc_stderr": 0.044619604333847394,
210
+ "acc_norm": 0.73,
211
+ "acc_norm_stderr": 0.044619604333847394
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.5549132947976878,
215
+ "acc_stderr": 0.02675625512966377,
216
+ "acc_norm": 0.5549132947976878,
217
+ "acc_norm_stderr": 0.02675625512966377
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.588957055214724,
221
+ "acc_stderr": 0.038656978537853624,
222
+ "acc_norm": 0.588957055214724,
223
+ "acc_norm_stderr": 0.038656978537853624
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.5771604938271605,
227
+ "acc_stderr": 0.027487472980871595,
228
+ "acc_norm": 0.5771604938271605,
229
+ "acc_norm_stderr": 0.027487472980871595
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.4,
233
+ "acc_stderr": 0.049236596391733084,
234
+ "acc_norm": 0.4,
235
+ "acc_norm_stderr": 0.049236596391733084
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.7305699481865285,
239
+ "acc_stderr": 0.032018671228777947,
240
+ "acc_norm": 0.7305699481865285,
241
+ "acc_norm_stderr": 0.032018671228777947
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.42105263157894735,
245
+ "acc_stderr": 0.046446020912223177,
246
+ "acc_norm": 0.42105263157894735,
247
+ "acc_norm_stderr": 0.046446020912223177
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.7064220183486238,
251
+ "acc_stderr": 0.019525151122639663,
252
+ "acc_norm": 0.7064220183486238,
253
+ "acc_norm_stderr": 0.019525151122639663
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.3968253968253968,
257
+ "acc_stderr": 0.04375888492727061,
258
+ "acc_norm": 0.3968253968253968,
259
+ "acc_norm_stderr": 0.04375888492727061
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.545751633986928,
263
+ "acc_stderr": 0.02850980780262659,
264
+ "acc_norm": 0.545751633986928,
265
+ "acc_norm_stderr": 0.02850980780262659
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.55,
269
+ "acc_stderr": 0.05000000000000001,
270
+ "acc_norm": 0.55,
271
+ "acc_norm_stderr": 0.05000000000000001
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.6859504132231405,
275
+ "acc_stderr": 0.04236964753041019,
276
+ "acc_norm": 0.6859504132231405,
277
+ "acc_norm_stderr": 0.04236964753041019
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.6052631578947368,
281
+ "acc_stderr": 0.039777499346220734,
282
+ "acc_norm": 0.6052631578947368,
283
+ "acc_norm_stderr": 0.039777499346220734
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.5392156862745098,
287
+ "acc_stderr": 0.02016552331390791,
288
+ "acc_norm": 0.5392156862745098,
289
+ "acc_norm_stderr": 0.02016552331390791
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.35815602836879434,
293
+ "acc_stderr": 0.02860208586275942,
294
+ "acc_norm": 0.35815602836879434,
295
+ "acc_norm_stderr": 0.02860208586275942
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.4107142857142857,
299
+ "acc_stderr": 0.04669510663875192,
300
+ "acc_norm": 0.4107142857142857,
301
+ "acc_norm_stderr": 0.04669510663875192
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.44907407407407407,
305
+ "acc_stderr": 0.03392238405321617,
306
+ "acc_norm": 0.44907407407407407,
307
+ "acc_norm_stderr": 0.03392238405321617
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.3452513966480447,
311
+ "acc_stderr": 0.015901432608930354,
312
+ "acc_norm": 0.3452513966480447,
313
+ "acc_norm_stderr": 0.015901432608930354
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.43,
317
+ "acc_stderr": 0.049756985195624284,
318
+ "acc_norm": 0.43,
319
+ "acc_norm_stderr": 0.049756985195624284
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.66,
323
+ "acc_stderr": 0.04760952285695238,
324
+ "acc_norm": 0.66,
325
+ "acc_norm_stderr": 0.04760952285695238
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.45588235294117646,
329
+ "acc_stderr": 0.030254372573976694,
330
+ "acc_norm": 0.45588235294117646,
331
+ "acc_norm_stderr": 0.030254372573976694
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.6204081632653061,
335
+ "acc_stderr": 0.031067211262872457,
336
+ "acc_norm": 0.6204081632653061,
337
+ "acc_norm_stderr": 0.031067211262872457
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.6582278481012658,
341
+ "acc_stderr": 0.030874537537553617,
342
+ "acc_norm": 0.6582278481012658,
343
+ "acc_norm_stderr": 0.030874537537553617
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.4152542372881356,
347
+ "acc_stderr": 0.012585471793400667,
348
+ "acc_norm": 0.4152542372881356,
349
+ "acc_norm_stderr": 0.012585471793400667
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.5343137254901961,
353
+ "acc_stderr": 0.03501038327635896,
354
+ "acc_norm": 0.5343137254901961,
355
+ "acc_norm_stderr": 0.03501038327635896
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.5454545454545454,
359
+ "acc_stderr": 0.038881769216741004,
360
+ "acc_norm": 0.5454545454545454,
361
+ "acc_norm_stderr": 0.038881769216741004
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.4663402692778458,
365
+ "mc1_stderr": 0.01746379386716811,
366
+ "mc2": NaN,
367
+ "mc2_stderr": NaN
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.44037780401416765,
371
+ "acc_stderr": 0.01706769977431298,
372
+ "acc_norm": 0.44510035419126326,
373
+ "acc_norm_stderr": 0.01708641743100547
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "x2bee/POLAR-14B-DPO-v1.4",
442
+ "model_sha": "a6e64075fafaa3d5e393ff89c3cb26f9615e6de9",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
eval-results/x2bee/POLAR-14B-HES-DPO-v1.5/result.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 5
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 5
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.6638225255972696,
11
+ "acc_stderr": 0.013804855026205756,
12
+ "acc_norm": 0.7278156996587031,
13
+ "acc_norm_stderr": 0.013006600406423709
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.45648277235610435,
17
+ "acc_stderr": 0.004970846697552306,
18
+ "acc_norm": 0.6349332802230632,
19
+ "acc_norm_stderr": 0.004804649197163697
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.7309941520467836,
23
+ "acc_stderr": 0.0340105262010409,
24
+ "acc_norm": 0.7309941520467836,
25
+ "acc_norm_stderr": 0.0340105262010409
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.7766990291262136,
29
+ "acc_stderr": 0.04123553189891431,
30
+ "acc_norm": 0.7766990291262136,
31
+ "acc_norm_stderr": 0.04123553189891431
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.7343550446998723,
35
+ "acc_stderr": 0.01579430248788872,
36
+ "acc_norm": 0.7343550446998723,
37
+ "acc_norm_stderr": 0.01579430248788872
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.45185185185185184,
41
+ "acc_stderr": 0.04299268905480863,
42
+ "acc_norm": 0.45185185185185184,
43
+ "acc_norm_stderr": 0.04299268905480863
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.35,
47
+ "acc_stderr": 0.04793724854411019,
48
+ "acc_norm": 0.35,
49
+ "acc_norm_stderr": 0.04793724854411019
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.5276595744680851,
53
+ "acc_stderr": 0.03263597118409769,
54
+ "acc_norm": 0.5276595744680851,
55
+ "acc_norm_stderr": 0.03263597118409769
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.4759036144578313,
59
+ "acc_stderr": 0.03887971849597264,
60
+ "acc_norm": 0.4759036144578313,
61
+ "acc_norm_stderr": 0.03887971849597264
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.6559485530546624,
65
+ "acc_stderr": 0.026981478043648043,
66
+ "acc_norm": 0.6559485530546624,
67
+ "acc_norm_stderr": 0.026981478043648043
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.6412556053811659,
71
+ "acc_stderr": 0.032190792004199956,
72
+ "acc_norm": 0.6412556053811659,
73
+ "acc_norm_stderr": 0.032190792004199956
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.648854961832061,
77
+ "acc_stderr": 0.04186445163013751,
78
+ "acc_norm": 0.648854961832061,
79
+ "acc_norm_stderr": 0.04186445163013751
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.54,
83
+ "acc_stderr": 0.05009082659620333,
84
+ "acc_norm": 0.54,
85
+ "acc_norm_stderr": 0.05009082659620333
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.7777777777777778,
89
+ "acc_stderr": 0.029620227874790465,
90
+ "acc_norm": 0.7777777777777778,
91
+ "acc_norm_stderr": 0.029620227874790465
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.5103448275862069,
95
+ "acc_stderr": 0.04165774775728762,
96
+ "acc_norm": 0.5103448275862069,
97
+ "acc_norm_stderr": 0.04165774775728762
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.3627450980392157,
101
+ "acc_stderr": 0.04784060704105655,
102
+ "acc_norm": 0.3627450980392157,
103
+ "acc_norm_stderr": 0.04784060704105655
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.6680672268907563,
107
+ "acc_stderr": 0.03058869701378364,
108
+ "acc_norm": 0.6680672268907563,
109
+ "acc_norm_stderr": 0.03058869701378364
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.6384615384615384,
113
+ "acc_stderr": 0.024359581465397,
114
+ "acc_norm": 0.6384615384615384,
115
+ "acc_norm_stderr": 0.024359581465397
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.65,
119
+ "acc_stderr": 0.04793724854411021,
120
+ "acc_norm": 0.65,
121
+ "acc_norm_stderr": 0.04793724854411021
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.37,
125
+ "acc_stderr": 0.04852365870939099,
126
+ "acc_norm": 0.37,
127
+ "acc_norm_stderr": 0.04852365870939099
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.6851851851851852,
131
+ "acc_stderr": 0.04489931073591312,
132
+ "acc_norm": 0.6851851851851852,
133
+ "acc_norm_stderr": 0.04489931073591312
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.46798029556650245,
137
+ "acc_stderr": 0.035107665979592154,
138
+ "acc_norm": 0.46798029556650245,
139
+ "acc_norm_stderr": 0.035107665979592154
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.6548387096774193,
143
+ "acc_stderr": 0.02704574657353432,
144
+ "acc_norm": 0.6548387096774193,
145
+ "acc_norm_stderr": 0.02704574657353432
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.8162393162393162,
149
+ "acc_stderr": 0.025372139671722933,
150
+ "acc_norm": 0.8162393162393162,
151
+ "acc_norm_stderr": 0.025372139671722933
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.5773584905660377,
155
+ "acc_stderr": 0.03040233144576954,
156
+ "acc_norm": 0.5773584905660377,
157
+ "acc_norm_stderr": 0.03040233144576954
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.6454545454545455,
161
+ "acc_stderr": 0.045820048415054174,
162
+ "acc_norm": 0.6454545454545455,
163
+ "acc_norm_stderr": 0.045820048415054174
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.4074074074074074,
167
+ "acc_stderr": 0.029958249250082118,
168
+ "acc_norm": 0.4074074074074074,
169
+ "acc_norm_stderr": 0.029958249250082118
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.3509933774834437,
173
+ "acc_stderr": 0.03896981964257375,
174
+ "acc_norm": 0.3509933774834437,
175
+ "acc_norm_stderr": 0.03896981964257375
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.7263681592039801,
179
+ "acc_stderr": 0.03152439186555404,
180
+ "acc_norm": 0.7263681592039801,
181
+ "acc_norm_stderr": 0.03152439186555404
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.5375722543352601,
185
+ "acc_stderr": 0.0380168510452446,
186
+ "acc_norm": 0.5375722543352601,
187
+ "acc_norm_stderr": 0.0380168510452446
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.4365079365079365,
191
+ "acc_stderr": 0.025542846817400496,
192
+ "acc_norm": 0.4365079365079365,
193
+ "acc_norm_stderr": 0.025542846817400496
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.5694444444444444,
197
+ "acc_stderr": 0.04140685639111503,
198
+ "acc_norm": 0.5694444444444444,
199
+ "acc_norm_stderr": 0.04140685639111503
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.43,
203
+ "acc_stderr": 0.049756985195624284,
204
+ "acc_norm": 0.43,
205
+ "acc_norm_stderr": 0.049756985195624284
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.78,
209
+ "acc_stderr": 0.04163331998932263,
210
+ "acc_norm": 0.78,
211
+ "acc_norm_stderr": 0.04163331998932263
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.6098265895953757,
215
+ "acc_stderr": 0.026261677607806642,
216
+ "acc_norm": 0.6098265895953757,
217
+ "acc_norm_stderr": 0.026261677607806642
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.656441717791411,
221
+ "acc_stderr": 0.03731133519673893,
222
+ "acc_norm": 0.656441717791411,
223
+ "acc_norm_stderr": 0.03731133519673893
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.6574074074074074,
227
+ "acc_stderr": 0.02640614597362568,
228
+ "acc_norm": 0.6574074074074074,
229
+ "acc_norm_stderr": 0.02640614597362568
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.38,
233
+ "acc_stderr": 0.04878317312145632,
234
+ "acc_norm": 0.38,
235
+ "acc_norm_stderr": 0.04878317312145632
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.7668393782383419,
239
+ "acc_stderr": 0.03051611137147601,
240
+ "acc_norm": 0.7668393782383419,
241
+ "acc_norm_stderr": 0.03051611137147601
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.45614035087719296,
245
+ "acc_stderr": 0.046854730419077895,
246
+ "acc_norm": 0.45614035087719296,
247
+ "acc_norm_stderr": 0.046854730419077895
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.7853211009174312,
251
+ "acc_stderr": 0.017604304149256494,
252
+ "acc_norm": 0.7853211009174312,
253
+ "acc_norm_stderr": 0.017604304149256494
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.4523809523809524,
257
+ "acc_stderr": 0.044518079590553275,
258
+ "acc_norm": 0.4523809523809524,
259
+ "acc_norm_stderr": 0.044518079590553275
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.6405228758169934,
263
+ "acc_stderr": 0.027475969910660952,
264
+ "acc_norm": 0.6405228758169934,
265
+ "acc_norm_stderr": 0.027475969910660952
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.66,
269
+ "acc_stderr": 0.04760952285695237,
270
+ "acc_norm": 0.66,
271
+ "acc_norm_stderr": 0.04760952285695237
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.7933884297520661,
275
+ "acc_stderr": 0.03695980128098824,
276
+ "acc_norm": 0.7933884297520661,
277
+ "acc_norm_stderr": 0.03695980128098824
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.6842105263157895,
281
+ "acc_stderr": 0.0378272898086547,
282
+ "acc_norm": 0.6842105263157895,
283
+ "acc_norm_stderr": 0.0378272898086547
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.5964052287581699,
287
+ "acc_stderr": 0.019848280168401164,
288
+ "acc_norm": 0.5964052287581699,
289
+ "acc_norm_stderr": 0.019848280168401164
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.4397163120567376,
293
+ "acc_stderr": 0.02960991207559411,
294
+ "acc_norm": 0.4397163120567376,
295
+ "acc_norm_stderr": 0.02960991207559411
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.39285714285714285,
299
+ "acc_stderr": 0.04635550135609976,
300
+ "acc_norm": 0.39285714285714285,
301
+ "acc_norm_stderr": 0.04635550135609976
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.5787037037037037,
305
+ "acc_stderr": 0.03367462138896078,
306
+ "acc_norm": 0.5787037037037037,
307
+ "acc_norm_stderr": 0.03367462138896078
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.264804469273743,
311
+ "acc_stderr": 0.01475690648326066,
312
+ "acc_norm": 0.264804469273743,
313
+ "acc_norm_stderr": 0.01475690648326066
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.52,
317
+ "acc_stderr": 0.050211673156867795,
318
+ "acc_norm": 0.52,
319
+ "acc_norm_stderr": 0.050211673156867795
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.7,
323
+ "acc_stderr": 0.04605661864718381,
324
+ "acc_norm": 0.7,
325
+ "acc_norm_stderr": 0.04605661864718381
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.5588235294117647,
329
+ "acc_stderr": 0.03016191193076711,
330
+ "acc_norm": 0.5588235294117647,
331
+ "acc_norm_stderr": 0.03016191193076711
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.6448979591836734,
335
+ "acc_stderr": 0.030635655150387634,
336
+ "acc_norm": 0.6448979591836734,
337
+ "acc_norm_stderr": 0.030635655150387634
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.7426160337552743,
341
+ "acc_stderr": 0.028458820991460302,
342
+ "acc_norm": 0.7426160337552743,
343
+ "acc_norm_stderr": 0.028458820991460302
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.44654498044328556,
347
+ "acc_stderr": 0.012697046024399661,
348
+ "acc_norm": 0.44654498044328556,
349
+ "acc_norm_stderr": 0.012697046024399661
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.6225490196078431,
353
+ "acc_stderr": 0.03402272044340703,
354
+ "acc_norm": 0.6225490196078431,
355
+ "acc_norm_stderr": 0.03402272044340703
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.6303030303030303,
359
+ "acc_stderr": 0.03769430314512569,
360
+ "acc_norm": 0.6303030303030303,
361
+ "acc_norm_stderr": 0.03769430314512569
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.6634026927784578,
365
+ "mc1_stderr": 0.0165424128094949,
366
+ "mc2": 0.7515104740134964,
367
+ "mc2_stderr": 0.014200593490054807
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.5147579693034239,
371
+ "acc_stderr": 0.01718286443499856,
372
+ "acc_norm": 0.526564344746163,
373
+ "acc_norm_stderr": 0.017166075717577747
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "x2bee/POLAR-14B-HES-DPO-v1.5",
442
+ "model_sha": "f0bc8e2566ba28c8232d7c690098e634ea894e8d",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
eval-results/x2bee/POLAR-14B-SON-SFT-v0.1/result.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 3
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 3
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.6646757679180887,
11
+ "acc_stderr": 0.013796182947785564,
12
+ "acc_norm": 0.7244027303754266,
13
+ "acc_norm_stderr": 0.01305716965576184
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.46036646086436966,
17
+ "acc_stderr": 0.004974080638364276,
18
+ "acc_norm": 0.6195976897032464,
19
+ "acc_norm_stderr": 0.004844935327599196
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.7602339181286549,
23
+ "acc_stderr": 0.03274485211946956,
24
+ "acc_norm": 0.7602339181286549,
25
+ "acc_norm_stderr": 0.03274485211946956
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.7766990291262136,
29
+ "acc_stderr": 0.04123553189891431,
30
+ "acc_norm": 0.7766990291262136,
31
+ "acc_norm_stderr": 0.04123553189891431
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.7381864623243933,
35
+ "acc_stderr": 0.01572083867844526,
36
+ "acc_norm": 0.7381864623243933,
37
+ "acc_norm_stderr": 0.01572083867844526
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.5037037037037037,
41
+ "acc_stderr": 0.04319223625811331,
42
+ "acc_norm": 0.5037037037037037,
43
+ "acc_norm_stderr": 0.04319223625811331
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.35,
47
+ "acc_stderr": 0.04793724854411019,
48
+ "acc_norm": 0.35,
49
+ "acc_norm_stderr": 0.04793724854411019
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.5404255319148936,
53
+ "acc_stderr": 0.032579014820998335,
54
+ "acc_norm": 0.5404255319148936,
55
+ "acc_norm_stderr": 0.032579014820998335
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.5180722891566265,
59
+ "acc_stderr": 0.038899512528272166,
60
+ "acc_norm": 0.5180722891566265,
61
+ "acc_norm_stderr": 0.038899512528272166
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.6559485530546624,
65
+ "acc_stderr": 0.026981478043648043,
66
+ "acc_norm": 0.6559485530546624,
67
+ "acc_norm_stderr": 0.026981478043648043
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.6591928251121076,
71
+ "acc_stderr": 0.0318114974705536,
72
+ "acc_norm": 0.6591928251121076,
73
+ "acc_norm_stderr": 0.0318114974705536
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.6564885496183206,
77
+ "acc_stderr": 0.041649760719448786,
78
+ "acc_norm": 0.6564885496183206,
79
+ "acc_norm_stderr": 0.041649760719448786
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.52,
83
+ "acc_stderr": 0.050211673156867795,
84
+ "acc_norm": 0.52,
85
+ "acc_norm_stderr": 0.050211673156867795
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.7575757575757576,
89
+ "acc_stderr": 0.030532892233932036,
90
+ "acc_norm": 0.7575757575757576,
91
+ "acc_norm_stderr": 0.030532892233932036
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.5586206896551724,
95
+ "acc_stderr": 0.04137931034482757,
96
+ "acc_norm": 0.5586206896551724,
97
+ "acc_norm_stderr": 0.04137931034482757
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.3137254901960784,
101
+ "acc_stderr": 0.04617034827006717,
102
+ "acc_norm": 0.3137254901960784,
103
+ "acc_norm_stderr": 0.04617034827006717
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.6512605042016807,
107
+ "acc_stderr": 0.03095663632856655,
108
+ "acc_norm": 0.6512605042016807,
109
+ "acc_norm_stderr": 0.03095663632856655
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.6230769230769231,
113
+ "acc_stderr": 0.024570975364225995,
114
+ "acc_norm": 0.6230769230769231,
115
+ "acc_norm_stderr": 0.024570975364225995
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.73,
119
+ "acc_stderr": 0.04461960433384739,
120
+ "acc_norm": 0.73,
121
+ "acc_norm_stderr": 0.04461960433384739
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.32,
125
+ "acc_stderr": 0.04688261722621505,
126
+ "acc_norm": 0.32,
127
+ "acc_norm_stderr": 0.04688261722621505
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.7037037037037037,
131
+ "acc_stderr": 0.04414343666854933,
132
+ "acc_norm": 0.7037037037037037,
133
+ "acc_norm_stderr": 0.04414343666854933
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.4630541871921182,
137
+ "acc_stderr": 0.035083705204426656,
138
+ "acc_norm": 0.4630541871921182,
139
+ "acc_norm_stderr": 0.035083705204426656
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.603225806451613,
143
+ "acc_stderr": 0.027831231605767944,
144
+ "acc_norm": 0.603225806451613,
145
+ "acc_norm_stderr": 0.027831231605767944
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.8205128205128205,
149
+ "acc_stderr": 0.025140935950335435,
150
+ "acc_norm": 0.8205128205128205,
151
+ "acc_norm_stderr": 0.025140935950335435
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.5962264150943396,
155
+ "acc_stderr": 0.03019761160019795,
156
+ "acc_norm": 0.5962264150943396,
157
+ "acc_norm_stderr": 0.03019761160019795
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.6181818181818182,
161
+ "acc_stderr": 0.046534298079135075,
162
+ "acc_norm": 0.6181818181818182,
163
+ "acc_norm_stderr": 0.046534298079135075
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.37407407407407406,
167
+ "acc_stderr": 0.029502861128955293,
168
+ "acc_norm": 0.37407407407407406,
169
+ "acc_norm_stderr": 0.029502861128955293
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.32450331125827814,
173
+ "acc_stderr": 0.038227469376587525,
174
+ "acc_norm": 0.32450331125827814,
175
+ "acc_norm_stderr": 0.038227469376587525
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.7164179104477612,
179
+ "acc_stderr": 0.03187187537919796,
180
+ "acc_norm": 0.7164179104477612,
181
+ "acc_norm_stderr": 0.03187187537919796
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.5375722543352601,
185
+ "acc_stderr": 0.03801685104524458,
186
+ "acc_norm": 0.5375722543352601,
187
+ "acc_norm_stderr": 0.03801685104524458
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.42857142857142855,
191
+ "acc_stderr": 0.025487187147859372,
192
+ "acc_norm": 0.42857142857142855,
193
+ "acc_norm_stderr": 0.025487187147859372
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.5902777777777778,
197
+ "acc_stderr": 0.04112490974670787,
198
+ "acc_norm": 0.5902777777777778,
199
+ "acc_norm_stderr": 0.04112490974670787
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.45,
203
+ "acc_stderr": 0.049999999999999996,
204
+ "acc_norm": 0.45,
205
+ "acc_norm_stderr": 0.049999999999999996
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.78,
209
+ "acc_stderr": 0.04163331998932263,
210
+ "acc_norm": 0.78,
211
+ "acc_norm_stderr": 0.04163331998932263
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.6184971098265896,
215
+ "acc_stderr": 0.026152198619726803,
216
+ "acc_norm": 0.6184971098265896,
217
+ "acc_norm_stderr": 0.026152198619726803
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.6441717791411042,
221
+ "acc_stderr": 0.03761521380046734,
222
+ "acc_norm": 0.6441717791411042,
223
+ "acc_norm_stderr": 0.03761521380046734
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.6944444444444444,
227
+ "acc_stderr": 0.025630824975621365,
228
+ "acc_norm": 0.6944444444444444,
229
+ "acc_norm_stderr": 0.025630824975621365
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.43,
233
+ "acc_stderr": 0.049756985195624284,
234
+ "acc_norm": 0.43,
235
+ "acc_norm_stderr": 0.049756985195624284
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.7927461139896373,
239
+ "acc_stderr": 0.029252823291803638,
240
+ "acc_norm": 0.7927461139896373,
241
+ "acc_norm_stderr": 0.029252823291803638
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.43859649122807015,
245
+ "acc_stderr": 0.04668000738510455,
246
+ "acc_norm": 0.43859649122807015,
247
+ "acc_norm_stderr": 0.04668000738510455
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.7853211009174312,
251
+ "acc_stderr": 0.017604304149256494,
252
+ "acc_norm": 0.7853211009174312,
253
+ "acc_norm_stderr": 0.017604304149256494
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.3968253968253968,
257
+ "acc_stderr": 0.04375888492727062,
258
+ "acc_norm": 0.3968253968253968,
259
+ "acc_norm_stderr": 0.04375888492727062
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.6437908496732027,
263
+ "acc_stderr": 0.027420477662629245,
264
+ "acc_norm": 0.6437908496732027,
265
+ "acc_norm_stderr": 0.027420477662629245
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.63,
269
+ "acc_stderr": 0.04852365870939099,
270
+ "acc_norm": 0.63,
271
+ "acc_norm_stderr": 0.04852365870939099
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.7603305785123967,
275
+ "acc_stderr": 0.03896878985070415,
276
+ "acc_norm": 0.7603305785123967,
277
+ "acc_norm_stderr": 0.03896878985070415
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.625,
281
+ "acc_stderr": 0.039397364351956274,
282
+ "acc_norm": 0.625,
283
+ "acc_norm_stderr": 0.039397364351956274
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.619281045751634,
287
+ "acc_stderr": 0.019643801557924806,
288
+ "acc_norm": 0.619281045751634,
289
+ "acc_norm_stderr": 0.019643801557924806
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.45390070921985815,
293
+ "acc_stderr": 0.029700453247291467,
294
+ "acc_norm": 0.45390070921985815,
295
+ "acc_norm_stderr": 0.029700453247291467
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.41964285714285715,
299
+ "acc_stderr": 0.04684099321077106,
300
+ "acc_norm": 0.41964285714285715,
301
+ "acc_norm_stderr": 0.04684099321077106
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.5555555555555556,
305
+ "acc_stderr": 0.03388857118502326,
306
+ "acc_norm": 0.5555555555555556,
307
+ "acc_norm_stderr": 0.03388857118502326
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.3575418994413408,
311
+ "acc_stderr": 0.016029394474894893,
312
+ "acc_norm": 0.3575418994413408,
313
+ "acc_norm_stderr": 0.016029394474894893
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.52,
317
+ "acc_stderr": 0.050211673156867795,
318
+ "acc_norm": 0.52,
319
+ "acc_norm_stderr": 0.050211673156867795
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.75,
323
+ "acc_stderr": 0.04351941398892446,
324
+ "acc_norm": 0.75,
325
+ "acc_norm_stderr": 0.04351941398892446
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.5735294117647058,
329
+ "acc_stderr": 0.03004261583271486,
330
+ "acc_norm": 0.5735294117647058,
331
+ "acc_norm_stderr": 0.03004261583271486
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.6816326530612244,
335
+ "acc_stderr": 0.02982253379398204,
336
+ "acc_norm": 0.6816326530612244,
337
+ "acc_norm_stderr": 0.02982253379398204
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.7468354430379747,
341
+ "acc_stderr": 0.028304657943035293,
342
+ "acc_norm": 0.7468354430379747,
343
+ "acc_norm_stderr": 0.028304657943035293
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.455019556714472,
347
+ "acc_stderr": 0.012718456618701789,
348
+ "acc_norm": 0.455019556714472,
349
+ "acc_norm_stderr": 0.012718456618701789
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.6666666666666666,
353
+ "acc_stderr": 0.033086111132364364,
354
+ "acc_norm": 0.6666666666666666,
355
+ "acc_norm_stderr": 0.033086111132364364
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.6484848484848484,
359
+ "acc_stderr": 0.037282069986826503,
360
+ "acc_norm": 0.6484848484848484,
361
+ "acc_norm_stderr": 0.037282069986826503
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.605875152998776,
365
+ "mc1_stderr": 0.017106588140700332,
366
+ "mc2": 0.7254831072808595,
367
+ "mc2_stderr": 0.014162522228042162
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.5926800472255017,
371
+ "acc_stderr": 0.01689245669519127,
372
+ "acc_norm": 0.6269185360094451,
373
+ "acc_norm_stderr": 0.016627318275137453
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "x2bee/POLAR-14B-SON-SFT-v0.1",
442
+ "model_sha": "01286a13088332c1eda4279b5bcfa7a0a33e145f",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
eval-results/x2bee/POLAR-14B-v0.2/result.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 2
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 2
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.7465870307167235,
11
+ "acc_stderr": 0.012710896778378602,
12
+ "acc_norm": 0.7687713310580204,
13
+ "acc_norm_stderr": 0.012320858834772264
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.681736705835491,
17
+ "acc_stderr": 0.004648503177353952,
18
+ "acc_norm": 0.7999402509460267,
19
+ "acc_norm_stderr": 0.003992272261659531
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.6549707602339181,
23
+ "acc_stderr": 0.036459813773888065,
24
+ "acc_norm": 0.6549707602339181,
25
+ "acc_norm_stderr": 0.036459813773888065
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.7378640776699029,
29
+ "acc_stderr": 0.043546310772605956,
30
+ "acc_norm": 0.7378640776699029,
31
+ "acc_norm_stderr": 0.043546310772605956
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.6922094508301405,
35
+ "acc_stderr": 0.016506045045155633,
36
+ "acc_norm": 0.6922094508301405,
37
+ "acc_norm_stderr": 0.016506045045155633
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.4666666666666667,
41
+ "acc_stderr": 0.043097329010363554,
42
+ "acc_norm": 0.4666666666666667,
43
+ "acc_norm_stderr": 0.043097329010363554
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.35,
47
+ "acc_stderr": 0.047937248544110196,
48
+ "acc_norm": 0.35,
49
+ "acc_norm_stderr": 0.047937248544110196
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.4595744680851064,
53
+ "acc_stderr": 0.03257901482099836,
54
+ "acc_norm": 0.4595744680851064,
55
+ "acc_norm_stderr": 0.03257901482099836
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.4879518072289157,
59
+ "acc_stderr": 0.03891364495835821,
60
+ "acc_norm": 0.4879518072289157,
61
+ "acc_norm_stderr": 0.03891364495835821
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.6045016077170418,
65
+ "acc_stderr": 0.027770918531427834,
66
+ "acc_norm": 0.6045016077170418,
67
+ "acc_norm_stderr": 0.027770918531427834
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.6233183856502242,
71
+ "acc_stderr": 0.03252113489929188,
72
+ "acc_norm": 0.6233183856502242,
73
+ "acc_norm_stderr": 0.03252113489929188
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.6412213740458015,
77
+ "acc_stderr": 0.04206739313864908,
78
+ "acc_norm": 0.6412213740458015,
79
+ "acc_norm_stderr": 0.04206739313864908
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.51,
83
+ "acc_stderr": 0.05024183937956911,
84
+ "acc_norm": 0.51,
85
+ "acc_norm_stderr": 0.05024183937956911
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.7222222222222222,
89
+ "acc_stderr": 0.03191178226713547,
90
+ "acc_norm": 0.7222222222222222,
91
+ "acc_norm_stderr": 0.03191178226713547
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.5241379310344828,
95
+ "acc_stderr": 0.0416180850350153,
96
+ "acc_norm": 0.5241379310344828,
97
+ "acc_norm_stderr": 0.0416180850350153
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.3235294117647059,
101
+ "acc_stderr": 0.046550104113196177,
102
+ "acc_norm": 0.3235294117647059,
103
+ "acc_norm_stderr": 0.046550104113196177
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.6764705882352942,
107
+ "acc_stderr": 0.030388353551886793,
108
+ "acc_norm": 0.6764705882352942,
109
+ "acc_norm_stderr": 0.030388353551886793
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.6384615384615384,
113
+ "acc_stderr": 0.024359581465397,
114
+ "acc_norm": 0.6384615384615384,
115
+ "acc_norm_stderr": 0.024359581465397
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.65,
119
+ "acc_stderr": 0.0479372485441102,
120
+ "acc_norm": 0.65,
121
+ "acc_norm_stderr": 0.0479372485441102
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.31,
125
+ "acc_stderr": 0.04648231987117316,
126
+ "acc_norm": 0.31,
127
+ "acc_norm_stderr": 0.04648231987117316
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.6296296296296297,
131
+ "acc_stderr": 0.04668408033024931,
132
+ "acc_norm": 0.6296296296296297,
133
+ "acc_norm_stderr": 0.04668408033024931
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.4729064039408867,
137
+ "acc_stderr": 0.03512819077876105,
138
+ "acc_norm": 0.4729064039408867,
139
+ "acc_norm_stderr": 0.03512819077876105
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.5709677419354838,
143
+ "acc_stderr": 0.028156036538233193,
144
+ "acc_norm": 0.5709677419354838,
145
+ "acc_norm_stderr": 0.028156036538233193
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.8034188034188035,
149
+ "acc_stderr": 0.026035386098951292,
150
+ "acc_norm": 0.8034188034188035,
151
+ "acc_norm_stderr": 0.026035386098951292
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.5547169811320755,
155
+ "acc_stderr": 0.030588052974270655,
156
+ "acc_norm": 0.5547169811320755,
157
+ "acc_norm_stderr": 0.030588052974270655
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.6363636363636364,
161
+ "acc_stderr": 0.04607582090719976,
162
+ "acc_norm": 0.6363636363636364,
163
+ "acc_norm_stderr": 0.04607582090719976
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.3592592592592593,
167
+ "acc_stderr": 0.029252905927251976,
168
+ "acc_norm": 0.3592592592592593,
169
+ "acc_norm_stderr": 0.029252905927251976
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.3576158940397351,
173
+ "acc_stderr": 0.03913453431177258,
174
+ "acc_norm": 0.3576158940397351,
175
+ "acc_norm_stderr": 0.03913453431177258
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.6268656716417911,
179
+ "acc_stderr": 0.034198326081760065,
180
+ "acc_norm": 0.6268656716417911,
181
+ "acc_norm_stderr": 0.034198326081760065
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.48554913294797686,
185
+ "acc_stderr": 0.03810871630454764,
186
+ "acc_norm": 0.48554913294797686,
187
+ "acc_norm_stderr": 0.03810871630454764
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.4497354497354497,
191
+ "acc_stderr": 0.025620857042936648,
192
+ "acc_norm": 0.4497354497354497,
193
+ "acc_norm_stderr": 0.025620857042936648
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.6041666666666666,
197
+ "acc_stderr": 0.04089465449325582,
198
+ "acc_norm": 0.6041666666666666,
199
+ "acc_norm_stderr": 0.04089465449325582
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.32,
203
+ "acc_stderr": 0.046882617226215034,
204
+ "acc_norm": 0.32,
205
+ "acc_norm_stderr": 0.046882617226215034
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.71,
209
+ "acc_stderr": 0.045604802157206824,
210
+ "acc_norm": 0.71,
211
+ "acc_norm_stderr": 0.045604802157206824
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.5664739884393064,
215
+ "acc_stderr": 0.026680134761679217,
216
+ "acc_norm": 0.5664739884393064,
217
+ "acc_norm_stderr": 0.026680134761679217
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.6196319018404908,
221
+ "acc_stderr": 0.038142698932618374,
222
+ "acc_norm": 0.6196319018404908,
223
+ "acc_norm_stderr": 0.038142698932618374
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.6574074074074074,
227
+ "acc_stderr": 0.026406145973625686,
228
+ "acc_norm": 0.6574074074074074,
229
+ "acc_norm_stderr": 0.026406145973625686
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.37,
233
+ "acc_stderr": 0.04852365870939098,
234
+ "acc_norm": 0.37,
235
+ "acc_norm_stderr": 0.04852365870939098
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.7616580310880829,
239
+ "acc_stderr": 0.030748905363909895,
240
+ "acc_norm": 0.7616580310880829,
241
+ "acc_norm_stderr": 0.030748905363909895
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.5,
245
+ "acc_stderr": 0.047036043419179864,
246
+ "acc_norm": 0.5,
247
+ "acc_norm_stderr": 0.047036043419179864
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.7211009174311926,
251
+ "acc_stderr": 0.01922746887646353,
252
+ "acc_norm": 0.7211009174311926,
253
+ "acc_norm_stderr": 0.01922746887646353
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.42857142857142855,
257
+ "acc_stderr": 0.0442626668137991,
258
+ "acc_norm": 0.42857142857142855,
259
+ "acc_norm_stderr": 0.0442626668137991
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.5816993464052288,
263
+ "acc_stderr": 0.0282451340243873,
264
+ "acc_norm": 0.5816993464052288,
265
+ "acc_norm_stderr": 0.0282451340243873
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.73,
269
+ "acc_stderr": 0.044619604333847394,
270
+ "acc_norm": 0.73,
271
+ "acc_norm_stderr": 0.044619604333847394
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.7107438016528925,
275
+ "acc_stderr": 0.041391127276354626,
276
+ "acc_norm": 0.7107438016528925,
277
+ "acc_norm_stderr": 0.041391127276354626
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.6513157894736842,
281
+ "acc_stderr": 0.038781398887976104,
282
+ "acc_norm": 0.6513157894736842,
283
+ "acc_norm_stderr": 0.038781398887976104
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.5686274509803921,
287
+ "acc_stderr": 0.020036393768352624,
288
+ "acc_norm": 0.5686274509803921,
289
+ "acc_norm_stderr": 0.020036393768352624
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.45390070921985815,
293
+ "acc_stderr": 0.029700453247291477,
294
+ "acc_norm": 0.45390070921985815,
295
+ "acc_norm_stderr": 0.029700453247291477
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.4642857142857143,
299
+ "acc_stderr": 0.04733667890053756,
300
+ "acc_norm": 0.4642857142857143,
301
+ "acc_norm_stderr": 0.04733667890053756
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.5092592592592593,
305
+ "acc_stderr": 0.034093869469927006,
306
+ "acc_norm": 0.5092592592592593,
307
+ "acc_norm_stderr": 0.034093869469927006
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.37206703910614525,
311
+ "acc_stderr": 0.016165847583563295,
312
+ "acc_norm": 0.37206703910614525,
313
+ "acc_norm_stderr": 0.016165847583563295
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.43,
317
+ "acc_stderr": 0.049756985195624284,
318
+ "acc_norm": 0.43,
319
+ "acc_norm_stderr": 0.049756985195624284
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.71,
323
+ "acc_stderr": 0.045604802157206845,
324
+ "acc_norm": 0.71,
325
+ "acc_norm_stderr": 0.045604802157206845
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.5404411764705882,
329
+ "acc_stderr": 0.030273325077345755,
330
+ "acc_norm": 0.5404411764705882,
331
+ "acc_norm_stderr": 0.030273325077345755
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.6122448979591837,
335
+ "acc_stderr": 0.03119223072679566,
336
+ "acc_norm": 0.6122448979591837,
337
+ "acc_norm_stderr": 0.03119223072679566
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.7257383966244726,
341
+ "acc_stderr": 0.029041333510598025,
342
+ "acc_norm": 0.7257383966244726,
343
+ "acc_norm_stderr": 0.029041333510598025
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.4641460234680574,
347
+ "acc_stderr": 0.01273736131873058,
348
+ "acc_norm": 0.4641460234680574,
349
+ "acc_norm_stderr": 0.01273736131873058
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.6568627450980392,
353
+ "acc_stderr": 0.03332139944668086,
354
+ "acc_norm": 0.6568627450980392,
355
+ "acc_norm_stderr": 0.03332139944668086
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.6,
359
+ "acc_stderr": 0.03825460278380025,
360
+ "acc_norm": 0.6,
361
+ "acc_norm_stderr": 0.03825460278380025
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.7246022031823746,
365
+ "mc1_stderr": 0.01563813566777552,
366
+ "mc2": 0.8107575910195236,
367
+ "mc2_stderr": 0.013335029489665237
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.525383707201889,
371
+ "acc_stderr": 0.017168187201429253,
372
+ "acc_norm": 0.5442739079102715,
373
+ "acc_norm_stderr": 0.017122829143292655
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "x2bee/POLAR-14B-v0.2",
442
+ "model_sha": "8d905623a3972e11260420130039c62e115cbbaa",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
eval-results/x2bee/POLAR-14B-v0.5/result.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 1
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 1
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.75,
11
+ "acc_stderr": 0.012653835621466646,
12
+ "acc_norm": 0.7798634812286689,
13
+ "acc_norm_stderr": 0.012108124883460988
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.6500697072296355,
17
+ "acc_stderr": 0.004759729267943182,
18
+ "acc_norm": 0.775542720573591,
19
+ "acc_norm_stderr": 0.004163717220873764
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.6374269005847953,
23
+ "acc_stderr": 0.036871306155620606,
24
+ "acc_norm": 0.6374269005847953,
25
+ "acc_norm_stderr": 0.036871306155620606
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.7087378640776699,
29
+ "acc_stderr": 0.044986763205729224,
30
+ "acc_norm": 0.7087378640776699,
31
+ "acc_norm_stderr": 0.044986763205729224
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.6730523627075351,
35
+ "acc_stderr": 0.016774908180131484,
36
+ "acc_norm": 0.6730523627075351,
37
+ "acc_norm_stderr": 0.016774908180131484
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.45185185185185184,
41
+ "acc_stderr": 0.04299268905480864,
42
+ "acc_norm": 0.45185185185185184,
43
+ "acc_norm_stderr": 0.04299268905480864
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.36,
47
+ "acc_stderr": 0.048241815132442176,
48
+ "acc_norm": 0.36,
49
+ "acc_norm_stderr": 0.048241815132442176
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.4723404255319149,
53
+ "acc_stderr": 0.03263597118409769,
54
+ "acc_norm": 0.4723404255319149,
55
+ "acc_norm_stderr": 0.03263597118409769
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.46987951807228917,
59
+ "acc_stderr": 0.03885425420866766,
60
+ "acc_norm": 0.46987951807228917,
61
+ "acc_norm_stderr": 0.03885425420866766
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.594855305466238,
65
+ "acc_stderr": 0.027882383791325963,
66
+ "acc_norm": 0.594855305466238,
67
+ "acc_norm_stderr": 0.027882383791325963
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.6412556053811659,
71
+ "acc_stderr": 0.032190792004199956,
72
+ "acc_norm": 0.6412556053811659,
73
+ "acc_norm_stderr": 0.032190792004199956
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.5954198473282443,
77
+ "acc_stderr": 0.043046937953806645,
78
+ "acc_norm": 0.5954198473282443,
79
+ "acc_norm_stderr": 0.043046937953806645
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.5,
83
+ "acc_stderr": 0.050251890762960605,
84
+ "acc_norm": 0.5,
85
+ "acc_norm_stderr": 0.050251890762960605
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.7272727272727273,
89
+ "acc_stderr": 0.03173071239071724,
90
+ "acc_norm": 0.7272727272727273,
91
+ "acc_norm_stderr": 0.03173071239071724
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.503448275862069,
95
+ "acc_stderr": 0.0416656757710158,
96
+ "acc_norm": 0.503448275862069,
97
+ "acc_norm_stderr": 0.0416656757710158
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.3431372549019608,
101
+ "acc_stderr": 0.04724007352383888,
102
+ "acc_norm": 0.3431372549019608,
103
+ "acc_norm_stderr": 0.04724007352383888
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.6596638655462185,
107
+ "acc_stderr": 0.03077805742293167,
108
+ "acc_norm": 0.6596638655462185,
109
+ "acc_norm_stderr": 0.03077805742293167
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.6102564102564103,
113
+ "acc_stderr": 0.024726967886647078,
114
+ "acc_norm": 0.6102564102564103,
115
+ "acc_norm_stderr": 0.024726967886647078
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.67,
119
+ "acc_stderr": 0.047258156262526094,
120
+ "acc_norm": 0.67,
121
+ "acc_norm_stderr": 0.047258156262526094
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.33,
125
+ "acc_stderr": 0.047258156262526045,
126
+ "acc_norm": 0.33,
127
+ "acc_norm_stderr": 0.047258156262526045
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.6481481481481481,
131
+ "acc_stderr": 0.04616631111801714,
132
+ "acc_norm": 0.6481481481481481,
133
+ "acc_norm_stderr": 0.04616631111801714
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.4729064039408867,
137
+ "acc_stderr": 0.03512819077876105,
138
+ "acc_norm": 0.4729064039408867,
139
+ "acc_norm_stderr": 0.03512819077876105
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.5709677419354838,
143
+ "acc_stderr": 0.028156036538233193,
144
+ "acc_norm": 0.5709677419354838,
145
+ "acc_norm_stderr": 0.028156036538233193
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.7735042735042735,
149
+ "acc_stderr": 0.027421007295392943,
150
+ "acc_norm": 0.7735042735042735,
151
+ "acc_norm_stderr": 0.027421007295392943
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.5660377358490566,
155
+ "acc_stderr": 0.030503292013342596,
156
+ "acc_norm": 0.5660377358490566,
157
+ "acc_norm_stderr": 0.030503292013342596
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.6272727272727273,
161
+ "acc_stderr": 0.04631381319425465,
162
+ "acc_norm": 0.6272727272727273,
163
+ "acc_norm_stderr": 0.04631381319425465
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.3333333333333333,
167
+ "acc_stderr": 0.0287420409039485,
168
+ "acc_norm": 0.3333333333333333,
169
+ "acc_norm_stderr": 0.0287420409039485
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.39072847682119205,
173
+ "acc_stderr": 0.039837983066598075,
174
+ "acc_norm": 0.39072847682119205,
175
+ "acc_norm_stderr": 0.039837983066598075
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.6417910447761194,
179
+ "acc_stderr": 0.03390393042268814,
180
+ "acc_norm": 0.6417910447761194,
181
+ "acc_norm_stderr": 0.03390393042268814
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.5028901734104047,
185
+ "acc_stderr": 0.038124005659748335,
186
+ "acc_norm": 0.5028901734104047,
187
+ "acc_norm_stderr": 0.038124005659748335
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.42857142857142855,
191
+ "acc_stderr": 0.025487187147859372,
192
+ "acc_norm": 0.42857142857142855,
193
+ "acc_norm_stderr": 0.025487187147859372
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.6180555555555556,
197
+ "acc_stderr": 0.040629907841466674,
198
+ "acc_norm": 0.6180555555555556,
199
+ "acc_norm_stderr": 0.040629907841466674
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.3,
203
+ "acc_stderr": 0.046056618647183814,
204
+ "acc_norm": 0.3,
205
+ "acc_norm_stderr": 0.046056618647183814
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.72,
209
+ "acc_stderr": 0.04512608598542127,
210
+ "acc_norm": 0.72,
211
+ "acc_norm_stderr": 0.04512608598542127
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.5809248554913294,
215
+ "acc_stderr": 0.026564178111422622,
216
+ "acc_norm": 0.5809248554913294,
217
+ "acc_norm_stderr": 0.026564178111422622
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.6257668711656442,
221
+ "acc_stderr": 0.03802068102899615,
222
+ "acc_norm": 0.6257668711656442,
223
+ "acc_norm_stderr": 0.03802068102899615
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.5987654320987654,
227
+ "acc_stderr": 0.027272582849839803,
228
+ "acc_norm": 0.5987654320987654,
229
+ "acc_norm_stderr": 0.027272582849839803
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.34,
233
+ "acc_stderr": 0.04760952285695235,
234
+ "acc_norm": 0.34,
235
+ "acc_norm_stderr": 0.04760952285695235
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.7512953367875648,
239
+ "acc_stderr": 0.031195840877700304,
240
+ "acc_norm": 0.7512953367875648,
241
+ "acc_norm_stderr": 0.031195840877700304
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.47368421052631576,
245
+ "acc_stderr": 0.046970851366478626,
246
+ "acc_norm": 0.47368421052631576,
247
+ "acc_norm_stderr": 0.046970851366478626
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.7229357798165138,
251
+ "acc_stderr": 0.019188482590169538,
252
+ "acc_norm": 0.7229357798165138,
253
+ "acc_norm_stderr": 0.019188482590169538
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.4523809523809524,
257
+ "acc_stderr": 0.044518079590553275,
258
+ "acc_norm": 0.4523809523809524,
259
+ "acc_norm_stderr": 0.044518079590553275
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.5718954248366013,
263
+ "acc_stderr": 0.028332397483664278,
264
+ "acc_norm": 0.5718954248366013,
265
+ "acc_norm_stderr": 0.028332397483664278
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.68,
269
+ "acc_stderr": 0.04688261722621504,
270
+ "acc_norm": 0.68,
271
+ "acc_norm_stderr": 0.04688261722621504
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.7520661157024794,
275
+ "acc_stderr": 0.039418975265163025,
276
+ "acc_norm": 0.7520661157024794,
277
+ "acc_norm_stderr": 0.039418975265163025
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.618421052631579,
281
+ "acc_stderr": 0.03953173377749194,
282
+ "acc_norm": 0.618421052631579,
283
+ "acc_norm_stderr": 0.03953173377749194
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.5408496732026143,
287
+ "acc_stderr": 0.020160213617222516,
288
+ "acc_norm": 0.5408496732026143,
289
+ "acc_norm_stderr": 0.020160213617222516
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.45390070921985815,
293
+ "acc_stderr": 0.029700453247291463,
294
+ "acc_norm": 0.45390070921985815,
295
+ "acc_norm_stderr": 0.029700453247291463
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.44642857142857145,
299
+ "acc_stderr": 0.04718471485219588,
300
+ "acc_norm": 0.44642857142857145,
301
+ "acc_norm_stderr": 0.04718471485219588
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.5416666666666666,
305
+ "acc_stderr": 0.03398110890294636,
306
+ "acc_norm": 0.5416666666666666,
307
+ "acc_norm_stderr": 0.03398110890294636
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.35195530726256985,
311
+ "acc_stderr": 0.01597266852368907,
312
+ "acc_norm": 0.35195530726256985,
313
+ "acc_norm_stderr": 0.01597266852368907
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.44,
317
+ "acc_stderr": 0.0498887651569859,
318
+ "acc_norm": 0.44,
319
+ "acc_norm_stderr": 0.0498887651569859
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.68,
323
+ "acc_stderr": 0.04688261722621503,
324
+ "acc_norm": 0.68,
325
+ "acc_norm_stderr": 0.04688261722621503
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.5147058823529411,
329
+ "acc_stderr": 0.03035969707904612,
330
+ "acc_norm": 0.5147058823529411,
331
+ "acc_norm_stderr": 0.03035969707904612
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.6122448979591837,
335
+ "acc_stderr": 0.031192230726795656,
336
+ "acc_norm": 0.6122448979591837,
337
+ "acc_norm_stderr": 0.031192230726795656
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.7215189873417721,
341
+ "acc_stderr": 0.029178682304842538,
342
+ "acc_norm": 0.7215189873417721,
343
+ "acc_norm_stderr": 0.029178682304842538
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.4634941329856584,
347
+ "acc_stderr": 0.012736153390214963,
348
+ "acc_norm": 0.4634941329856584,
349
+ "acc_norm_stderr": 0.012736153390214963
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.6568627450980392,
353
+ "acc_stderr": 0.03332139944668086,
354
+ "acc_norm": 0.6568627450980392,
355
+ "acc_norm_stderr": 0.03332139944668086
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.5818181818181818,
359
+ "acc_stderr": 0.03851716319398393,
360
+ "acc_norm": 0.5818181818181818,
361
+ "acc_norm_stderr": 0.03851716319398393
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.7833537331701347,
365
+ "mc1_stderr": 0.014421468452506978,
366
+ "mc2": 0.8572574997405501,
367
+ "mc2_stderr": 0.01200311225898601
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.5159386068476978,
371
+ "acc_stderr": 0.017181617837190195,
372
+ "acc_norm": 0.5301062573789846,
373
+ "acc_norm_stderr": 0.01715916359017022
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "x2bee/POLAR-14B-v0.5",
442
+ "model_sha": "74a1ef65a8d650e5358be229def31688738d8c6a",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
requirements.txt ADDED
@@ -0,0 +1,19 @@
1
+ APScheduler==3.10.1
2
+ black==23.11.0
3
+ click==8.1.3
4
+ datasets==2.14.5
5
+ gradio==4.19.2
6
+ gradio_client==0.10.1
7
+ huggingface-hub>=0.18.0
8
+ matplotlib==3.7.1
9
+ numpy==1.24.2
10
+ pandas==2.0.0
11
+ plotly==5.14.1
12
+ python-dateutil==2.8.2
13
+ requests==2.28.2
14
+ sentencepiece
15
+ tqdm==4.65.0
16
+ transformers==4.38.2
17
+ tokenizers>=0.15.0
18
+ gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/[email protected] # CI !!!
19
+ torch
scripts/create_request_file.py ADDED
@@ -0,0 +1,107 @@
1
+ import json
2
+ import os
3
+ import pprint
4
+ import re
5
+ from datetime import datetime, timezone
6
+
7
+ import click
8
+ from colorama import Fore
9
+ from huggingface_hub import HfApi, snapshot_download
10
+
11
+ EVAL_REQUESTS_PATH = "eval-queue"
12
+ QUEUE_REPO = "open-ko-llm-leaderboard/requests"
13
+
14
+ precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
15
+ model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
16
+ weight_types = ("Original", "Delta", "Adapter")
17
+
18
+
19
+ def get_model_size(model_info, precision: str):
20
+ size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
21
+ try:
22
+ model_size = round(model_info.safetensors["total"] / 1e9, 3)
23
+ except (AttributeError, TypeError):
24
+ try:
25
+ size_match = re.search(size_pattern, model_info.modelId.lower())
26
+ model_size = size_match.group(0)
27
+ model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
28
+ except AttributeError:
29
+ return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
30
+
31
+ size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
32
+ model_size = size_factor * model_size
33
+ return model_size
34
+
35
+
36
+ def main():
37
+ api = HfApi()
38
+ current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
39
+ snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset")
40
+
41
+ model_name = click.prompt("Enter model name")
42
+ revision = click.prompt("Enter revision", default="main")
43
+ precision = click.prompt("Enter precision", default="float16", type=click.Choice(precisions))
44
+ model_type = click.prompt("Enter model type", type=click.Choice(model_types))
45
+ weight_type = click.prompt("Enter weight type", default="Original", type=click.Choice(weight_types))
46
+ base_model = click.prompt("Enter base model", default="")
47
+ status = click.prompt("Enter status", default="FINISHED")
48
+
49
+ try:
50
+ model_info = api.model_info(repo_id=model_name, revision=revision)
51
+ except Exception as e:
52
+ print(f"{Fore.RED}Could not find model info for {model_name} on the Hub\n{e}{Fore.RESET}")
53
+ return 1
54
+
55
+ model_size = get_model_size(model_info=model_info, precision=precision)
56
+
57
+ try:
58
+ license = model_info.cardData["license"]
59
+ except Exception:
60
+ license = "?"
61
+
62
+ eval_entry = {
63
+ "model": model_name,
64
+ "base_model": base_model,
65
+ "revision": revision,
66
+ "private": False,
67
+ "precision": precision,
68
+ "weight_type": weight_type,
69
+ "status": status,
70
+ "submitted_time": current_time,
71
+ "model_type": model_type,
72
+ "likes": model_info.likes,
73
+ "params": model_size,
74
+ "license": license,
75
+ }
76
+
77
+ user_name = ""
78
+ model_path = model_name
79
+ if "/" in model_name:
80
+ user_name = model_name.split("/")[0]
81
+ model_path = model_name.split("/")[1]
82
+
83
+ pprint.pprint(eval_entry)
84
+
85
+ if click.confirm("Do you want to continue? This request file will be pushed to the hub"):
86
+ click.echo("continuing...")
87
+
88
+ out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
89
+ os.makedirs(out_dir, exist_ok=True)
90
+ out_path = f"{out_dir}/{model_path}_eval_request_{False}_{precision}_{weight_type}.json"
91
+
92
+ with open(out_path, "w") as f:
93
+ f.write(json.dumps(eval_entry))
94
+
95
+ api.upload_file(
96
+ path_or_fileobj=out_path,
97
+ path_in_repo=out_path.split(f"{EVAL_REQUESTS_PATH}/")[1],
98
+ repo_id=QUEUE_REPO,
99
+ repo_type="dataset",
100
+ commit_message=f"Add {model_name} to eval queue",
101
+ )
102
+ else:
103
+ click.echo("aborting...")
104
+
105
+
106
+ if __name__ == "__main__":
107
+ main()
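The `get_model_size` helper above falls back to parsing the parameter count out of the repo id when no safetensors metadata is available. A minimal offline sketch of that fallback (not part of this commit; the model id is invented and the import assumes the repository root is on `PYTHONPATH`):

```python
# Hypothetical check of get_model_size's fallback path: no safetensors
# metadata, so the size is parsed out of the (made-up) repo id.
from types import SimpleNamespace

from scripts.create_request_file import get_model_size

fake_info = SimpleNamespace(safetensors=None, modelId="my-org/My-Model-7B")
print(get_model_size(fake_info, precision="float16"))  # expected: 7.0
```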
scripts/update_request_files.py ADDED
@@ -0,0 +1,82 @@
1
+ import json
2
+ import os
3
+ import glob
4
+ import pprint
5
+ import re
6
+ from datetime import datetime, timezone
7
+
8
+ import click
9
+ from colorama import Fore
10
+ from huggingface_hub import HfApi, snapshot_download
11
+ from huggingface_hub.hf_api import ModelInfo
12
+
13
+ API = HfApi()
14
+
15
+
16
+ def get_model_size(model_info: ModelInfo, precision: str):
17
+ size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
18
+ try:
19
+ model_size = round(model_info.safetensors["total"] / 1e9, 3)
20
+ except (AttributeError, TypeError):
21
+ try:
22
+ size_match = re.search(size_pattern, model_info.modelId.split("/")[-1].lower())
23
+ model_size = size_match.group(0)
24
+ model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
25
+ except AttributeError:
26
+ return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
27
+
28
+ size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.split("/")[-1].lower()) else 1
29
+ model_size = size_factor * model_size
30
+ return model_size
31
+
32
+
33
+ def update_request_files(requests_path):
34
+ request_files = os.path.join(
35
+ requests_path, "*/*.json"
36
+ )
37
+ request_files = glob.glob(request_files)
38
+
39
+ request_files = sorted(request_files, reverse=True)
40
+ for tmp_request_file in request_files:
41
+ with open(tmp_request_file, "r") as f:
42
+ req_content = json.load(f)
43
+ new_req_content = add_model_info(req_content)
44
+
45
+ # if new content is different, update the file
46
+ if new_req_content != req_content:
47
+ with open(tmp_request_file, "w") as f:
48
+ f.write(json.dumps(new_req_content, indent=4))
49
+
50
+ def add_model_info(entry):
51
+
52
+ model = entry["model"]
53
+ revision = entry["revision"]
54
+
55
+ try:
56
+ model_info = API.model_info(repo_id=model, revision=revision)
57
+ except Exception:
58
+ print(f"Could not get model information for {model} revision {revision}")
59
+ return entry
60
+
61
+ new_entry = entry.copy()
62
+
63
+ model_size = get_model_size(model_info=model_info, precision='float16')
64
+ new_entry["params"] = model_size
65
+
66
+ new_entry["likes"] = model_info.likes
67
+
68
+ # Were the model card and license filled?
69
+ try:
70
+ license = model_info.cardData["license"]
71
+ new_entry["license"] = license
72
+ except Exception:
73
+ print(f"No license for {model} revision {revision}")
74
+
75
+ print(json.dumps(new_entry, indent=4))
76
+ return new_entry
77
+
78
+
79
+ if __name__ == "__main__":
80
+ # update_request_files("/Users/sean/workspace/leaderboard/leaderboard-test-requests")
81
+ update_request_files("/Volumes/Data-case-sensitive/requests")
82
+
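To see what `add_model_info` fills into a request entry without touching the Hub, the module-level `API` client can be stubbed out. A hedged sketch (every value below is invented):

```python
# Hypothetical offline run of add_model_info with a stubbed Hub client.
from types import SimpleNamespace

import scripts.update_request_files as urf

urf.API = SimpleNamespace(
    model_info=lambda repo_id, revision: SimpleNamespace(
        safetensors={"total": 14_000_000_000},  # pretend 14B parameters
        modelId=repo_id,
        likes=3,
        cardData={"license": "apache-2.0"},
    )
)

entry = {"model": "my-org/my-model-14b", "revision": "main"}
updated = urf.add_model_info(entry)  # adds params (14.0), likes and license
```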
src/__pycache__/envs.cpython-310.pyc ADDED
Binary file (1.07 kB)

src/__pycache__/populate.cpython-310.pyc ADDED
Binary file (2.93 kB)

src/display/__pycache__/about.cpython-310.pyc ADDED
Binary file (5.36 kB)

src/display/__pycache__/css_html_js.cpython-310.pyc ADDED
Binary file (1.69 kB)

src/display/__pycache__/formatting.cpython-310.pyc ADDED
Binary file (1.78 kB)

src/display/__pycache__/utils.cpython-310.pyc ADDED
Binary file (5.94 kB)
 
src/display/about.py ADDED
@@ -0,0 +1,84 @@
1
+ from src.display.utils import ModelType
2
+
3
+
4
+ TITLE = """<img src="https://i.postimg.cc/250G53CJ/src-display-SIL-logo.png" style="width:30%;display:block;margin-left:auto;margin-right:auto">"""
5
+
6
+ INTRODUCTION_TEXT = f"""
7
+ Welcome to the Self-Improving Leaderboard (SIL) - A Revolutionary Platform for Evaluating Large Language Models
8
+ The SIL offers a dynamic approach to assessing and ranking open-source LLMs and chatbots. Our innovative system continuously updates test datasets and recalculates rankings daily, ensuring evaluations reflect the rapid evolution of language processing capabilities.
9
+ Key Features:
10
+ • Daily-refreshed test datasets
11
+ • Adaptive ranking system
12
+ • Real-world language processing challenges
13
+ • Comprehensive model performance insights
14
+ Explore our cutting-edge evaluation process, gain deep insights into model capabilities, and see how different LLMs compare in this ever-changing landscape.
15
+ Ready to participate? Submit your model for evaluation on the 'Submit' page and join the forefront of LLM advancement. For a detailed look at our methodology, visit the 'About' page.
16
+ The SIL is proudly developed and maintained by [Your Organization/Team Name]. Together, let's push the boundaries of language AI!
17
+ """
18
+
19
+ LLM_BENCHMARKS_TEXT = f"""
20
+ # How it works
21
+ 🔄 The Self-Improving Leaderboard (SIL) operates on a dynamic evaluation system that continuously evolves to reflect real-world language processing challenges. Here's an overview of our process:
22
+ ### Daily Dataset Refresh
23
+ - Our system generates new test data daily from diverse, reputable sources.
24
+ - Advanced Large Language Models (LLMs) are utilized to synthesize additional relevant content.
25
+ - The dataset is divided into two sections:
26
+   - A primary dataset maintaining the integrity of sourced data
27
+   - A noise-injected dataset simulating real-world data complexities
28
+ ### Model Evaluation
29
+ - Participating models are rigorously evaluated against the refreshed dataset every 24 hours.
30
+ - We employ a comprehensive set of metrics aligned with industry-standard benchmarks.
31
+ - Our evaluation framework is built on the Eleuther AI Language Model Evaluation Harness, ensuring a robust and consistent assessment.
32
+ ### Ranking System
33
+ - Model rankings are updated daily based on their performance across various tasks.
34
+ - The leaderboard reflects not only the latest scores but also tracks consistency and adaptability over time.
35
+ ### Quarterly Comprehensive Evaluation
36
+ - Every three months, we conduct an in-depth analysis of model performance.
37
+ - This evaluation considers long-term trends, adaptability to evolving data, and overall efficacy.
38
+ - Special recognition (e.g., medals or badges) may be awarded based on sustained excellence.
39
+ By continuously refreshing our test data and evaluation criteria, SIL aims to provide a more accurate representation of model performance in real-world scenarios, driving innovation in the field of Natural Language Processing.
40
+
41
+ ## Icons
42
+ {ModelType.PT.to_str(" : ")} model
43
+ {ModelType.IFT.to_str(" : ")} model
44
+ {ModelType.RL.to_str(" : ")} model
45
+ If there is no icon, it indicates that there is insufficient information about the model.
46
+ Please provide information about the model through an issue! 🤩
47
+
48
+ ## Details and Logs
49
+ - Detailed numerical results in the `results` dataset: https://huggingface.co/datasets/junkim100/SIL_results
50
+ - Community queries and running status in the `requests` dataset: https://huggingface.co/datasets/junkim100/SIL_requests
51
+ """
52
+
53
+ EVALUATION_QUEUE_TEXT = f"""
54
+ # Evaluation Queue for the 🔄 Self-Improving Leaderboard
55
+
56
+ ## Some good practices before submitting a model
57
+
58
+ ### 1️⃣ Make sure you can load your model and tokenizer using AutoClasses
59
+ ```python
60
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
61
+ config = AutoConfig.from_pretrained("your model name", revision=revision)
62
+ model = AutoModel.from_pretrained("your model name", revision=revision)
63
+ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
64
+ ```
65
+
66
+ If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
67
+
68
+ ⚠️ Make sure your model is public!
69
+
70
+ ⚠️ Make sure your model runs with the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness)
71
+
72
+
73
+ ### 2️⃣ Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
74
+ It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
75
+
76
+ ### 3️⃣ Make sure your model has an open license!
77
+ We'd love for as many people as possible to know they can use your model.
78
+
79
+ ### 4️⃣ Fill up your model card
80
+ When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
81
+
82
+ ## In case of model failure
83
+ If your model is displayed in the `FAILED` category, its execution stopped. Make sure you have followed the above steps first. If everything is done, check that you can run the Eleuther AI Language Model Evaluation Harness on your model locally (you can add `--limit` to limit the number of examples per task).
84
+ """
src/display/css_html_js.py ADDED
@@ -0,0 +1,84 @@
1
+ custom_css = """
2
+ /* Hides the final AutoEvalColumn */
3
+ #llm-benchmark-tab-table table td:last-child,
4
+ #llm-benchmark-tab-table table th:last-child {
5
+ display: none;
6
+ }
7
+
8
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
9
+ table td:first-child,
10
+ table th:first-child {
11
+ max-width: 400px;
12
+ overflow: auto;
13
+ white-space: nowrap;
14
+ }
15
+
16
+ /* Full width space */
17
+ .gradio-container {
18
+ max-width: 95%!important;
19
+ }
20
+
21
+ /* Text style and margins */
22
+ .markdown-text {
23
+ font-size: 16px !important;
24
+ }
25
+
26
+ #models-to-add-text {
27
+ font-size: 18px !important;
28
+ }
29
+
30
+ #search-bar-table-box > div:first-child {
31
+ background: none;
32
+ border: none;
33
+ }
34
+
35
+ #search-bar {
36
+ padding: 0px;
37
+ }
38
+
39
+ .tab-buttons button {
40
+ font-size: 20px;
41
+ }
42
+
43
+ /* Filters style */
44
+ #filter_type{
45
+ border: 0;
46
+ padding-left: 0;
47
+ padding-top: 0;
48
+ }
49
+ #filter_type label {
50
+ display: flex;
51
+ }
52
+ #filter_type label > span{
53
+ margin-top: var(--spacing-lg);
54
+ margin-right: 0.5em;
55
+ }
56
+ #filter_type label > .wrap{
57
+ width: 103px;
58
+ }
59
+ #filter_type label > .wrap .wrap-inner{
60
+ padding: 2px;
61
+ }
62
+ #filter_type label > .wrap .wrap-inner input{
63
+ width: 1px
64
+ }
65
+ #filter-columns-type{
66
+ border:0;
67
+ padding: 0.5em;
68
+ }
69
+ #filter-columns-size{
70
+ border:0;
71
+ padding: 0.5em;
72
+ }
73
+ #box-filter > .form{
74
+ border: 0
75
+ }
76
+ """
77
+
78
+ get_window_url_params = """
79
+ function(url_params) {
80
+ const params = new URLSearchParams(window.location.search);
81
+ url_params = Object.fromEntries(params);
82
+ return url_params;
83
+ }
84
+ """
src/display/formatting.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datetime import datetime, timezone
3
+
4
+ from huggingface_hub import HfApi
5
+ from huggingface_hub.hf_api import ModelInfo
6
+
7
+
8
+ API = HfApi()
9
+
10
+ def model_hyperlink(link, model_name):
11
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
12
+
13
+
14
+ def make_clickable_model(model_name):
15
+ link = f"https://huggingface.co/{model_name}"
16
+
17
+ details_model_name = model_name.replace("/", "__")
18
+ details_link = f"https://huggingface.co/datasets/open-ko-llm-leaderboard/details_{details_model_name}"
19
+
20
+ return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
21
+
22
+
23
+ def styled_error(error):
24
+ return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
25
+
26
+
27
+ def styled_warning(warn):
28
+ return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
29
+
30
+
31
+ def styled_message(message):
32
+ return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
33
+
34
+
35
+ def has_no_nan_values(df, columns):
36
+ return df[columns].notna().all(axis=1)
37
+
38
+
39
+ def has_nan_values(df, columns):
40
+ return df[columns].isna().any(axis=1)
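
A quick, illustrative look at what these helpers produce (the values are made up):

```python
import pandas as pd

from src.display.formatting import has_no_nan_values, make_clickable_model

print(make_clickable_model("org/model"))  # hub link plus a 📑 details link

df = pd.DataFrame({"ARC": [0.5, None], "MMLU": [0.4, 0.6]})
print(has_no_nan_values(df, ["ARC", "MMLU"]).tolist())  # [True, False]
```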
src/display/utils.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, make_dataclass
2
+ from enum import Enum
3
+
4
+ import pandas as pd
5
+
6
+ def fields(raw_class):
7
+ return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
8
+
9
+
10
+ @dataclass
11
+ class Task:
12
+ benchmark: str
13
+ metric: str
14
+ col_name: str
15
+
16
+ class Tasks(Enum):
17
+ arc = Task("arc_challenge", "acc_norm", "ARC")
18
+ hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
19
+ mmlu = Task("mmlu", "acc", "MMLU")
20
+ truthfulqa = Task("truthfulqa_mc", "mc2", "TruthfulQA")
21
+ # winogrande = Task("winogrande", "acc_norm", "Winogrande")
22
+ # gsm8k = Task("gsm8k", "acc_norm", "GSM8k")
23
+ commongen_v2 = Task("commongen_v2", "acc_norm", "CommonGen V2")
24
+ # eqBench = Task("eq_bench", "acc_norm", "EQ Bench")
25
+ # instFollow = Task("inst_follow", "acc_norm", "InstFollow")
26
+ # harmlessness = Task("harmlessness", "acc_norm", "Harmlessness")
27
+ # helpfulness = Task("helpfulness", "acc_norm", "Helpfulness")
28
+
29
+ class Ranks(Enum):
30
+ daily = Task("daily", "daily", "Daily Rank")
31
+ quarterly = Task("quarterly", "quarterly", "Quarterly Rank")
32
+
33
+
34
+ # These classes are for user facing column names,
35
+ # to avoid having to change them all around the code
36
+ # when a modif is needed
37
+ @dataclass
38
+ class ColumnContent:
39
+ name: str
40
+ type: str
41
+ displayed_by_default: bool
42
+ hidden: bool = False
43
+ never_hidden: bool = False
44
+ dummy: bool = False
45
+
46
+ auto_eval_column_dict = []
47
+ # Init
48
+ auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
49
+ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
50
+ # Ranks
51
+ auto_eval_column_dict.append(["daily", ColumnContent, ColumnContent("Daily Rank", "number", True)])
52
+ auto_eval_column_dict.append(["quarterly", ColumnContent, ColumnContent("Quarterly Rank", "number", True)])
53
+ # Scores
54
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
55
+ for task in Tasks:
56
+ auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
57
+ # Model information
58
+ auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
59
+ auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
60
+ auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
61
+ auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
62
+ auto_eval_column_dict.append(["merged", ColumnContent, ColumnContent("Merged", "bool", False)])
63
+ auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
64
+ auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
65
+ auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
66
+ auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
67
+ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
68
+ auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, False)])
69
+ # Dummy column for the search bar (hidden by the custom CSS)
70
+ auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
71
+
72
+ # We use make dataclass to dynamically fill the scores from Tasks
73
+ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
74
+
75
+
76
+ @dataclass(frozen=True)
77
+ class EvalQueueColumn: # Queue column
78
+ model = ColumnContent("model", "markdown", True)
79
+ revision = ColumnContent("revision", "str", True)
80
+ private = ColumnContent("private", "bool", True)
81
+ precision = ColumnContent("precision", "str", True)
82
+ weight_type = ColumnContent("weight_type", "str", "Original")
83
+ status = ColumnContent("status", "str", True)
84
+
85
+ # Define the human baselines
86
+ human_baseline_row = {
87
+ AutoEvalColumn.model.name: "<p>Human performance</p>",
88
+ }
89
+
90
+ @dataclass
91
+ class ModelDetails:
92
+ name: str
93
+ symbol: str = "" # emoji, only for the model type
94
+
95
+
96
+ class ModelType(Enum):
97
+ PT = ModelDetails(name="pretrained", symbol="🟢")
98
+ # FT = ModelDetails(name="fine-tuned", symbol="🔶")
99
+ IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
100
+ RL = ModelDetails(name="RL-tuned", symbol="🟦")
101
+ Unknown = ModelDetails(name="", symbol="?")
102
+
103
+ def to_str(self, separator=" "):
104
+ return f"{self.value.symbol}{separator}{self.value.name}"
105
+
106
+ @staticmethod
107
+ def from_str(type):
108
+ # if "fine-tuned" in type or "🔶" in type:
109
+ # return ModelType.FT
110
+ if "pretrained" in type or "🟢" in type:
111
+ return ModelType.PT
112
+ if "RL-tuned" in type or "🟦" in type:
113
+ return ModelType.RL
114
+ if "instruction-tuned" in type or "⭕" in type:
115
+ return ModelType.IFT
116
+ return ModelType.Unknown
117
+
118
+ class WeightType(Enum):
119
+ Adapter = ModelDetails("Adapter")
120
+ Original = ModelDetails("Original")
121
+ Delta = ModelDetails("Delta")
122
+
123
+ class Precision(Enum):
124
+ float16 = ModelDetails("float16")
125
+ # bfloat16 = ModelDetails("bfloat16")
126
+ # qt_8bit = ModelDetails("8bit")
127
+ # qt_4bit = ModelDetails("4bit")
128
+ # qt_GPTQ = ModelDetails("GPTQ")
129
+ Unknown = ModelDetails("?")
130
+
131
+ def from_str(precision):
132
+ if precision in ["torch.float16", "float16"]:
133
+ return Precision.float16
134
+ if precision in ["8bit"]:
135
+ return Precision.Unknown  # the qt_8bit member is commented out above
136
+ if precision in ["4bit"]:
137
+ return Precision.Unknown  # the qt_4bit member is commented out above
138
+ if precision in ["GPTQ", "None"]:
139
+ return Precision.Unknown  # the qt_GPTQ member is commented out above
140
+ return Precision.Unknown
141
+
142
+
143
+
144
+
145
+ # Column selection
146
+ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
147
+ TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
148
+ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
149
+ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
150
+
151
+ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
152
+ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
153
+
154
+ BENCHMARK_COLS = [t.value.col_name for t in Tasks]
155
+
156
+ NUMERIC_INTERVALS = {
157
+ "Unknown": pd.Interval(-1, 0, closed="right"),
158
+ "0~3B": pd.Interval(0, 3, closed="right"),
159
+ "3~7B": pd.Interval(3, 7.3, closed="right"),
160
+ "7~13B": pd.Interval(7.3, 13, closed="right"),
161
+ "13~35B": pd.Interval(13, 35, closed="right"),
162
+ "35~60B": pd.Interval(35, 60, closed="right"),
163
+ "60B+": pd.Interval(60, 10000, closed="right"),
164
+ }
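
`NUMERIC_INTERVALS` backs the parameter-size filter. A small sketch of the membership test it supports, with illustrative sizes in billions of parameters:

```python
import pandas as pd

from src.display.utils import NUMERIC_INTERVALS

sizes = pd.Series([1.3, 7.0, 13.8, 70.0])  # #Params (B), made-up values
selected = [NUMERIC_INTERVALS["3~7B"], NUMERIC_INTERVALS["60B+"]]

# pd.Interval supports `in`, so each size can be tested against the chosen buckets
mask = sizes.apply(lambda s: any(s in interval for interval in selected))
print(mask.tolist())  # [False, True, False, True]
```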
src/envs.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from huggingface_hub import HfApi
4
+
5
+ # clone / pull the lmeh eval data
6
+ H4_TOKEN = os.environ.get("H4_TOKEN", None)
7
+
8
+ REPO_ID = "junkim100/self-improving-leaderboard"
9
+ QUEUE_REPO = "junkim100/SIL_requests"
10
+ RESULTS_REPO = "junkim100/SIL_results"
11
+
12
+ PRIVATE_QUEUE_REPO = "open-ko-llm-leaderboard/private-requests"
13
+ PRIVATE_RESULTS_REPO = "open-ko-llm-leaderboard/private-results"
14
+
15
+ IS_PUBLIC = os.environ.get("IS_PUBLIC", "True").lower() in ("1", "true", "yes")  # env vars are strings, so compare explicitly
16
+
17
+ CACHE_PATH=os.getenv("HF_HOME", ".")
18
+
19
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
20
+ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
21
+
22
+ EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
23
+ EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
24
+
25
+ PATH_TO_COLLECTION = "open-ko-llm-leaderboard/ko-llm-leaderboard-best-models-659c7e45a481ceea4c883506"
26
+
27
+ # Rate limit variables
28
+ RATE_LIMIT_PERIOD = 7
29
+ RATE_LIMIT_QUOTA = 5
30
+ HAS_HIGHER_RATE_LIMIT = []
31
+
32
+ API = HfApi(token=H4_TOKEN)
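
These constants point at the queue and results datasets; presumably the app mirrors them into the local cache paths before building any tables. A sketch of that sync step, assuming `H4_TOKEN` grants read access:

```python
from huggingface_hub import snapshot_download

from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, QUEUE_REPO, RESULTS_REPO

# Mirror the request queue and the results dataset into the local cache paths
snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=H4_TOKEN)
snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", token=H4_TOKEN)
```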
src/leaderboard/__pycache__/filter_models.cpython-310.pyc ADDED
Binary file (1.47 kB). View file
 
src/leaderboard/__pycache__/read_evals.cpython-310.pyc ADDED
Binary file (7.78 kB). View file
 
src/leaderboard/filter_models.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.display.formatting import model_hyperlink
2
+ from src.display.utils import AutoEvalColumn
3
+
4
+ # Models which have been flagged by users as being problematic for a reason or another
5
+ # (Model name to forum discussion link)
6
+ FLAGGED_MODELS = {
7
+ "merged": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
8
+ "TeamUNIVA/Komodo_7B_v0.1.0": "https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/44",
9
+ }
10
+
11
+ # Models which have been requested by orgs to not be submitted on the leaderboard
12
+ DO_NOT_SUBMIT_MODELS = [
13
+ ]
14
+
15
+
16
+ def flag_models(leaderboard_data: list[dict]):
17
+ for model_data in leaderboard_data:
18
+ # Merges are flagged automatically
19
+ if model_data[AutoEvalColumn.flagged.name]:
20
+ flag_key = "merged"
21
+ else:
22
+ flag_key = model_data["model_name_for_query"]
23
+
24
+ if flag_key in FLAGGED_MODELS:
25
+ issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
26
+ issue_link = model_hyperlink(
27
+ FLAGGED_MODELS[flag_key],
28
+ f"See discussion #{issue_num}",
29
+ )
30
+ model_data[
31
+ AutoEvalColumn.model.name
32
+ ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
33
+ model_data[AutoEvalColumn.flagged.name] = True
34
+ else:
35
+ model_data[AutoEvalColumn.flagged.name] = False
36
+
37
+
38
+ def remove_forbidden_models(leaderboard_data: list[dict]):
39
+ indices_to_remove = []
40
+ for ix, model in enumerate(leaderboard_data):
41
+ if model["model_name_for_query"] in DO_NOT_SUBMIT_MODELS:
42
+ indices_to_remove.append(ix)
43
+
44
+ for ix in reversed(indices_to_remove):
45
+ leaderboard_data.pop(ix)
46
+ return leaderboard_data
47
+
48
+
49
+ def filter_models(leaderboard_data: list[dict]):
50
+ leaderboard_data = remove_forbidden_models(leaderboard_data)
51
+ flag_models(leaderboard_data)
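
A toy example of how `filter_models` mutates leaderboard rows in place (the row below is made up):

```python
from src.display.utils import AutoEvalColumn
from src.leaderboard.filter_models import filter_models

rows = [{
    AutoEvalColumn.model.name: "org/model",  # hypothetical entry
    "model_name_for_query": "org/model",
    AutoEvalColumn.flagged.name: False,
}]

filter_models(rows)
# Stays False unless the model is a merge or appears in FLAGGED_MODELS
print(rows[0][AutoEvalColumn.flagged.name])
```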
src/leaderboard/read_evals.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import json
3
+ import math
4
+ import os
5
+ from dataclasses import dataclass
6
+
7
+ import dateutil
8
+ import numpy as np
9
+
10
+ from huggingface_hub import ModelCard
11
+
12
+ from src.display.formatting import make_clickable_model
13
+ from src.display.utils import AutoEvalColumn, ModelType, Tasks, Ranks, Precision, WeightType
14
+ from src.submission.check_validity import is_model_on_hub, check_model_card
15
+
16
+
17
+ @dataclass
18
+ class EvalResult:
19
+ # Also see src.display.utils.AutoEvalColumn for what will be displayed.
20
+ eval_name: str # org_model_precision (uid)
21
+ full_model: str # org/model (path on hub)
22
+ org: str
23
+ model: str
24
+ revision: str # commit hash, "" if main
25
+ results: dict
26
+ precision: Precision = Precision.Unknown
27
+ model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
28
+ weight_type: WeightType = WeightType.Original # Original or Adapter
29
+ architecture: str = "Unknown" # From config file
30
+ license: str = "?"
31
+ likes: int = 0
32
+ num_params: int = 0
33
+ date: str = "" # submission date of request file
34
+ still_on_hub: bool = False
35
+ is_merge: bool = False
36
+ flagged: bool = False
37
+
38
+ @classmethod
39
+ def init_from_json_file(self, json_filepath):
40
+ """Inits the result from the specific model result file"""
41
+ with open(json_filepath) as fp:
42
+ data = json.load(fp)
43
+
44
+ # We manage the legacy config format
45
+ config = data.get("config", data.get("config_general", None))
46
+
47
+ # Precision
48
+ precision = Precision.from_str(config.get("model_dtype"))
49
+
50
+ # Get model and org
51
+ org_and_model = config.get("model_name", config.get("model_args", None))
52
+ org_and_model = org_and_model.split("/", 1)
53
+
54
+ if len(org_and_model) == 1:
55
+ org = None
56
+ model = org_and_model[0]
57
+ result_key = f"{model}_{precision.value.name}"
58
+ else:
59
+ org = org_and_model[0]
60
+ model = org_and_model[1]
61
+ result_key = f"{org}_{model}_{precision.value.name}"
62
+ full_model = "/".join(org_and_model)
63
+
64
+ still_on_hub, error, model_config = is_model_on_hub(
65
+ full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
66
+ )
67
+ architecture = "?"
68
+ if model_config is not None:
69
+ architectures = getattr(model_config, "architectures", None)
70
+ if architectures:
71
+ architecture = ";".join(architectures)
72
+
73
+ # If the model doesn't have a model card or a license, we consider it deleted
74
+ if still_on_hub:
75
+ try:
76
+ if check_model_card(full_model)[0] is False:
77
+ still_on_hub = False
78
+ except Exception:
79
+ still_on_hub = False
80
+
81
+ # Check if the model is a merge
82
+ is_merge_from_metadata = False
83
+ flagged = False
84
+ if still_on_hub:
85
+ model_card = ModelCard.load(full_model)
86
+
87
+ if model_card.data.tags:
88
+ is_merge_from_metadata = "merge" in model_card.data.tags
89
+ merge_keywords = ["mergekit", "merged model", "merge model", "merging", "Carbon"]
90
+ # If the model is a merge but not saying it in the metadata, we flag it
91
+ is_merge_from_model_card = any(keyword in model_card.text.lower() for keyword in merge_keywords)
92
+ flagged = is_merge_from_model_card and not is_merge_from_metadata
93
+
94
+
95
+ # Extract results available in this file (some results are split in several files)
96
+ results = {}
97
+ for rank in Ranks:
98
+ rank = rank.value
99
+ if rank.benchmark in data["results"]:
100
+ results[rank.benchmark] = data["results"][rank.benchmark][rank.metric]
101
+ for task in Tasks:
102
+ task = task.value
103
+
104
+ # Some truthfulQA values are NaNs
105
+ if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
106
+ if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
107
+ results[task.benchmark] = 0.0
108
+ continue
109
+
110
+ # New tasks have been added, we need to skip them if not exists
111
+ if task.benchmark in ["winogrande", "gsm8k", "eq_bench", "inst_follow", "harmlessness", "helpfulness"]:
112
+ accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
113
+ if accs.size == 0 or any([acc is None for acc in accs]):
114
+ results[task.benchmark] = 0.0
115
+ continue
116
+
117
+ # We average all scores of a given metric (mostly for mmlu)
118
+ accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
119
+ if accs.size == 0 or any([acc is None for acc in accs]):
120
+ continue
121
+
122
+ mean_acc = np.mean(accs) * 100.0
123
+ results[task.benchmark] = mean_acc
124
+
125
+ return self(
126
+ eval_name=result_key,
127
+ full_model=full_model,
128
+ org=org,
129
+ model=model,
130
+ results=results,
131
+ precision=precision,
132
+ revision= config.get("model_sha", ""),
133
+ still_on_hub=still_on_hub,
134
+ architecture=architecture,
135
+ is_merge=is_merge_from_metadata,
136
+ flagged=flagged,
137
+ )
138
+
139
+ def update_with_request_file(self, requests_path):
140
+ """Finds the relevant request file for the current model and updates info with it"""
141
+ request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
142
+
143
+ try:
144
+ with open(request_file, "r") as f:
145
+ request = json.load(f)
146
+ self.model_type = ModelType.from_str(request.get("model_type", ""))
147
+ self.weight_type = WeightType[request.get("weight_type", "Original")]
148
+ self.license = request.get("license", "?")
149
+ self.likes = request.get("likes", 0)
150
+ self.num_params = request.get("params", 0)
151
+ self.date = request.get("submitted_time", "")
152
+ except Exception:
153
+ print(f"Could not find request file for {self.org}/{self.model}")
154
+
155
+ def to_dict(self):
156
+ """Converts the Eval Result to a dict compatible with our dataframe display"""
157
+
158
+ # Skip the new tasks for now
159
+ # TODO: safely remove this code when the task results are all added
160
+ skip_avg_len = 0
161
+ # if self.results['winogrande'] == 0.0:
162
+ # skip_avg_len += 1
163
+ # if self.results['gsm8k'] == 0.0:
164
+ # skip_avg_len += 1
165
+ # if self.results['eq_bench'] == 0.0:
166
+ # skip_avg_len += 1
167
+ # if self.results['inst_follow'] == 0.0:
168
+ # skip_avg_len += 1
169
+ # if self.results['harmlessness'] == 0.0:
170
+ # skip_avg_len += 1
171
+ # if self.results['helpfulness'] == 0.0:
172
+ # skip_avg_len += 1
173
+
174
+ average = sum([v for k, v in self.results.items() if v is not None and k not in ("daily", "quarterly")]) / (len(Tasks) - skip_avg_len)  # exclude rank columns, which are positions rather than scores
175
+
176
+ data_dict = {
177
+ "eval_name": self.eval_name, # not a column, just a save name,
178
+ AutoEvalColumn.precision.name: self.precision.value.name,
179
+ AutoEvalColumn.model_type.name: self.model_type.value.name,
180
+ AutoEvalColumn.merged.name: self.is_merge,
181
+ AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol, # + "🥦" if self.is_merge,
182
+ AutoEvalColumn.weight_type.name: self.weight_type.value.name,
183
+ AutoEvalColumn.architecture.name: self.architecture,
184
+ AutoEvalColumn.model.name: make_clickable_model(self.full_model),
185
+ AutoEvalColumn.dummy.name: self.full_model,
186
+ AutoEvalColumn.revision.name: self.revision,
187
+ AutoEvalColumn.average.name: average,
188
+ AutoEvalColumn.license.name: self.license,
189
+ AutoEvalColumn.likes.name: self.likes,
190
+ AutoEvalColumn.params.name: self.num_params,
191
+ AutoEvalColumn.still_on_hub.name: self.still_on_hub,
192
+ AutoEvalColumn.flagged.name: self.flagged
193
+ }
194
+
195
+ AllColumns = []
196
+ for task in Tasks:
197
+ AllColumns.append(task.value)
198
+ for rank in Ranks:
199
+ AllColumns.append(rank.value)
200
+
201
+ for a in AllColumns:
202
+ if a.benchmark in ["daily", "quarterly"]:
203
+ data_dict[a.col_name] = self.results[a.benchmark]
204
+ print(a.benchmark, self.results[a.benchmark], a.col_name)
205
+ else:
206
+ data_dict[a.col_name] = self.results[a.benchmark]
207
+
208
+ return data_dict
209
+
210
+
211
+ def get_request_file_for_model(requests_path, model_name, precision):
212
+ """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
213
+ request_files = os.path.join(
214
+ requests_path,
215
+ f"{model_name}_eval_request_*.json",
216
+ )
217
+ request_files = glob.glob(request_files)
218
+
219
+ # Select correct request file (precision)
220
+ request_file = ""
221
+ request_files = sorted(request_files, reverse=True)
222
+ for tmp_request_file in request_files:
223
+ with open(tmp_request_file, "r") as f:
224
+ req_content = json.load(f)
225
+ if (
226
+ req_content["status"] in ["FINISHED"]
227
+ and req_content["precision"] == precision.split(".")[-1]
228
+ ):
229
+ request_file = tmp_request_file
230
+ return request_file
231
+
232
+
233
+ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
234
+ """From the path of the results folder root, extract all needed info for results"""
235
+ model_result_filepaths = []
236
+
237
+ for root, _, files in os.walk(results_path):
238
+ # We should only have json files in model results
239
+ if len(files) == 0 or any([not f.endswith(".json") for f in files]):
240
+ continue
241
+
242
+ # Sort the files by date
243
+ try:
244
+ files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
245
+ except dateutil.parser._parser.ParserError:
246
+ files = [files[-1]]
247
+
248
+ for file in files:
249
+ model_result_filepaths.append(os.path.join(root, file))
250
+
251
+ eval_results = {}
252
+ for model_result_filepath in model_result_filepaths:
253
+ # Creation of result
254
+ eval_result = EvalResult.init_from_json_file(model_result_filepath)
255
+ eval_result.update_with_request_file(requests_path)
256
+
257
+ # Store results of same eval together
258
+ eval_name = eval_result.eval_name
259
+ if eval_name in eval_results.keys():
260
+ eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
261
+ else:
262
+ eval_results[eval_name] = eval_result
263
+
264
+ results = []
265
+ for v in eval_results.values():
266
+ try:
267
+ v.to_dict() # we test if the dict version is complete
268
+ results.append(v)
269
+ except KeyError: # not all eval values present
270
+ continue
271
+
272
+ return results
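
A sketch of exercising the parser on one of the result files bundled in this commit. It assumes the repo layout above, network access for the hub checks, and that the result JSON contains the rank and score keys the parser expects:

```python
from src.leaderboard.read_evals import EvalResult

result = EvalResult.init_from_json_file(
    "eval-results/HuggingFaceH4/zephyr-7b-beta/result.json"
)
result.update_with_request_file("eval-queue")  # prints a warning if no FINISHED request is found

row = result.to_dict()  # raises KeyError if expected benchmarks are missing
print(row["Model"], row["Average ⬆️"])
```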
src/populate.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+ import pandas as pd
5
+
6
+ from src.display.formatting import has_no_nan_values, make_clickable_model
7
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
+ from src.leaderboard.filter_models import filter_models
9
+ from src.leaderboard.read_evals import get_raw_eval_results
10
+
11
+
12
+ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
13
+ raw_data = get_raw_eval_results(results_path, requests_path)
14
+ all_data_json = [v.to_dict() for v in raw_data]
15
+ # all_data_json.append(baseline_row)
16
+ filter_models(all_data_json)
17
+
18
+ df = pd.DataFrame.from_records(all_data_json)
19
+ # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
20
+ df = df.sort_values(by=["Daily Rank"], ascending=True)
21
+
22
+ # print(df[AutoEvalColumn.average.name])
23
+ try:
24
+ df = df[cols].round(decimals=2)
25
+ except Exception:
26
+ pass
27
+
28
+ # filter out if any of the benchmarks have not been produced
29
+ try:
30
+ df = df[has_no_nan_values(df, benchmark_cols)]
31
+ except Exception:
32
+ pass
33
+ return raw_data, df
34
+
35
+
36
+ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
37
+ entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
38
+ all_evals = []
39
+
40
+ for entry in entries:
41
+ if ".json" in entry:
42
+ file_path = os.path.join(save_path, entry)
43
+ with open(file_path) as fp:
44
+ data = json.load(fp)
45
+
46
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
47
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
48
+
49
+ all_evals.append(data)
50
+ elif ".md" not in entry:
51
+ # this is a folder
52
+ sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
53
+ for sub_entry in sub_entries:
54
+ file_path = os.path.join(save_path, entry, sub_entry)
55
+ with open(file_path) as fp:
56
+ data = json.load(fp)
57
+
58
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
59
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
60
+ all_evals.append(data)
61
+
62
+ pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
63
+ running_list = [e for e in all_evals if e["status"] == "RUNNING"]
64
+ finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
65
+ failed_list = [e for e in all_evals if e["status"] == "FAILED"]
66
+ df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
67
+ df_running = pd.DataFrame.from_records(running_list, columns=cols)
68
+ df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
69
+ df_failed = pd.DataFrame.from_records(failed_list, columns=cols)
70
+ return df_finished[cols], df_running[cols], df_pending[cols], df_failed[cols]
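
And, roughly, how the app presumably ties these helpers together (a sketch under the same assumptions as above, not the actual `app.py` entry point):

```python
from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS
from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
from src.populate import get_evaluation_queue_df, get_leaderboard_df

raw_data, leaderboard_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
finished_df, running_df, pending_df, failed_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
print(leaderboard_df.head())
```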
src/submission/__pycache__/check_validity.cpython-310.pyc ADDED
Binary file (4.64 kB). View file
 
src/submission/__pycache__/submit.cpython-310.pyc ADDED
Binary file (3.57 kB). View file