junkim100 committed
Commit 57dfc04 · 1 Parent(s): aebe308

Initial Commit

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. README.md +9 -5
  2. app.py +400 -0
  3. eval-queue/.gitattributes +55 -0
  4. eval-queue/01-ai/Yi-1.5-9B-32K_eval_request_False_float16_Original.json +14 -0
  5. eval-queue/BioMistral/BioMistral-7B_eval_request_False_float16_Original.json +15 -0
  6. eval-queue/EleutherAI/polyglot-ko-1.3b_eval_request_False_float16_Original.json +14 -0
  7. eval-queue/HuggingFaceH4/zephyr-7b-beta_eval_request_False_float16_Original.json +15 -0
  8. eval-queue/README.md +3 -0
  9. eval-queue/nlpai-lab/KULLM3_eval_request_False_float16_Original.json +15 -0
  10. eval-queue/x2bee/POLAR-14B-DPO-v1.3_eval_request_False_float16_Original.json +15 -0
  11. eval-queue/x2bee/POLAR-14B-DPO-v1.4_eval_request_False_float16_Original.json +15 -0
  12. eval-queue/x2bee/POLAR-14B-HES-DPO-v1.5_eval_request_False_float16_Original.json +15 -0
  13. eval-queue/x2bee/POLAR-14B-SON-SFT-v0.1_eval_request_False_float16_Original.json +15 -0
  14. eval-queue/x2bee/POLAR-14B-v0.2_eval_request_False_float16_Original.json +15 -0
  15. eval-queue/x2bee/POLAR-14B-v0.5_eval_request_False_float16_Original.json +15 -0
  16. eval-results/.gitattributes +55 -0
  17. eval-results/01-ai/Yi-1.5-9B-32K/result_2024_07_30 20:36:30.json +450 -0
  18. eval-results/BioMistral/BioMistral-7B/BioMistral_BioMistral-7B_result_2024-05-30 01_33_58.json +450 -0
  19. eval-results/EleutherAI/polyglot-ko-1.3b/EleutherAI_polyglot-ko-1.3b_result_2023-09-24 15_21_38.json +450 -0
  20. eval-results/HuggingFaceH4/.DS_Store +0 -0
  21. eval-results/HuggingFaceH4/zephyr-7b-beta/result.json +450 -0
  22. eval-results/README.md +3 -0
  23. eval-results/nlpai-lab/KULLM3/result.json +450 -0
  24. eval-results/x2bee/POLAR-14B-DPO-v1.3/result.json +450 -0
  25. eval-results/x2bee/POLAR-14B-DPO-v1.4/result.json +450 -0
  26. eval-results/x2bee/POLAR-14B-HES-DPO-v1.5/result.json +450 -0
  27. eval-results/x2bee/POLAR-14B-SON-SFT-v0.1/result.json +450 -0
  28. eval-results/x2bee/POLAR-14B-v0.2/result.json +450 -0
  29. eval-results/x2bee/POLAR-14B-v0.5/result.json +450 -0
  30. requirements.txt +19 -0
  31. scripts/create_request_file.py +107 -0
  32. scripts/update_request_files.py +82 -0
  33. src/__pycache__/envs.cpython-310.pyc +0 -0
  34. src/__pycache__/populate.cpython-310.pyc +0 -0
  35. src/display/__pycache__/about.cpython-310.pyc +0 -0
  36. src/display/__pycache__/css_html_js.cpython-310.pyc +0 -0
  37. src/display/__pycache__/formatting.cpython-310.pyc +0 -0
  38. src/display/__pycache__/utils.cpython-310.pyc +0 -0
  39. src/display/about.py +84 -0
  40. src/display/css_html_js.py +84 -0
  41. src/display/formatting.py +40 -0
  42. src/display/utils.py +164 -0
  43. src/envs.py +32 -0
  44. src/leaderboard/__pycache__/filter_models.cpython-310.pyc +0 -0
  45. src/leaderboard/__pycache__/read_evals.cpython-310.pyc +0 -0
  46. src/leaderboard/filter_models.py +51 -0
  47. src/leaderboard/read_evals.py +272 -0
  48. src/populate.py +70 -0
  49. src/submission/__pycache__/check_validity.cpython-310.pyc +0 -0
  50. src/submission/__pycache__/submit.cpython-310.pyc +0 -0
README.md CHANGED
@@ -1,13 +1,17 @@
  ---
  title: Self Improving Leaderboard
- emoji: 🦀
+ emoji: 🔄
- colorFrom: purple
+ colorFrom: green
- colorTo: green
+ colorTo: indigo
  sdk: gradio
- sdk_version: 4.40.0
+ sdk_version: 4.36.0
  app_file: app.py
- pinned: false
+ pinned: true
  license: apache-2.0
+ duplicated_from: upstage/open-ko-llm-leaderboard
+ fullWidth: true
+ tags:
+ - leaderboard
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,400 @@
+ import gradio as gr
+ import pandas as pd
+ from apscheduler.schedulers.background import BackgroundScheduler
+ from huggingface_hub import snapshot_download
+ from gradio_space_ci import configure_space_ci # FOR CI
+
+ from src.display.about import (
+     EVALUATION_QUEUE_TEXT,
+     INTRODUCTION_TEXT,
+     LLM_BENCHMARKS_TEXT,
+     TITLE,
+ )
+ from src.display.css_html_js import custom_css
+ from src.display.utils import (
+     BENCHMARK_COLS,
+     COLS,
+     EVAL_COLS,
+     EVAL_TYPES,
+     NUMERIC_INTERVALS,
+     TYPES,
+     AutoEvalColumn,
+     ModelType,
+     fields,
+     WeightType,
+     Precision
+ )
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
+ from src.submission.submit import add_new_eval
+ from src.tools.collections import update_collections
+ from src.tools.plots import (
+     create_metric_plot_obj,
+     create_plot_df,
+     create_scores_df,
+ )
+
+
+ def restart_space():
+     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
+
+ try:
+     print(EVAL_REQUESTS_PATH)
+     snapshot_download(
+         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+     )
+ except Exception:
+     restart_space()
+ try:
+     print(EVAL_RESULTS_PATH)
+     snapshot_download(
+         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+     )
+ except Exception:
+     restart_space()
+
+
+ _, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+ leaderboard_df = original_df.copy()
+
+ (
+     finished_eval_queue_df,
+     running_eval_queue_df,
+     pending_eval_queue_df,
+     failed_eval_queue_df,
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+
+ # Searching and filtering
+ def update_table(
+     hidden_df: pd.DataFrame,
+     columns: list,
+     type_query: list,
+     precision_query: str,
+     size_query: list,
+     show_deleted: bool,
+     show_merges: bool,
+     show_flagged: bool,
+     query: str,
+ ):
+     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query)
+     filtered_df = filter_queries(query, filtered_df)
+     df = select_columns(filtered_df, columns)
+     return df
+
+ def quarter_update_table(
+     hidden_df: pd.DataFrame,
+     columns: list,
+     type_query: list,
+     precision_query: str,
+     size_query: list,
+     show_deleted: bool,
+     show_merges: bool,
+     show_flagged: bool,
+     query: str,
+ ):
+     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query)
+     filtered_df = filter_queries(query, filtered_df)
+     df = quarter_select_columns(filtered_df, columns)
+     return df
+
+
+ def load_query(request: gr.Request):  # triggered only once at startup => read query parameter if it exists
+     query = request.query_params.get("query") or ""
+     return query, query  # return one for the "search_bar", one for a hidden component that triggers a reload only if value has changed
+
+
+ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
+     return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
+
+
+ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
+     always_here_cols = [
+         AutoEvalColumn.model_type_symbol.name,
+         AutoEvalColumn.model.name,
+     ]
+     # We use COLS to maintain sorting
+     filtered_df = df[
+         always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
+     ]
+     return filtered_df
+
+
+ def filter_queries(query: str, filtered_df: pd.DataFrame):
+     """Added by Abishek"""
+     final_df = []
+     if query != "":
+         queries = [q.strip() for q in query.split(";")]
+         for _q in queries:
+             _q = _q.strip()
+             if _q != "":
+                 temp_filtered_df = search_table(filtered_df, _q)
+                 if len(temp_filtered_df) > 0:
+                     final_df.append(temp_filtered_df)
+         if len(final_df) > 0:
+             filtered_df = pd.concat(final_df)
+             filtered_df = filtered_df.drop_duplicates(
+                 subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
+             )
+
+     return filtered_df
+
+
+ def filter_models(
+     df: pd.DataFrame, type_query: list, size_query: list, precision_query: list
+ ) -> pd.DataFrame:
+
+     type_emoji = [t[0] for t in type_query]
+     df = df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
+     df = df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
+     numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
+     params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
+     mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
+     df = df.loc[mask]
+
+     return df
+
+
+ leaderboard_df = filter_models(leaderboard_df, [t.to_str(" : ") for t in ModelType], list(NUMERIC_INTERVALS.keys()), [i.value.name for i in Precision])
+
+ print(leaderboard_df)
+
+ demo = gr.Blocks(css=custom_css)
+ with demo:
+     gr.HTML(TITLE)
+     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("🔄 Self-Improving Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+             with gr.Row():
+                 with gr.Column():
+                     with gr.Row():
+                         search_bar = gr.Textbox(
+                             placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                             show_label=False,
+                             elem_id="search-bar",
+                         )
+                     with gr.Row():
+                         shown_columns = gr.CheckboxGroup(
+                             choices=[
+                                 c.name
+                                 for c in fields(AutoEvalColumn)
+                                 if not c.hidden and not c.never_hidden and not c.dummy
+                             ],
+                             value=[
+                                 c.name
+                                 for c in fields(AutoEvalColumn)
+                                 if c.displayed_by_default and not c.hidden and not c.never_hidden
+                             ],
+                             label="Select columns to show",
+                             elem_id="column-select",
+                             interactive=True,
+                         )
+
+                 with gr.Column(min_width=320):
+                     #with gr.Box(elem_id="box-filter"):
+                     filter_columns_type = gr.CheckboxGroup(
+                         label="Model types",
+                         choices=[t.to_str() for t in ModelType],
+                         value=[t.to_str() for t in ModelType],
+                         interactive=True,
+                         elem_id="filter-columns-type",
+                     )
+                     filter_columns_precision = gr.CheckboxGroup(
+                         label="Precision",
+                         choices=[i.value.name for i in Precision],
+                         value=[i.value.name for i in Precision],
+                         interactive=True,
+                         elem_id="filter-columns-precision",
+                     )
+                     filter_columns_size = gr.CheckboxGroup(
+                         label="Model sizes (in billions of parameters)",
+                         choices=list(NUMERIC_INTERVALS.keys()),
+                         value=list(NUMERIC_INTERVALS.keys()),
+                         interactive=True,
+                         elem_id="filter-columns-size",
+                     )
+
+             leaderboard_table = gr.components.Dataframe(
+                 value=leaderboard_df[
+                     [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+                     + shown_columns.value
+                     + [AutoEvalColumn.dummy.name]
+                 ],
+                 headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                 datatype=TYPES,
+                 elem_id="leaderboard-table",
+                 interactive=False,
+                 visible=True,
+                 #column_widths=["2%", "33%"]
+             )
+
+             # Dummy leaderboard for handling the case when the user uses backspace key
+             hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                 value=original_df[COLS],
+                 headers=COLS,
+                 datatype=TYPES,
+                 visible=False,
+             )
+             search_bar.submit(
+                 update_table,
+                 [
+                     hidden_leaderboard_table_for_search,
+                     shown_columns,
+                     filter_columns_type,
+                     filter_columns_precision,
+                     filter_columns_size,
+                     search_bar,
+                 ],
+                 leaderboard_table,
+             )
+
+             # Define a hidden component that will trigger a reload only if a query parameter has been set
+             hidden_search_bar = gr.Textbox(value="", visible=False)
+             hidden_search_bar.change(
+                 update_table,
+                 [
+                     hidden_leaderboard_table_for_search,
+                     shown_columns,
+                     filter_columns_type,
+                     filter_columns_precision,
+                     filter_columns_size,
+                     search_bar,
+                 ],
+                 leaderboard_table,
+             )
+             # Check query parameter once at startup and update search bar + hidden component
+             demo.load(load_query, inputs=[], outputs=[search_bar, hidden_search_bar])
+
+             for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size]:
+                 selector.change(
+                     update_table,
+                     [
+                         hidden_leaderboard_table_for_search,
+                         shown_columns,
+                         filter_columns_type,
+                         filter_columns_precision,
+                         filter_columns_size,
+                         search_bar,
+                     ],
+                     leaderboard_table,
+                     queue=True,
+                 )
+
+         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
+             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=2):
+             with gr.Column():
+                 with gr.Row():
+                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                 with gr.Column():
+                     with gr.Accordion(
+                         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             finished_eval_table = gr.components.Dataframe(
+                                 value=finished_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+                     with gr.Accordion(
+                         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             running_eval_table = gr.components.Dataframe(
+                                 value=running_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+
+                     with gr.Accordion(
+                         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             pending_eval_table = gr.components.Dataframe(
+                                 value=pending_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+                     with gr.Accordion(
+                         f"❌ Failed Evaluations ({len(failed_eval_queue_df)})",
+                         open=False,
+                     ):
+                         with gr.Row():
+                             pending_eval_table = gr.components.Dataframe(
+                                 value=failed_eval_queue_df,
+                                 headers=EVAL_COLS,
+                                 datatype=EVAL_TYPES,
+                                 row_count=5,
+                             )
+             with gr.Row():
+                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+             with gr.Row():
+                 with gr.Column():
+                     model_name_textbox = gr.Textbox(label="Model name")
+                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                     private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
+                     model_type = gr.Dropdown(
+                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                         label="Model type",
+                         multiselect=False,
+                         value=ModelType.IFT.to_str(" : "),
+                         interactive=True,
+                     )
+
+                 with gr.Column():
+                     precision = gr.Dropdown(
+                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                         label="Precision",
+                         multiselect=False,
+                         value="float16",
+                         interactive=True,
+                     )
+                     weight_type = gr.Dropdown(
+                         choices=[i.value.name for i in WeightType],
+                         label="Weights type",
+                         multiselect=False,
+                         value="Original",
+                         interactive=True,
+                     )
+                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+             submit_button = gr.Button("Submit Evaluation!")
+             submission_result = gr.Markdown()
+             submit_button.click(
+                 add_new_eval,
+                 [
+                     model_name_textbox,
+                     base_model_name_textbox,
+                     revision_name_textbox,
+                     precision,
+                     private,
+                     weight_type,
+                     model_type,
+                 ],
+                 submission_result,
+             )
+
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, "interval", seconds=1800)
+ scheduler.start()
+
+ # Launches both the space and its CI
+ configure_space_ci(
+     demo.queue(default_concurrency_limit=40),
+     trusted_authors=[],  # add manually trusted authors
+     private="True",  # ephemeral spaces will have same visibility as the main space. Otherwise, set to `True` or `False` explicitly.
+     variables={},  # We overwrite HF_HOME as tmp CI spaces will have no cache
+     secrets=["HF_TOKEN", "H4_TOKEN"],  # which secret do I want to copy from the main space? Can be a `List[str]`.
+     hardware=None,  # "cpu-basic" by default. Otherwise set to "auto" to have same hardware as the main space or any valid string value.
+     storage=None,  # no storage by default. Otherwise set to "auto" to have same storage as the main space or any valid string value.
+ ).launch()
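Editor's note: the size filter in `filter_models` above keeps a row only when its parameter count falls inside one of the `NUMERIC_INTERVALS` buckets selected in the UI, via `pd.IntervalIndex.contains`. Below is a minimal, self-contained sketch of that pattern; the bucket labels, interval bounds, and column names are illustrative assumptions, not the leaderboard's actual definitions (which live in `src/display/utils.py`).

```python
import pandas as pd

# Hypothetical size buckets keyed by the label shown in the checkbox group.
# The real NUMERIC_INTERVALS is defined in src/display/utils.py; these values are assumptions.
NUMERIC_INTERVALS = {
    "~1.5": pd.Interval(0, 3, closed="right"),
    "~7": pd.Interval(3, 10, closed="right"),
    "~14": pd.Interval(10, 20, closed="right"),
}

def filter_by_size(df: pd.DataFrame, size_query: list) -> pd.DataFrame:
    """Keep rows whose parameter count falls inside any selected interval (same idea as filter_models)."""
    intervals = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
    params = pd.to_numeric(df["#Params (B)"], errors="coerce")  # non-numeric values become NaN and never match
    mask = params.apply(lambda x: any(intervals.contains(x)))
    return df.loc[mask]

toy = pd.DataFrame({"model": ["a", "b", "c"], "#Params (B)": [1.4, 7.2, 14.2]})
print(filter_by_size(toy, ["~7", "~14"]))  # keeps b and c
```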
eval-queue/.gitattributes ADDED
@@ -0,0 +1,55 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ # Audio files - uncompressed
+ *.pcm filter=lfs diff=lfs merge=lfs -text
+ *.sam filter=lfs diff=lfs merge=lfs -text
+ *.raw filter=lfs diff=lfs merge=lfs -text
+ # Audio files - compressed
+ *.aac filter=lfs diff=lfs merge=lfs -text
+ *.flac filter=lfs diff=lfs merge=lfs -text
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
+ *.ogg filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
+ # Image files - uncompressed
+ *.bmp filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.tiff filter=lfs diff=lfs merge=lfs -text
+ # Image files - compressed
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
+ *.webp filter=lfs diff=lfs merge=lfs -text
eval-queue/01-ai/Yi-1.5-9B-32K_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
+ {
+     "model": "01-ai/Yi-1.5-9B-32K",
+     "base_model": "",
+     "revision": "c0239dbc923b8a2b5ca849763bdd592d39c60850",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2024-07-29T13:10:13Z",
+     "model_type": "\ud83d\udfe2 : pretrained",
+     "likes": 18,
+     "params": 8.829,
+     "license": "apache-2.0"
+ }
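Editor's note: request files like the one above are what `get_evaluation_queue_df` walks to build the finished/running/pending/failed tables in `app.py`. The sketch below shows one way to read such files into a dataframe, assuming only the fields visible in this commit; it is an illustration, not the actual `src/populate.py` implementation.

```python
import glob
import json
import os

import pandas as pd

def load_eval_requests(requests_dir: str) -> pd.DataFrame:
    """Collect every eval-request JSON under the queue directory into one dataframe (illustrative sketch)."""
    rows = []
    for path in glob.glob(os.path.join(requests_dir, "**", "*.json"), recursive=True):
        with open(path, encoding="utf-8") as f:
            entry = json.load(f)
        rows.append(
            {
                "model": entry.get("model"),
                "revision": entry.get("revision", "main"),
                "precision": entry.get("precision"),
                "weight_type": entry.get("weight_type"),
                "status": entry.get("status", "PENDING"),
                "submitted_time": entry.get("submitted_time"),
            }
        )
    return pd.DataFrame(rows)

# Example usage: df = load_eval_requests("eval-queue"); finished = df[df["status"] == "FINISHED"]
```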
eval-queue/BioMistral/BioMistral-7B_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "model": "BioMistral/BioMistral-7B",
+     "base_model": "",
+     "revision": "main",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2024-05-30 01:33:58",
+     "model_type": "\u2b55 : instruction-tuned",
+     "job_id": "2031",
+     "params": 7.0,
+     "likes": 354,
+     "license": "apache-2.0"
+ }
eval-queue/EleutherAI/polyglot-ko-1.3b_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,14 @@
+ {
+     "model": "EleutherAI/polyglot-ko-1.3b",
+     "base_model": "",
+     "revision": "main",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2024-07-25T11:04:40Z",
+     "model_type": "\ud83d\udfe2 : pretrained",
+     "likes": 71,
+     "params": 1.432,
+     "license": "apache-2.0"
+ }
eval-queue/HuggingFaceH4/zephyr-7b-beta_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "model": "HuggingFaceH4/zephyr-7b-beta",
+     "base_model": "",
+     "revision": "main",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2023-11-01 04:21:47",
+     "model_type": "\u2b55 : instruction-tuned",
+     "job_id": "401",
+     "params": 7.242,
+     "likes": 1162,
+     "license": "mit"
+ }
eval-queue/README.md ADDED
@@ -0,0 +1,3 @@
+ ---
+ license: apache-2.0
+ ---
eval-queue/nlpai-lab/KULLM3_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "model": "nlpai-lab/KULLM3",
+     "base_model": "",
+     "revision": "main",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2024-04-08 05:16:47",
+     "model_type": "\u2b55 : instruction-tuned",
+     "job_id": "1751",
+     "params": 10.732000350952148,
+     "likes": 13,
+     "license": "cc-by-nc-4.0"
+ }
eval-queue/x2bee/POLAR-14B-DPO-v1.3_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "model": "x2bee/POLAR-14B-DPO-v1.3",
+     "base_model": "",
+     "revision": "main",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2024-05-23 11:59:50",
+     "model_type": "\u2b55 : instruction-tuned",
+     "job_id": "1987",
+     "params": 14.220999717712402,
+     "likes": 0,
+     "license": "apache-2.0"
+ }
eval-queue/x2bee/POLAR-14B-DPO-v1.4_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "model": "x2bee/POLAR-14B-DPO-v1.4",
+     "base_model": "",
+     "revision": "main",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2024-05-27 15:02:47",
+     "model_type": "\u2b55 : instruction-tuned",
+     "job_id": "2004",
+     "params": 14.220999717712402,
+     "likes": 0,
+     "license": "apache-2.0"
+ }
eval-queue/x2bee/POLAR-14B-HES-DPO-v1.5_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "model": "x2bee/POLAR-14B-HES-DPO-v1.5",
+     "base_model": "",
+     "revision": "main",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2024-05-29 23:53:33",
+     "model_type": "\u2b55 : instruction-tuned",
+     "job_id": "2029",
+     "params": 14.220999717712402,
+     "likes": 0,
+     "license": "apache-2.0"
+ }
eval-queue/x2bee/POLAR-14B-SON-SFT-v0.1_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "model": "x2bee/POLAR-14B-SON-SFT-v0.1",
+     "base_model": "x2bee/POLAR-14B-v0.2",
+     "revision": "main",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2024-05-27 13:52:58",
+     "model_type": "\u2b55 : instruction-tuned",
+     "job_id": "2003",
+     "params": 14.220999717712402,
+     "likes": 0,
+     "license": "apache-2.0"
+ }
eval-queue/x2bee/POLAR-14B-v0.2_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "model": "x2bee/POLAR-14B-v0.2",
+     "base_model": "",
+     "revision": "main",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2024-05-02 00:34:33",
+     "model_type": "\ud83d\udfe2 : pretrained",
+     "job_id": "1874",
+     "params": 14.220999717712402,
+     "likes": 0,
+     "license": "apache-2.0"
+ }
eval-queue/x2bee/POLAR-14B-v0.5_eval_request_False_float16_Original.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "model": "x2bee/POLAR-14B-v0.5",
+     "base_model": "",
+     "revision": "main",
+     "private": false,
+     "precision": "float16",
+     "weight_type": "Original",
+     "status": "FINISHED",
+     "submitted_time": "2024-06-05 00:49:59",
+     "model_type": "\ud83d\udfe2 : pretrained",
+     "job_id": "2041",
+     "params": 14.220999717712402,
+     "likes": 0,
+     "license": "apache-2.0"
+ }
eval-results/.gitattributes ADDED
@@ -0,0 +1,55 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ # Audio files - uncompressed
+ *.pcm filter=lfs diff=lfs merge=lfs -text
+ *.sam filter=lfs diff=lfs merge=lfs -text
+ *.raw filter=lfs diff=lfs merge=lfs -text
+ # Audio files - compressed
+ *.aac filter=lfs diff=lfs merge=lfs -text
+ *.flac filter=lfs diff=lfs merge=lfs -text
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
+ *.ogg filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
+ # Image files - uncompressed
+ *.bmp filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.tiff filter=lfs diff=lfs merge=lfs -text
+ # Image files - compressed
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
+ *.webp filter=lfs diff=lfs merge=lfs -text
eval-results/01-ai/Yi-1.5-9B-32K/result_2024_07_30 20:36:30.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 7
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 7
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.29948805460750855,
11
+ "acc_stderr": 0.013385021637313567,
12
+ "acc_norm": 0.3506825938566553,
13
+ "acc_norm_stderr": 0.013944635930726089
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.3333001394144593,
17
+ "acc_stderr": 0.004704293898729902,
18
+ "acc_norm": 0.4137621987651862,
19
+ "acc_norm_stderr": 0.004915003499517831
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.47953216374269003,
23
+ "acc_stderr": 0.0383161053282193,
24
+ "acc_norm": 0.47953216374269003,
25
+ "acc_norm_stderr": 0.0383161053282193
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.5631067961165048,
29
+ "acc_stderr": 0.049111471073657764,
30
+ "acc_norm": 0.5631067961165048,
31
+ "acc_norm_stderr": 0.049111471073657764
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.47509578544061304,
35
+ "acc_stderr": 0.01785777070490102,
36
+ "acc_norm": 0.47509578544061304,
37
+ "acc_norm_stderr": 0.01785777070490102
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.28888888888888886,
41
+ "acc_stderr": 0.0391545063041425,
42
+ "acc_norm": 0.28888888888888886,
43
+ "acc_norm_stderr": 0.0391545063041425
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.31,
47
+ "acc_stderr": 0.04648231987117316,
48
+ "acc_norm": 0.31,
49
+ "acc_norm_stderr": 0.04648231987117316
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.46808510638297873,
53
+ "acc_stderr": 0.03261936918467382,
54
+ "acc_norm": 0.46808510638297873,
55
+ "acc_norm_stderr": 0.03261936918467382
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.45180722891566266,
59
+ "acc_stderr": 0.03874371556587953,
60
+ "acc_norm": 0.45180722891566266,
61
+ "acc_norm_stderr": 0.03874371556587953
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.47266881028938906,
65
+ "acc_stderr": 0.028355633568328188,
66
+ "acc_norm": 0.47266881028938906,
67
+ "acc_norm_stderr": 0.028355633568328188
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.45739910313901344,
71
+ "acc_stderr": 0.033435777055830646,
72
+ "acc_norm": 0.45739910313901344,
73
+ "acc_norm_stderr": 0.033435777055830646
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.5267175572519084,
77
+ "acc_stderr": 0.04379024936553894,
78
+ "acc_norm": 0.5267175572519084,
79
+ "acc_norm_stderr": 0.04379024936553894
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.39,
83
+ "acc_stderr": 0.04902071300001975,
84
+ "acc_norm": 0.39,
85
+ "acc_norm_stderr": 0.04902071300001975
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.5555555555555556,
89
+ "acc_stderr": 0.035402943770953675,
90
+ "acc_norm": 0.5555555555555556,
91
+ "acc_norm_stderr": 0.035402943770953675
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.5724137931034483,
95
+ "acc_stderr": 0.04122737111370332,
96
+ "acc_norm": 0.5724137931034483,
97
+ "acc_norm_stderr": 0.04122737111370332
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.3137254901960784,
101
+ "acc_stderr": 0.04617034827006716,
102
+ "acc_norm": 0.3137254901960784,
103
+ "acc_norm_stderr": 0.04617034827006716
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.5,
107
+ "acc_stderr": 0.032478490123081544,
108
+ "acc_norm": 0.5,
109
+ "acc_norm_stderr": 0.032478490123081544
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.47692307692307695,
113
+ "acc_stderr": 0.025323990861736125,
114
+ "acc_norm": 0.47692307692307695,
115
+ "acc_norm_stderr": 0.025323990861736125
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.53,
119
+ "acc_stderr": 0.05016135580465919,
120
+ "acc_norm": 0.53,
121
+ "acc_norm_stderr": 0.05016135580465919
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.37,
125
+ "acc_stderr": 0.048523658709391,
126
+ "acc_norm": 0.37,
127
+ "acc_norm_stderr": 0.048523658709391
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.5740740740740741,
131
+ "acc_stderr": 0.047803436269367894,
132
+ "acc_norm": 0.5740740740740741,
133
+ "acc_norm_stderr": 0.047803436269367894
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.4187192118226601,
137
+ "acc_stderr": 0.03471192860518468,
138
+ "acc_norm": 0.4187192118226601,
139
+ "acc_norm_stderr": 0.03471192860518468
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.47419354838709676,
143
+ "acc_stderr": 0.02840609505765332,
144
+ "acc_norm": 0.47419354838709676,
145
+ "acc_norm_stderr": 0.02840609505765332
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.6752136752136753,
149
+ "acc_stderr": 0.03067902276549883,
150
+ "acc_norm": 0.6752136752136753,
151
+ "acc_norm_stderr": 0.03067902276549883
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.44150943396226416,
155
+ "acc_stderr": 0.030561590426731833,
156
+ "acc_norm": 0.44150943396226416,
157
+ "acc_norm_stderr": 0.030561590426731833
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.4727272727272727,
161
+ "acc_stderr": 0.04782001791380063,
162
+ "acc_norm": 0.4727272727272727,
163
+ "acc_norm_stderr": 0.04782001791380063
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.4185185185185185,
167
+ "acc_stderr": 0.030078013075022066,
168
+ "acc_norm": 0.4185185185185185,
169
+ "acc_norm_stderr": 0.030078013075022066
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.304635761589404,
173
+ "acc_stderr": 0.03757949922943343,
174
+ "acc_norm": 0.304635761589404,
175
+ "acc_norm_stderr": 0.03757949922943343
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.6069651741293532,
179
+ "acc_stderr": 0.0345368246603156,
180
+ "acc_norm": 0.6069651741293532,
181
+ "acc_norm_stderr": 0.0345368246603156
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.4046242774566474,
185
+ "acc_stderr": 0.03742461193887248,
186
+ "acc_norm": 0.4046242774566474,
187
+ "acc_norm_stderr": 0.03742461193887248
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.5476190476190477,
191
+ "acc_stderr": 0.02563425811555495,
192
+ "acc_norm": 0.5476190476190477,
193
+ "acc_norm_stderr": 0.02563425811555495
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.3472222222222222,
197
+ "acc_stderr": 0.039812405437178615,
198
+ "acc_norm": 0.3472222222222222,
199
+ "acc_norm_stderr": 0.039812405437178615
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.33,
203
+ "acc_stderr": 0.04725815626252605,
204
+ "acc_norm": 0.33,
205
+ "acc_norm_stderr": 0.04725815626252605
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.57,
209
+ "acc_stderr": 0.04975698519562426,
210
+ "acc_norm": 0.57,
211
+ "acc_norm_stderr": 0.04975698519562426
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.49710982658959535,
215
+ "acc_stderr": 0.026918645383239015,
216
+ "acc_norm": 0.49710982658959535,
217
+ "acc_norm_stderr": 0.026918645383239015
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.5276073619631901,
221
+ "acc_stderr": 0.03922378290610991,
222
+ "acc_norm": 0.5276073619631901,
223
+ "acc_norm_stderr": 0.03922378290610991
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.49691358024691357,
227
+ "acc_stderr": 0.027820214158594377,
228
+ "acc_norm": 0.49691358024691357,
229
+ "acc_norm_stderr": 0.027820214158594377
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.45,
233
+ "acc_stderr": 0.05,
234
+ "acc_norm": 0.45,
235
+ "acc_norm_stderr": 0.05
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.49222797927461137,
239
+ "acc_stderr": 0.03608003225569654,
240
+ "acc_norm": 0.49222797927461137,
241
+ "acc_norm_stderr": 0.03608003225569654
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.41228070175438597,
245
+ "acc_stderr": 0.046306532033665956,
246
+ "acc_norm": 0.41228070175438597,
247
+ "acc_norm_stderr": 0.046306532033665956
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.5027522935779817,
251
+ "acc_stderr": 0.02143699835976532,
252
+ "acc_norm": 0.5027522935779817,
253
+ "acc_norm_stderr": 0.02143699835976532
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.40476190476190477,
257
+ "acc_stderr": 0.04390259265377561,
258
+ "acc_norm": 0.40476190476190477,
259
+ "acc_norm_stderr": 0.04390259265377561
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.49019607843137253,
263
+ "acc_stderr": 0.028624412550167958,
264
+ "acc_norm": 0.49019607843137253,
265
+ "acc_norm_stderr": 0.028624412550167958
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.5,
269
+ "acc_stderr": 0.050251890762960605,
270
+ "acc_norm": 0.5,
271
+ "acc_norm_stderr": 0.050251890762960605
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.7355371900826446,
275
+ "acc_stderr": 0.04026187527591205,
276
+ "acc_norm": 0.7355371900826446,
277
+ "acc_norm_stderr": 0.04026187527591205
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.45394736842105265,
281
+ "acc_stderr": 0.04051646342874142,
282
+ "acc_norm": 0.45394736842105265,
283
+ "acc_norm_stderr": 0.04051646342874142
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.39705882352941174,
287
+ "acc_stderr": 0.019794488900024113,
288
+ "acc_norm": 0.39705882352941174,
289
+ "acc_norm_stderr": 0.019794488900024113
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.40070921985815605,
293
+ "acc_stderr": 0.029233465745573086,
294
+ "acc_norm": 0.40070921985815605,
295
+ "acc_norm_stderr": 0.029233465745573086
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.39285714285714285,
299
+ "acc_stderr": 0.04635550135609976,
300
+ "acc_norm": 0.39285714285714285,
301
+ "acc_norm_stderr": 0.04635550135609976
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.4675925925925926,
305
+ "acc_stderr": 0.034028015813589656,
306
+ "acc_norm": 0.4675925925925926,
307
+ "acc_norm_stderr": 0.034028015813589656
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.3329608938547486,
311
+ "acc_stderr": 0.015761716178397552,
312
+ "acc_norm": 0.3329608938547486,
313
+ "acc_norm_stderr": 0.015761716178397552
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.43,
317
+ "acc_stderr": 0.049756985195624284,
318
+ "acc_norm": 0.43,
319
+ "acc_norm_stderr": 0.049756985195624284
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.76,
323
+ "acc_stderr": 0.042923469599092816,
324
+ "acc_norm": 0.76,
325
+ "acc_norm_stderr": 0.042923469599092816
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.35294117647058826,
329
+ "acc_stderr": 0.029029422815681404,
330
+ "acc_norm": 0.35294117647058826,
331
+ "acc_norm_stderr": 0.029029422815681404
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.6163265306122448,
335
+ "acc_stderr": 0.031130880396235943,
336
+ "acc_norm": 0.6163265306122448,
337
+ "acc_norm_stderr": 0.031130880396235943
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.5654008438818565,
341
+ "acc_stderr": 0.03226759995510145,
342
+ "acc_norm": 0.5654008438818565,
343
+ "acc_norm_stderr": 0.03226759995510145
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.36571056062581486,
347
+ "acc_stderr": 0.012301028188840567,
348
+ "acc_norm": 0.36571056062581486,
349
+ "acc_norm_stderr": 0.012301028188840567
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.4852941176470588,
353
+ "acc_stderr": 0.03507793834791324,
354
+ "acc_norm": 0.4852941176470588,
355
+ "acc_norm_stderr": 0.03507793834791324
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.5151515151515151,
359
+ "acc_stderr": 0.03902551007374448,
360
+ "acc_norm": 0.5151515151515151,
361
+ "acc_norm_stderr": 0.03902551007374448
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.2937576499388005,
365
+ "mc1_stderr": 0.015945068581236614,
366
+ "mc2": 0.4670848140389129,
367
+ "mc2_stderr": 0.01585178282587417
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.47107438016528924,
371
+ "acc_stderr": 0.017161563949916348,
372
+ "acc_norm": 0.5171192443919717,
373
+ "acc_norm_stderr": 0.017180275246085626
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "01-ai/Yi-1.5-9B-32K",
442
+ "model_sha": "c0239dbc923b8a2b5ca849763bdd592d39c60850",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
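Editor's note: result files like the one above store one `acc`/`acc_norm` (or `mc1`/`mc2`) block per harness task, which the leaderboard rolls up into its score columns. The sketch below computes a simple macro-average over the `harness|...` entries of a single result file; the exact aggregation performed by `get_leaderboard_df` may differ, so treat this as an assumption-laden illustration rather than the leaderboard's formula.

```python
import json

def average_accuracy(result_path: str) -> float:
    """Macro-average acc_norm (falling back to acc, then mc2) over the harness|... tasks of one result file."""
    with open(result_path, encoding="utf-8") as f:
        results = json.load(f)["results"]
    scores = []
    for task, metrics in results.items():
        if not task.startswith("harness|"):
            continue  # skip the daily/quarterly bookkeeping entries
        value = metrics.get("acc_norm", metrics.get("acc", metrics.get("mc2")))
        if value is not None:
            scores.append(value)
    return sum(scores) / len(scores) if scores else float("nan")

# Example usage (path taken from this commit):
# average_accuracy("eval-results/01-ai/Yi-1.5-9B-32K/result_2024_07_30 20:36:30.json")
```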
eval-results/BioMistral/BioMistral-7B/BioMistral_BioMistral-7B_result_2024-05-30 01_33_58.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 10
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 10
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.257679180887372,
11
+ "acc_stderr": 0.012780770562768416,
12
+ "acc_norm": 0.3122866894197952,
13
+ "acc_norm_stderr": 0.013542598541688065
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.3229436367257518,
17
+ "acc_stderr": 0.004666457279979418,
18
+ "acc_norm": 0.39255128460466043,
19
+ "acc_norm_stderr": 0.004873203269366306
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.34502923976608185,
23
+ "acc_stderr": 0.036459813773888065,
24
+ "acc_norm": 0.34502923976608185,
25
+ "acc_norm_stderr": 0.036459813773888065
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.4368932038834951,
29
+ "acc_stderr": 0.04911147107365778,
30
+ "acc_norm": 0.4368932038834951,
31
+ "acc_norm_stderr": 0.04911147107365778
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.3780332056194125,
35
+ "acc_stderr": 0.017339844462104625,
36
+ "acc_norm": 0.3780332056194125,
37
+ "acc_norm_stderr": 0.017339844462104625
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.3037037037037037,
41
+ "acc_stderr": 0.039725528847851355,
42
+ "acc_norm": 0.3037037037037037,
43
+ "acc_norm_stderr": 0.039725528847851355
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.37,
47
+ "acc_stderr": 0.04852365870939099,
48
+ "acc_norm": 0.37,
49
+ "acc_norm_stderr": 0.04852365870939099
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.28085106382978725,
53
+ "acc_stderr": 0.02937917046412482,
54
+ "acc_norm": 0.28085106382978725,
55
+ "acc_norm_stderr": 0.02937917046412482
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.3373493975903614,
59
+ "acc_stderr": 0.03680783690727581,
60
+ "acc_norm": 0.3373493975903614,
61
+ "acc_norm_stderr": 0.03680783690727581
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.3954983922829582,
65
+ "acc_stderr": 0.027770918531427838,
66
+ "acc_norm": 0.3954983922829582,
67
+ "acc_norm_stderr": 0.027770918531427838
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.34977578475336324,
71
+ "acc_stderr": 0.03200736719484503,
72
+ "acc_norm": 0.34977578475336324,
73
+ "acc_norm_stderr": 0.03200736719484503
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.3969465648854962,
77
+ "acc_stderr": 0.04291135671009224,
78
+ "acc_norm": 0.3969465648854962,
79
+ "acc_norm_stderr": 0.04291135671009224
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.42,
83
+ "acc_stderr": 0.049604496374885836,
84
+ "acc_norm": 0.42,
85
+ "acc_norm_stderr": 0.049604496374885836
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.4292929292929293,
89
+ "acc_stderr": 0.03526552724601199,
90
+ "acc_norm": 0.4292929292929293,
91
+ "acc_norm_stderr": 0.03526552724601199
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.4,
95
+ "acc_stderr": 0.04082482904638628,
96
+ "acc_norm": 0.4,
97
+ "acc_norm_stderr": 0.04082482904638628
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.30392156862745096,
101
+ "acc_stderr": 0.045766654032077636,
102
+ "acc_norm": 0.30392156862745096,
103
+ "acc_norm_stderr": 0.045766654032077636
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.40336134453781514,
107
+ "acc_stderr": 0.031866081214088314,
108
+ "acc_norm": 0.40336134453781514,
109
+ "acc_norm_stderr": 0.031866081214088314
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.40512820512820513,
113
+ "acc_stderr": 0.024890471769938145,
114
+ "acc_norm": 0.40512820512820513,
115
+ "acc_norm_stderr": 0.024890471769938145
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.48,
119
+ "acc_stderr": 0.050211673156867795,
120
+ "acc_norm": 0.48,
121
+ "acc_norm_stderr": 0.050211673156867795
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.32,
125
+ "acc_stderr": 0.04688261722621505,
126
+ "acc_norm": 0.32,
127
+ "acc_norm_stderr": 0.04688261722621505
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.49074074074074076,
131
+ "acc_stderr": 0.04832853553437055,
132
+ "acc_norm": 0.49074074074074076,
133
+ "acc_norm_stderr": 0.04832853553437055
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.37438423645320196,
137
+ "acc_stderr": 0.03405155380561952,
138
+ "acc_norm": 0.37438423645320196,
139
+ "acc_norm_stderr": 0.03405155380561952
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.36774193548387096,
143
+ "acc_stderr": 0.027430866579973474,
144
+ "acc_norm": 0.36774193548387096,
145
+ "acc_norm_stderr": 0.027430866579973474
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.5598290598290598,
149
+ "acc_stderr": 0.0325207417206305,
150
+ "acc_norm": 0.5598290598290598,
151
+ "acc_norm_stderr": 0.0325207417206305
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.3886792452830189,
155
+ "acc_stderr": 0.030000485448675986,
156
+ "acc_norm": 0.3886792452830189,
157
+ "acc_norm_stderr": 0.030000485448675986
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.44545454545454544,
161
+ "acc_stderr": 0.047605488214603246,
162
+ "acc_norm": 0.44545454545454544,
163
+ "acc_norm_stderr": 0.047605488214603246
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.34444444444444444,
167
+ "acc_stderr": 0.028972648884844267,
168
+ "acc_norm": 0.34444444444444444,
169
+ "acc_norm_stderr": 0.028972648884844267
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.3443708609271523,
173
+ "acc_stderr": 0.038796870240733264,
174
+ "acc_norm": 0.3443708609271523,
175
+ "acc_norm_stderr": 0.038796870240733264
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.4577114427860697,
179
+ "acc_stderr": 0.035228658640995975,
180
+ "acc_norm": 0.4577114427860697,
181
+ "acc_norm_stderr": 0.035228658640995975
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.3815028901734104,
185
+ "acc_stderr": 0.03703851193099521,
186
+ "acc_norm": 0.3815028901734104,
187
+ "acc_norm_stderr": 0.03703851193099521
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.35714285714285715,
191
+ "acc_stderr": 0.02467786284133278,
192
+ "acc_norm": 0.35714285714285715,
193
+ "acc_norm_stderr": 0.02467786284133278
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.3333333333333333,
197
+ "acc_stderr": 0.03942082639927213,
198
+ "acc_norm": 0.3333333333333333,
199
+ "acc_norm_stderr": 0.03942082639927213
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.47,
203
+ "acc_stderr": 0.05016135580465919,
204
+ "acc_norm": 0.47,
205
+ "acc_norm_stderr": 0.05016135580465919
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.54,
209
+ "acc_stderr": 0.05009082659620333,
210
+ "acc_norm": 0.54,
211
+ "acc_norm_stderr": 0.05009082659620333
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.44508670520231214,
215
+ "acc_stderr": 0.02675625512966377,
216
+ "acc_norm": 0.44508670520231214,
217
+ "acc_norm_stderr": 0.02675625512966377
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.34355828220858897,
221
+ "acc_stderr": 0.03731133519673893,
222
+ "acc_norm": 0.34355828220858897,
223
+ "acc_norm_stderr": 0.03731133519673893
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.37037037037037035,
227
+ "acc_stderr": 0.02686949074481525,
228
+ "acc_norm": 0.37037037037037035,
229
+ "acc_norm_stderr": 0.02686949074481525
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.33,
233
+ "acc_stderr": 0.04725815626252605,
234
+ "acc_norm": 0.33,
235
+ "acc_norm_stderr": 0.04725815626252605
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.44559585492227977,
239
+ "acc_stderr": 0.0358701498607566,
240
+ "acc_norm": 0.44559585492227977,
241
+ "acc_norm_stderr": 0.0358701498607566
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.2719298245614035,
245
+ "acc_stderr": 0.041857744240220575,
246
+ "acc_norm": 0.2719298245614035,
247
+ "acc_norm_stderr": 0.041857744240220575
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.3798165137614679,
251
+ "acc_stderr": 0.020808825617866244,
252
+ "acc_norm": 0.3798165137614679,
253
+ "acc_norm_stderr": 0.020808825617866244
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.3492063492063492,
257
+ "acc_stderr": 0.04263906892795132,
258
+ "acc_norm": 0.3492063492063492,
259
+ "acc_norm_stderr": 0.04263906892795132
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.4117647058823529,
263
+ "acc_stderr": 0.02818059632825929,
264
+ "acc_norm": 0.4117647058823529,
265
+ "acc_norm_stderr": 0.02818059632825929
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.42,
269
+ "acc_stderr": 0.049604496374885836,
270
+ "acc_norm": 0.42,
271
+ "acc_norm_stderr": 0.049604496374885836
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.5619834710743802,
275
+ "acc_stderr": 0.045291468044357915,
276
+ "acc_norm": 0.5619834710743802,
277
+ "acc_norm_stderr": 0.045291468044357915
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.34868421052631576,
281
+ "acc_stderr": 0.038781398887976125,
282
+ "acc_norm": 0.34868421052631576,
283
+ "acc_norm_stderr": 0.038781398887976125
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.3284313725490196,
287
+ "acc_stderr": 0.018999707383162666,
288
+ "acc_norm": 0.3284313725490196,
289
+ "acc_norm_stderr": 0.018999707383162666
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.2730496453900709,
293
+ "acc_stderr": 0.026577860943307857,
294
+ "acc_norm": 0.2730496453900709,
295
+ "acc_norm_stderr": 0.026577860943307857
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.2767857142857143,
299
+ "acc_stderr": 0.04246624336697627,
300
+ "acc_norm": 0.2767857142857143,
301
+ "acc_norm_stderr": 0.04246624336697627
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.4074074074074074,
305
+ "acc_stderr": 0.03350991604696043,
306
+ "acc_norm": 0.4074074074074074,
307
+ "acc_norm_stderr": 0.03350991604696043
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.23910614525139665,
311
+ "acc_stderr": 0.014265554192331149,
312
+ "acc_norm": 0.23910614525139665,
313
+ "acc_norm_stderr": 0.014265554192331149
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.33,
317
+ "acc_stderr": 0.047258156262526045,
318
+ "acc_norm": 0.33,
319
+ "acc_norm_stderr": 0.047258156262526045
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.4,
323
+ "acc_stderr": 0.04923659639173309,
324
+ "acc_norm": 0.4,
325
+ "acc_norm_stderr": 0.04923659639173309
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.4227941176470588,
329
+ "acc_stderr": 0.030008562845003483,
330
+ "acc_norm": 0.4227941176470588,
331
+ "acc_norm_stderr": 0.030008562845003483
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.3469387755102041,
335
+ "acc_stderr": 0.030472526026726492,
336
+ "acc_norm": 0.3469387755102041,
337
+ "acc_norm_stderr": 0.030472526026726492
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.4177215189873418,
341
+ "acc_stderr": 0.032103530322412685,
342
+ "acc_norm": 0.4177215189873418,
343
+ "acc_norm_stderr": 0.032103530322412685
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.3005215123859192,
347
+ "acc_stderr": 0.011709918883039124,
348
+ "acc_norm": 0.3005215123859192,
349
+ "acc_norm_stderr": 0.011709918883039124
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.3872549019607843,
353
+ "acc_stderr": 0.03418931233833344,
354
+ "acc_norm": 0.3872549019607843,
355
+ "acc_norm_stderr": 0.03418931233833344
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.43636363636363634,
359
+ "acc_stderr": 0.03872592983524753,
360
+ "acc_norm": 0.43636363636363634,
361
+ "acc_norm_stderr": 0.03872592983524753
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.3072215422276622,
365
+ "mc1_stderr": 0.016150201321323002,
366
+ "mc2": 0.4721418472000992,
367
+ "mc2_stderr": 0.01626625866283201
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.27863046044864226,
371
+ "acc_stderr": 0.01541373949434568,
372
+ "acc_norm": 0.3825265643447462,
373
+ "acc_norm_stderr": 0.016709165387228803
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "BioMistral/BioMistral-7B",
442
+ "model_sha": "9a11e1ffa817c211cbb52ee1fb312dc6b61b40a5",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
eval-results/EleutherAI/polyglot-ko-1.3b/EleutherAI_polyglot-ko-1.3b_result_2023-09-24 15_21_38.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 11
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 11
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.2235494880546075,
11
+ "acc_stderr": 0.012174896631202605,
12
+ "acc_norm": 0.2815699658703072,
13
+ "acc_norm_stderr": 0.013143376735009015
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.3345947022505477,
17
+ "acc_stderr": 0.004708842600177431,
18
+ "acc_norm": 0.4135630352519418,
19
+ "acc_norm_stderr": 0.0049146550633294974
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.27485380116959063,
23
+ "acc_stderr": 0.03424042924691585,
24
+ "acc_norm": 0.27485380116959063,
25
+ "acc_norm_stderr": 0.03424042924691585
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.27184466019417475,
29
+ "acc_stderr": 0.044052680241409216,
30
+ "acc_norm": 0.27184466019417475,
31
+ "acc_norm_stderr": 0.044052680241409216
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.26947637292464877,
35
+ "acc_stderr": 0.015866243073215065,
36
+ "acc_norm": 0.26947637292464877,
37
+ "acc_norm_stderr": 0.015866243073215065
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.26666666666666666,
41
+ "acc_stderr": 0.038201699145179055,
42
+ "acc_norm": 0.26666666666666666,
43
+ "acc_norm_stderr": 0.038201699145179055
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.3,
47
+ "acc_stderr": 0.046056618647183814,
48
+ "acc_norm": 0.3,
49
+ "acc_norm_stderr": 0.046056618647183814
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.2127659574468085,
53
+ "acc_stderr": 0.026754391348039783,
54
+ "acc_norm": 0.2127659574468085,
55
+ "acc_norm_stderr": 0.026754391348039783
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.24096385542168675,
59
+ "acc_stderr": 0.033293941190735296,
60
+ "acc_norm": 0.24096385542168675,
61
+ "acc_norm_stderr": 0.033293941190735296
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.2379421221864952,
65
+ "acc_stderr": 0.024185150647818707,
66
+ "acc_norm": 0.2379421221864952,
67
+ "acc_norm_stderr": 0.024185150647818707
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.2825112107623318,
71
+ "acc_stderr": 0.030216831011508766,
72
+ "acc_norm": 0.2825112107623318,
73
+ "acc_norm_stderr": 0.030216831011508766
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.21374045801526717,
77
+ "acc_stderr": 0.0359546161177469,
78
+ "acc_norm": 0.21374045801526717,
79
+ "acc_norm_stderr": 0.0359546161177469
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.24,
83
+ "acc_stderr": 0.042923469599092816,
84
+ "acc_norm": 0.24,
85
+ "acc_norm_stderr": 0.042923469599092816
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.2474747474747475,
89
+ "acc_stderr": 0.03074630074212451,
90
+ "acc_norm": 0.2474747474747475,
91
+ "acc_norm_stderr": 0.03074630074212451
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.22758620689655173,
95
+ "acc_stderr": 0.03493950380131184,
96
+ "acc_norm": 0.22758620689655173,
97
+ "acc_norm_stderr": 0.03493950380131184
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.22549019607843138,
101
+ "acc_stderr": 0.041583075330832865,
102
+ "acc_norm": 0.22549019607843138,
103
+ "acc_norm_stderr": 0.041583075330832865
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.31512605042016806,
107
+ "acc_stderr": 0.030176808288974337,
108
+ "acc_norm": 0.31512605042016806,
109
+ "acc_norm_stderr": 0.030176808288974337
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.2205128205128205,
113
+ "acc_stderr": 0.02102067268082791,
114
+ "acc_norm": 0.2205128205128205,
115
+ "acc_norm_stderr": 0.02102067268082791
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.18,
119
+ "acc_stderr": 0.038612291966536955,
120
+ "acc_norm": 0.18,
121
+ "acc_norm_stderr": 0.038612291966536955
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.31,
125
+ "acc_stderr": 0.04648231987117316,
126
+ "acc_norm": 0.31,
127
+ "acc_norm_stderr": 0.04648231987117316
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.25,
131
+ "acc_stderr": 0.04186091791394607,
132
+ "acc_norm": 0.25,
133
+ "acc_norm_stderr": 0.04186091791394607
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.2660098522167488,
137
+ "acc_stderr": 0.03108982600293752,
138
+ "acc_norm": 0.2660098522167488,
139
+ "acc_norm_stderr": 0.03108982600293752
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.3,
143
+ "acc_stderr": 0.02606936229533513,
144
+ "acc_norm": 0.3,
145
+ "acc_norm_stderr": 0.02606936229533513
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.23076923076923078,
149
+ "acc_stderr": 0.027601921381417607,
150
+ "acc_norm": 0.23076923076923078,
151
+ "acc_norm_stderr": 0.027601921381417607
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.25660377358490566,
155
+ "acc_stderr": 0.026880647889051968,
156
+ "acc_norm": 0.25660377358490566,
157
+ "acc_norm_stderr": 0.026880647889051968
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.2545454545454545,
161
+ "acc_stderr": 0.04172343038705383,
162
+ "acc_norm": 0.2545454545454545,
163
+ "acc_norm_stderr": 0.04172343038705383
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.2962962962962963,
167
+ "acc_stderr": 0.02784081149587194,
168
+ "acc_norm": 0.2962962962962963,
169
+ "acc_norm_stderr": 0.02784081149587194
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.304635761589404,
173
+ "acc_stderr": 0.03757949922943342,
174
+ "acc_norm": 0.304635761589404,
175
+ "acc_norm_stderr": 0.03757949922943342
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.25870646766169153,
179
+ "acc_stderr": 0.03096590312357303,
180
+ "acc_norm": 0.25870646766169153,
181
+ "acc_norm_stderr": 0.03096590312357303
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.2254335260115607,
185
+ "acc_stderr": 0.03186209851641144,
186
+ "acc_norm": 0.2254335260115607,
187
+ "acc_norm_stderr": 0.03186209851641144
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.2566137566137566,
191
+ "acc_stderr": 0.022494510767503154,
192
+ "acc_norm": 0.2566137566137566,
193
+ "acc_norm_stderr": 0.022494510767503154
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.2638888888888889,
197
+ "acc_stderr": 0.03685651095897532,
198
+ "acc_norm": 0.2638888888888889,
199
+ "acc_norm_stderr": 0.03685651095897532
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.23,
203
+ "acc_stderr": 0.04229525846816505,
204
+ "acc_norm": 0.23,
205
+ "acc_norm_stderr": 0.04229525846816505
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.22,
209
+ "acc_stderr": 0.04163331998932269,
210
+ "acc_norm": 0.22,
211
+ "acc_norm_stderr": 0.04163331998932269
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.24855491329479767,
215
+ "acc_stderr": 0.023267528432100174,
216
+ "acc_norm": 0.24855491329479767,
217
+ "acc_norm_stderr": 0.023267528432100174
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.31901840490797545,
221
+ "acc_stderr": 0.03661997551073836,
222
+ "acc_norm": 0.31901840490797545,
223
+ "acc_norm_stderr": 0.03661997551073836
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.2623456790123457,
227
+ "acc_stderr": 0.024477222856135114,
228
+ "acc_norm": 0.2623456790123457,
229
+ "acc_norm_stderr": 0.024477222856135114
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.25,
233
+ "acc_stderr": 0.04351941398892446,
234
+ "acc_norm": 0.25,
235
+ "acc_norm_stderr": 0.04351941398892446
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.33678756476683935,
239
+ "acc_stderr": 0.03410780251836184,
240
+ "acc_norm": 0.33678756476683935,
241
+ "acc_norm_stderr": 0.03410780251836184
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.20175438596491227,
245
+ "acc_stderr": 0.037752050135836386,
246
+ "acc_norm": 0.20175438596491227,
247
+ "acc_norm_stderr": 0.037752050135836386
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.24220183486238533,
251
+ "acc_stderr": 0.01836817630659862,
252
+ "acc_norm": 0.24220183486238533,
253
+ "acc_norm_stderr": 0.01836817630659862
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.23015873015873015,
257
+ "acc_stderr": 0.03764950879790606,
258
+ "acc_norm": 0.23015873015873015,
259
+ "acc_norm_stderr": 0.03764950879790606
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.23529411764705882,
263
+ "acc_stderr": 0.024288619466046102,
264
+ "acc_norm": 0.23529411764705882,
265
+ "acc_norm_stderr": 0.024288619466046102
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.18,
269
+ "acc_stderr": 0.03861229196653695,
270
+ "acc_norm": 0.18,
271
+ "acc_norm_stderr": 0.03861229196653695
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.256198347107438,
275
+ "acc_stderr": 0.039849796533028704,
276
+ "acc_norm": 0.256198347107438,
277
+ "acc_norm_stderr": 0.039849796533028704
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.21710526315789475,
281
+ "acc_stderr": 0.033550453048829226,
282
+ "acc_norm": 0.21710526315789475,
283
+ "acc_norm_stderr": 0.033550453048829226
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.24019607843137256,
287
+ "acc_stderr": 0.01728276069516743,
288
+ "acc_norm": 0.24019607843137256,
289
+ "acc_norm_stderr": 0.01728276069516743
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.2553191489361702,
293
+ "acc_stderr": 0.02601199293090201,
294
+ "acc_norm": 0.2553191489361702,
295
+ "acc_norm_stderr": 0.02601199293090201
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.21428571428571427,
299
+ "acc_stderr": 0.03894641120044793,
300
+ "acc_norm": 0.21428571428571427,
301
+ "acc_norm_stderr": 0.03894641120044793
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.46296296296296297,
305
+ "acc_stderr": 0.03400603625538272,
306
+ "acc_norm": 0.46296296296296297,
307
+ "acc_norm_stderr": 0.03400603625538272
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.24692737430167597,
311
+ "acc_stderr": 0.014422292204808852,
312
+ "acc_norm": 0.24692737430167597,
313
+ "acc_norm_stderr": 0.014422292204808852
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.25,
317
+ "acc_stderr": 0.04351941398892446,
318
+ "acc_norm": 0.25,
319
+ "acc_norm_stderr": 0.04351941398892446
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.3,
323
+ "acc_stderr": 0.046056618647183814,
324
+ "acc_norm": 0.3,
325
+ "acc_norm_stderr": 0.046056618647183814
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.4411764705882353,
329
+ "acc_stderr": 0.030161911930767102,
330
+ "acc_norm": 0.4411764705882353,
331
+ "acc_norm_stderr": 0.030161911930767102
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.3795918367346939,
335
+ "acc_stderr": 0.03106721126287249,
336
+ "acc_norm": 0.3795918367346939,
337
+ "acc_norm_stderr": 0.03106721126287249
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.2109704641350211,
341
+ "acc_stderr": 0.02655837250266192,
342
+ "acc_norm": 0.2109704641350211,
343
+ "acc_norm_stderr": 0.02655837250266192
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.23468057366362452,
347
+ "acc_stderr": 0.010824026872449344,
348
+ "acc_norm": 0.23468057366362452,
349
+ "acc_norm_stderr": 0.010824026872449344
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.25,
353
+ "acc_stderr": 0.03039153369274154,
354
+ "acc_norm": 0.25,
355
+ "acc_norm_stderr": 0.03039153369274154
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.22424242424242424,
359
+ "acc_stderr": 0.03256866661681102,
360
+ "acc_norm": 0.22424242424242424,
361
+ "acc_norm_stderr": 0.03256866661681102
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.25091799265605874,
365
+ "mc1_stderr": 0.015176985027707682,
366
+ "mc2": 0.4116568832959107,
367
+ "mc2_stderr": 0.015044504977529799
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.27744982290436837,
371
+ "acc_stderr": 0.015393630236605975,
372
+ "acc_norm": 0.3400236127508855,
373
+ "acc_norm_stderr": 0.016286717220737674
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "EleutherAI/polyglot-ko-1.3b",
442
+ "model_sha": "557e162cf6e944fdbae05bab2e45d066a125eacb",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
eval-results/HuggingFaceH4/.DS_Store ADDED
Binary file (6.15 kB)
eval-results/HuggingFaceH4/zephyr-7b-beta/result.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 8
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 8
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.33532423208191126,
11
+ "acc_stderr": 0.01379618294778556,
12
+ "acc_norm": 0.3848122866894198,
13
+ "acc_norm_stderr": 0.014218371065251112
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.35480979884485164,
17
+ "acc_stderr": 0.004774778180345192,
18
+ "acc_norm": 0.44911372236606256,
19
+ "acc_norm_stderr": 0.00496387293685794
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.45614035087719296,
23
+ "acc_stderr": 0.03820042586602966,
24
+ "acc_norm": 0.45614035087719296,
25
+ "acc_norm_stderr": 0.03820042586602966
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.6019417475728155,
29
+ "acc_stderr": 0.04846748253977238,
30
+ "acc_norm": 0.6019417475728155,
31
+ "acc_norm_stderr": 0.04846748253977238
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.41762452107279696,
35
+ "acc_stderr": 0.017635637326951534,
36
+ "acc_norm": 0.41762452107279696,
37
+ "acc_norm_stderr": 0.017635637326951534
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.34074074074074073,
41
+ "acc_stderr": 0.040943762699967946,
42
+ "acc_norm": 0.34074074074074073,
43
+ "acc_norm_stderr": 0.040943762699967946
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.19,
47
+ "acc_stderr": 0.03942772444036623,
48
+ "acc_norm": 0.19,
49
+ "acc_norm_stderr": 0.03942772444036623
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.2978723404255319,
53
+ "acc_stderr": 0.029896145682095462,
54
+ "acc_norm": 0.2978723404255319,
55
+ "acc_norm_stderr": 0.029896145682095462
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.3614457831325301,
59
+ "acc_stderr": 0.0374005938202932,
60
+ "acc_norm": 0.3614457831325301,
61
+ "acc_norm_stderr": 0.0374005938202932
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.4758842443729904,
65
+ "acc_stderr": 0.028365041542564584,
66
+ "acc_norm": 0.4758842443729904,
67
+ "acc_norm_stderr": 0.028365041542564584
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.3811659192825112,
71
+ "acc_stderr": 0.032596251184168284,
72
+ "acc_norm": 0.3811659192825112,
73
+ "acc_norm_stderr": 0.032596251184168284
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.3511450381679389,
77
+ "acc_stderr": 0.04186445163013751,
78
+ "acc_norm": 0.3511450381679389,
79
+ "acc_norm_stderr": 0.04186445163013751
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.27,
83
+ "acc_stderr": 0.0446196043338474,
84
+ "acc_norm": 0.27,
85
+ "acc_norm_stderr": 0.0446196043338474
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.494949494949495,
89
+ "acc_stderr": 0.035621707606254015,
90
+ "acc_norm": 0.494949494949495,
91
+ "acc_norm_stderr": 0.035621707606254015
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.4,
95
+ "acc_stderr": 0.04082482904638628,
96
+ "acc_norm": 0.4,
97
+ "acc_norm_stderr": 0.04082482904638628
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.3137254901960784,
101
+ "acc_stderr": 0.04617034827006717,
102
+ "acc_norm": 0.3137254901960784,
103
+ "acc_norm_stderr": 0.04617034827006717
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.4957983193277311,
107
+ "acc_stderr": 0.0324773433444811,
108
+ "acc_norm": 0.4957983193277311,
109
+ "acc_norm_stderr": 0.0324773433444811
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.4256410256410256,
113
+ "acc_stderr": 0.025069094387296546,
114
+ "acc_norm": 0.4256410256410256,
115
+ "acc_norm_stderr": 0.025069094387296546
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.59,
119
+ "acc_stderr": 0.049431107042371025,
120
+ "acc_norm": 0.59,
121
+ "acc_norm_stderr": 0.049431107042371025
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.29,
125
+ "acc_stderr": 0.045604802157206845,
126
+ "acc_norm": 0.29,
127
+ "acc_norm_stderr": 0.045604802157206845
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.4537037037037037,
131
+ "acc_stderr": 0.04812917324536821,
132
+ "acc_norm": 0.4537037037037037,
133
+ "acc_norm_stderr": 0.04812917324536821
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.35467980295566504,
137
+ "acc_stderr": 0.03366124489051449,
138
+ "acc_norm": 0.35467980295566504,
139
+ "acc_norm_stderr": 0.03366124489051449
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.4290322580645161,
143
+ "acc_stderr": 0.02815603653823321,
144
+ "acc_norm": 0.4290322580645161,
145
+ "acc_norm_stderr": 0.02815603653823321
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.6666666666666666,
149
+ "acc_stderr": 0.03088273697413865,
150
+ "acc_norm": 0.6666666666666666,
151
+ "acc_norm_stderr": 0.03088273697413865
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.4188679245283019,
155
+ "acc_stderr": 0.03036505082911521,
156
+ "acc_norm": 0.4188679245283019,
157
+ "acc_norm_stderr": 0.03036505082911521
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.42727272727272725,
161
+ "acc_stderr": 0.04738198703545483,
162
+ "acc_norm": 0.42727272727272725,
163
+ "acc_norm_stderr": 0.04738198703545483
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.34814814814814815,
167
+ "acc_stderr": 0.029045600290616258,
168
+ "acc_norm": 0.34814814814814815,
169
+ "acc_norm_stderr": 0.029045600290616258
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.2913907284768212,
173
+ "acc_stderr": 0.037101857261199946,
174
+ "acc_norm": 0.2913907284768212,
175
+ "acc_norm_stderr": 0.037101857261199946
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.5174129353233831,
179
+ "acc_stderr": 0.03533389234739245,
180
+ "acc_norm": 0.5174129353233831,
181
+ "acc_norm_stderr": 0.03533389234739245
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.37572254335260113,
185
+ "acc_stderr": 0.03692820767264867,
186
+ "acc_norm": 0.37572254335260113,
187
+ "acc_norm_stderr": 0.03692820767264867
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.3492063492063492,
191
+ "acc_stderr": 0.024552292209342658,
192
+ "acc_norm": 0.3492063492063492,
193
+ "acc_norm_stderr": 0.024552292209342658
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.3333333333333333,
197
+ "acc_stderr": 0.039420826399272135,
198
+ "acc_norm": 0.3333333333333333,
199
+ "acc_norm_stderr": 0.039420826399272135
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.35,
203
+ "acc_stderr": 0.04793724854411019,
204
+ "acc_norm": 0.35,
205
+ "acc_norm_stderr": 0.04793724854411019
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.49,
209
+ "acc_stderr": 0.05024183937956913,
210
+ "acc_norm": 0.49,
211
+ "acc_norm_stderr": 0.05024183937956913
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.47398843930635837,
215
+ "acc_stderr": 0.026882643434022885,
216
+ "acc_norm": 0.47398843930635837,
217
+ "acc_norm_stderr": 0.026882643434022885
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.44171779141104295,
221
+ "acc_stderr": 0.039015918258361836,
222
+ "acc_norm": 0.44171779141104295,
223
+ "acc_norm_stderr": 0.039015918258361836
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.42592592592592593,
227
+ "acc_stderr": 0.027513747284379424,
228
+ "acc_norm": 0.42592592592592593,
229
+ "acc_norm_stderr": 0.027513747284379424
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.33,
233
+ "acc_stderr": 0.04725815626252606,
234
+ "acc_norm": 0.33,
235
+ "acc_norm_stderr": 0.04725815626252606
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.5129533678756477,
239
+ "acc_stderr": 0.0360722806104775,
240
+ "acc_norm": 0.5129533678756477,
241
+ "acc_norm_stderr": 0.0360722806104775
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.24561403508771928,
245
+ "acc_stderr": 0.0404933929774814,
246
+ "acc_norm": 0.24561403508771928,
247
+ "acc_norm_stderr": 0.0404933929774814
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.47155963302752296,
251
+ "acc_stderr": 0.02140261569734804,
252
+ "acc_norm": 0.47155963302752296,
253
+ "acc_norm_stderr": 0.02140261569734804
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.36507936507936506,
257
+ "acc_stderr": 0.04306241259127152,
258
+ "acc_norm": 0.36507936507936506,
259
+ "acc_norm_stderr": 0.04306241259127152
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.4117647058823529,
263
+ "acc_stderr": 0.028180596328259297,
264
+ "acc_norm": 0.4117647058823529,
265
+ "acc_norm_stderr": 0.028180596328259297
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.44,
269
+ "acc_stderr": 0.04988876515698589,
270
+ "acc_norm": 0.44,
271
+ "acc_norm_stderr": 0.04988876515698589
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.5867768595041323,
275
+ "acc_stderr": 0.04495087843548408,
276
+ "acc_norm": 0.5867768595041323,
277
+ "acc_norm_stderr": 0.04495087843548408
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.40131578947368424,
281
+ "acc_stderr": 0.03988903703336284,
282
+ "acc_norm": 0.40131578947368424,
283
+ "acc_norm_stderr": 0.03988903703336284
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.32679738562091504,
287
+ "acc_stderr": 0.018975427920507215,
288
+ "acc_norm": 0.32679738562091504,
289
+ "acc_norm_stderr": 0.018975427920507215
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.3333333333333333,
293
+ "acc_stderr": 0.02812163604063988,
294
+ "acc_norm": 0.3333333333333333,
295
+ "acc_norm_stderr": 0.02812163604063988
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.3392857142857143,
299
+ "acc_stderr": 0.04493949068613539,
300
+ "acc_norm": 0.3392857142857143,
301
+ "acc_norm_stderr": 0.04493949068613539
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.41203703703703703,
305
+ "acc_stderr": 0.03356787758160835,
306
+ "acc_norm": 0.41203703703703703,
307
+ "acc_norm_stderr": 0.03356787758160835
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.329608938547486,
311
+ "acc_stderr": 0.015721531075183884,
312
+ "acc_norm": 0.329608938547486,
313
+ "acc_norm_stderr": 0.015721531075183884
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.39,
317
+ "acc_stderr": 0.04902071300001975,
318
+ "acc_norm": 0.39,
319
+ "acc_norm_stderr": 0.04902071300001975
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.61,
323
+ "acc_stderr": 0.04902071300001975,
324
+ "acc_norm": 0.61,
325
+ "acc_norm_stderr": 0.04902071300001975
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.375,
329
+ "acc_stderr": 0.029408372932278746,
330
+ "acc_norm": 0.375,
331
+ "acc_norm_stderr": 0.029408372932278746
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.43673469387755104,
335
+ "acc_stderr": 0.03175195237583322,
336
+ "acc_norm": 0.43673469387755104,
337
+ "acc_norm_stderr": 0.03175195237583322
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.4810126582278481,
341
+ "acc_stderr": 0.03252375148090448,
342
+ "acc_norm": 0.4810126582278481,
343
+ "acc_norm_stderr": 0.03252375148090448
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.29791395045632335,
347
+ "acc_stderr": 0.011680717340400059,
348
+ "acc_norm": 0.29791395045632335,
349
+ "acc_norm_stderr": 0.011680717340400059
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.29411764705882354,
353
+ "acc_stderr": 0.03198001660115072,
354
+ "acc_norm": 0.29411764705882354,
355
+ "acc_norm_stderr": 0.03198001660115072
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.30303030303030304,
359
+ "acc_stderr": 0.03588624800091707,
360
+ "acc_norm": 0.30303030303030304,
361
+ "acc_norm_stderr": 0.03588624800091707
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.3317013463892289,
365
+ "mc1_stderr": 0.01648214881024147,
366
+ "mc2": 0.5171680571717291,
367
+ "mc2_stderr": 0.01606077987901482
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.39787485242030696,
371
+ "acc_stderr": 0.01682795905473339,
372
+ "acc_norm": 0.4014167650531287,
373
+ "acc_norm_stderr": 0.01685290785872906
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "HuggingFaceH4/zephyr-7b-beta",
442
+ "model_sha": "3bac358730f8806e5c3dc7c7e19eb36e045bf720",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
eval-results/README.md ADDED
@@ -0,0 +1,3 @@
+ ---
+ license: apache-2.0
+ ---
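Each result file added in this commit shares the same top-level schema: a `results` map of per-task scores, a `versions` map, and a `config_general` block with the model name, revision, and dtype. As a minimal sketch of how such a file could be consumed (the chosen path and the simple averaging are illustrative assumptions, not the Space's actual aggregation logic in `src/leaderboard/read_evals.py`), one might do:

```python
import json
from pathlib import Path

# Illustrative path: any of the result files added in this commit shares the schema.
result_path = Path("eval-results/HuggingFaceH4/zephyr-7b-beta/result.json")

with result_path.open() as f:
    data = json.load(f)

# Collect per-task accuracies; this skips the daily/quarterly counters and
# truthfulqa_mc, which reports mc1/mc2 rather than acc.
accs = {
    task: scores["acc"]
    for task, scores in data["results"].items()
    if task.startswith("harness|") and "acc" in scores
}

mean_acc = sum(accs.values()) / len(accs)
print(data["config_general"]["model_name"],
      f"mean acc over {len(accs)} tasks: {mean_acc:.4f}")
```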
eval-results/nlpai-lab/KULLM3/result.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 6
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 6
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.42918088737201365,
11
+ "acc_stderr": 0.014464085894870651,
12
+ "acc_norm": 0.46501706484641636,
13
+ "acc_norm_stderr": 0.014575583922019672
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.445628360884286,
17
+ "acc_stderr": 0.004960191341430244,
18
+ "acc_norm": 0.589523999203346,
19
+ "acc_norm_stderr": 0.004909148239488273
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.6432748538011696,
23
+ "acc_stderr": 0.03674013002860954,
24
+ "acc_norm": 0.6432748538011696,
25
+ "acc_norm_stderr": 0.03674013002860954
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.6116504854368932,
29
+ "acc_stderr": 0.04825729337356389,
30
+ "acc_norm": 0.6116504854368932,
31
+ "acc_norm_stderr": 0.04825729337356389
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.6155810983397191,
35
+ "acc_stderr": 0.01739568874281962,
36
+ "acc_norm": 0.6155810983397191,
37
+ "acc_norm_stderr": 0.01739568874281962
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.4962962962962963,
41
+ "acc_stderr": 0.04319223625811331,
42
+ "acc_norm": 0.4962962962962963,
43
+ "acc_norm_stderr": 0.04319223625811331
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.26,
47
+ "acc_stderr": 0.04408440022768077,
48
+ "acc_norm": 0.26,
49
+ "acc_norm_stderr": 0.04408440022768077
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.4553191489361702,
53
+ "acc_stderr": 0.03255525359340354,
54
+ "acc_norm": 0.4553191489361702,
55
+ "acc_norm_stderr": 0.03255525359340354
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.5180722891566265,
59
+ "acc_stderr": 0.038899512528272166,
60
+ "acc_norm": 0.5180722891566265,
61
+ "acc_norm_stderr": 0.038899512528272166
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.5755627009646302,
65
+ "acc_stderr": 0.028071928247946205,
66
+ "acc_norm": 0.5755627009646302,
67
+ "acc_norm_stderr": 0.028071928247946205
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.5650224215246636,
71
+ "acc_stderr": 0.033272833702713445,
72
+ "acc_norm": 0.5650224215246636,
73
+ "acc_norm_stderr": 0.033272833702713445
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.5877862595419847,
77
+ "acc_stderr": 0.04317171194870255,
78
+ "acc_norm": 0.5877862595419847,
79
+ "acc_norm_stderr": 0.04317171194870255
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.5,
83
+ "acc_stderr": 0.050251890762960605,
84
+ "acc_norm": 0.5,
85
+ "acc_norm_stderr": 0.050251890762960605
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.6515151515151515,
89
+ "acc_stderr": 0.033948539651564025,
90
+ "acc_norm": 0.6515151515151515,
91
+ "acc_norm_stderr": 0.033948539651564025
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.503448275862069,
95
+ "acc_stderr": 0.04166567577101579,
96
+ "acc_norm": 0.503448275862069,
97
+ "acc_norm_stderr": 0.04166567577101579
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.2549019607843137,
101
+ "acc_stderr": 0.043364327079931785,
102
+ "acc_norm": 0.2549019607843137,
103
+ "acc_norm_stderr": 0.043364327079931785
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.5756302521008403,
107
+ "acc_stderr": 0.03210479051015776,
108
+ "acc_norm": 0.5756302521008403,
109
+ "acc_norm_stderr": 0.03210479051015776
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.541025641025641,
113
+ "acc_stderr": 0.025265525491284295,
114
+ "acc_norm": 0.541025641025641,
115
+ "acc_norm_stderr": 0.025265525491284295
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.54,
119
+ "acc_stderr": 0.05009082659620332,
120
+ "acc_norm": 0.54,
121
+ "acc_norm_stderr": 0.05009082659620332
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.31,
125
+ "acc_stderr": 0.04648231987117316,
126
+ "acc_norm": 0.31,
127
+ "acc_norm_stderr": 0.04648231987117316
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.5555555555555556,
131
+ "acc_stderr": 0.04803752235190192,
132
+ "acc_norm": 0.5555555555555556,
133
+ "acc_norm_stderr": 0.04803752235190192
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.3842364532019704,
137
+ "acc_stderr": 0.0342239856565755,
138
+ "acc_norm": 0.3842364532019704,
139
+ "acc_norm_stderr": 0.0342239856565755
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.5774193548387097,
143
+ "acc_stderr": 0.02810096472427264,
144
+ "acc_norm": 0.5774193548387097,
145
+ "acc_norm_stderr": 0.02810096472427264
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.7777777777777778,
149
+ "acc_stderr": 0.027236013946196673,
150
+ "acc_norm": 0.7777777777777778,
151
+ "acc_norm_stderr": 0.027236013946196673
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.4981132075471698,
155
+ "acc_stderr": 0.030772653642075657,
156
+ "acc_norm": 0.4981132075471698,
157
+ "acc_norm_stderr": 0.030772653642075657
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.5272727272727272,
161
+ "acc_stderr": 0.04782001791380061,
162
+ "acc_norm": 0.5272727272727272,
163
+ "acc_norm_stderr": 0.04782001791380061
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.25555555555555554,
167
+ "acc_stderr": 0.026593939101844082,
168
+ "acc_norm": 0.25555555555555554,
169
+ "acc_norm_stderr": 0.026593939101844082
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.33774834437086093,
173
+ "acc_stderr": 0.038615575462551684,
174
+ "acc_norm": 0.33774834437086093,
175
+ "acc_norm_stderr": 0.038615575462551684
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.7064676616915423,
179
+ "acc_stderr": 0.032200241045342054,
180
+ "acc_norm": 0.7064676616915423,
181
+ "acc_norm_stderr": 0.032200241045342054
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.4797687861271676,
185
+ "acc_stderr": 0.03809342081273958,
186
+ "acc_norm": 0.4797687861271676,
187
+ "acc_norm_stderr": 0.03809342081273958
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.38095238095238093,
191
+ "acc_stderr": 0.025010749116137602,
192
+ "acc_norm": 0.38095238095238093,
193
+ "acc_norm_stderr": 0.025010749116137602
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.4236111111111111,
197
+ "acc_stderr": 0.041321250197233685,
198
+ "acc_norm": 0.4236111111111111,
199
+ "acc_norm_stderr": 0.041321250197233685
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.31,
203
+ "acc_stderr": 0.04648231987117316,
204
+ "acc_norm": 0.31,
205
+ "acc_norm_stderr": 0.04648231987117316
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.71,
209
+ "acc_stderr": 0.04560480215720683,
210
+ "acc_norm": 0.71,
211
+ "acc_norm_stderr": 0.04560480215720683
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.5751445086705202,
215
+ "acc_stderr": 0.026613350840261733,
216
+ "acc_norm": 0.5751445086705202,
217
+ "acc_norm_stderr": 0.026613350840261733
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.5030674846625767,
221
+ "acc_stderr": 0.03928297078179662,
222
+ "acc_norm": 0.5030674846625767,
223
+ "acc_norm_stderr": 0.03928297078179662
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.5370370370370371,
227
+ "acc_stderr": 0.027744313443376536,
228
+ "acc_norm": 0.5370370370370371,
229
+ "acc_norm_stderr": 0.027744313443376536
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.33,
233
+ "acc_stderr": 0.04725815626252606,
234
+ "acc_norm": 0.33,
235
+ "acc_norm_stderr": 0.04725815626252606
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.6217616580310881,
239
+ "acc_stderr": 0.034998072761933376,
240
+ "acc_norm": 0.6217616580310881,
241
+ "acc_norm_stderr": 0.034998072761933376
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.37719298245614036,
245
+ "acc_stderr": 0.04559522141958216,
246
+ "acc_norm": 0.37719298245614036,
247
+ "acc_norm_stderr": 0.04559522141958216
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.6385321100917432,
251
+ "acc_stderr": 0.02059808200993736,
252
+ "acc_norm": 0.6385321100917432,
253
+ "acc_norm_stderr": 0.02059808200993736
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.4126984126984127,
257
+ "acc_stderr": 0.04403438954768177,
258
+ "acc_norm": 0.4126984126984127,
259
+ "acc_norm_stderr": 0.04403438954768177
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.5261437908496732,
263
+ "acc_stderr": 0.028590752958852387,
264
+ "acc_norm": 0.5261437908496732,
265
+ "acc_norm_stderr": 0.028590752958852387
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.57,
269
+ "acc_stderr": 0.049756985195624284,
270
+ "acc_norm": 0.57,
271
+ "acc_norm_stderr": 0.049756985195624284
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.7520661157024794,
275
+ "acc_stderr": 0.03941897526516304,
276
+ "acc_norm": 0.7520661157024794,
277
+ "acc_norm_stderr": 0.03941897526516304
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.5789473684210527,
281
+ "acc_stderr": 0.040179012759817494,
282
+ "acc_norm": 0.5789473684210527,
283
+ "acc_norm_stderr": 0.040179012759817494
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.4738562091503268,
287
+ "acc_stderr": 0.020200164564804588,
288
+ "acc_norm": 0.4738562091503268,
289
+ "acc_norm_stderr": 0.020200164564804588
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.3404255319148936,
293
+ "acc_stderr": 0.02826765748265013,
294
+ "acc_norm": 0.3404255319148936,
295
+ "acc_norm_stderr": 0.02826765748265013
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.38392857142857145,
299
+ "acc_stderr": 0.046161430750285455,
300
+ "acc_norm": 0.38392857142857145,
301
+ "acc_norm_stderr": 0.046161430750285455
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.4675925925925926,
305
+ "acc_stderr": 0.03402801581358966,
306
+ "acc_norm": 0.4675925925925926,
307
+ "acc_norm_stderr": 0.03402801581358966
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.21675977653631284,
311
+ "acc_stderr": 0.013780598486443363,
312
+ "acc_norm": 0.21675977653631284,
313
+ "acc_norm_stderr": 0.013780598486443363
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.39,
317
+ "acc_stderr": 0.04902071300001975,
318
+ "acc_norm": 0.39,
319
+ "acc_norm_stderr": 0.04902071300001975
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.71,
323
+ "acc_stderr": 0.04560480215720684,
324
+ "acc_norm": 0.71,
325
+ "acc_norm_stderr": 0.04560480215720684
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.4411764705882353,
329
+ "acc_stderr": 0.0301619119307671,
330
+ "acc_norm": 0.4411764705882353,
331
+ "acc_norm_stderr": 0.0301619119307671
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.6285714285714286,
335
+ "acc_stderr": 0.03093285879278986,
336
+ "acc_norm": 0.6285714285714286,
337
+ "acc_norm_stderr": 0.03093285879278986
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.70042194092827,
341
+ "acc_stderr": 0.029818024749753095,
342
+ "acc_norm": 0.70042194092827,
343
+ "acc_norm_stderr": 0.029818024749753095
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.378748370273794,
347
+ "acc_stderr": 0.012389052105003741,
348
+ "acc_norm": 0.378748370273794,
349
+ "acc_norm_stderr": 0.012389052105003741
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.6225490196078431,
353
+ "acc_stderr": 0.03402272044340703,
354
+ "acc_norm": 0.6225490196078431,
355
+ "acc_norm_stderr": 0.03402272044340703
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.6666666666666666,
359
+ "acc_stderr": 0.03681050869161549,
360
+ "acc_norm": 0.6666666666666666,
361
+ "acc_norm_stderr": 0.03681050869161549
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.33659730722154224,
365
+ "mc1_stderr": 0.016542412809494877,
366
+ "mc2": 0.49995145184296846,
367
+ "mc2_stderr": 0.015887726098900913
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.564344746162928,
371
+ "acc_stderr": 0.017047415229476316,
372
+ "acc_norm": 0.6068476977567887,
373
+ "acc_norm_stderr": 0.016793262801287068
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "nlpai-lab/KULLM3",
442
+ "model_sha": "5a6bcd0fc7f240460eb6d57016f7b4060bc1f43b",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
eval-results/x2bee/POLAR-14B-DPO-v1.3/result.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 4
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 4
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.7465870307167235,
11
+ "acc_stderr": 0.012710896778378604,
12
+ "acc_norm": 0.7807167235494881,
13
+ "acc_norm_stderr": 0.012091245787615728
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.6385182234614618,
17
+ "acc_stderr": 0.004794478426382617,
18
+ "acc_norm": 0.7561242780322645,
19
+ "acc_norm_stderr": 0.004285410130466119
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.6900584795321637,
23
+ "acc_stderr": 0.035469769593931624,
24
+ "acc_norm": 0.6900584795321637,
25
+ "acc_norm_stderr": 0.035469769593931624
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.6601941747572816,
29
+ "acc_stderr": 0.046897659372781335,
30
+ "acc_norm": 0.6601941747572816,
31
+ "acc_norm_stderr": 0.046897659372781335
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.6845466155810983,
35
+ "acc_stderr": 0.016617501738763408,
36
+ "acc_norm": 0.6845466155810983,
37
+ "acc_norm_stderr": 0.016617501738763408
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.48148148148148145,
41
+ "acc_stderr": 0.04316378599511324,
42
+ "acc_norm": 0.48148148148148145,
43
+ "acc_norm_stderr": 0.04316378599511324
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.33,
47
+ "acc_stderr": 0.047258156262526045,
48
+ "acc_norm": 0.33,
49
+ "acc_norm_stderr": 0.047258156262526045
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.46808510638297873,
53
+ "acc_stderr": 0.03261936918467383,
54
+ "acc_norm": 0.46808510638297873,
55
+ "acc_norm_stderr": 0.03261936918467383
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.4759036144578313,
59
+ "acc_stderr": 0.03887971849597264,
60
+ "acc_norm": 0.4759036144578313,
61
+ "acc_norm_stderr": 0.03887971849597264
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.6334405144694534,
65
+ "acc_stderr": 0.02736807824397163,
66
+ "acc_norm": 0.6334405144694534,
67
+ "acc_norm_stderr": 0.02736807824397163
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.6681614349775785,
71
+ "acc_stderr": 0.03160295143776679,
72
+ "acc_norm": 0.6681614349775785,
73
+ "acc_norm_stderr": 0.03160295143776679
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.6030534351145038,
77
+ "acc_stderr": 0.04291135671009224,
78
+ "acc_norm": 0.6030534351145038,
79
+ "acc_norm_stderr": 0.04291135671009224
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.51,
83
+ "acc_stderr": 0.05024183937956911,
84
+ "acc_norm": 0.51,
85
+ "acc_norm_stderr": 0.05024183937956911
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.7222222222222222,
89
+ "acc_stderr": 0.03191178226713547,
90
+ "acc_norm": 0.7222222222222222,
91
+ "acc_norm_stderr": 0.03191178226713547
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.47586206896551725,
95
+ "acc_stderr": 0.041618085035015295,
96
+ "acc_norm": 0.47586206896551725,
97
+ "acc_norm_stderr": 0.041618085035015295
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.2549019607843137,
101
+ "acc_stderr": 0.04336432707993178,
102
+ "acc_norm": 0.2549019607843137,
103
+ "acc_norm_stderr": 0.04336432707993178
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.592436974789916,
107
+ "acc_stderr": 0.031918633744784666,
108
+ "acc_norm": 0.592436974789916,
109
+ "acc_norm_stderr": 0.031918633744784666
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.5948717948717949,
113
+ "acc_stderr": 0.024890471769938142,
114
+ "acc_norm": 0.5948717948717949,
115
+ "acc_norm_stderr": 0.024890471769938142
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.66,
119
+ "acc_stderr": 0.04760952285695237,
120
+ "acc_norm": 0.66,
121
+ "acc_norm_stderr": 0.04760952285695237
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.27,
125
+ "acc_stderr": 0.04461960433384739,
126
+ "acc_norm": 0.27,
127
+ "acc_norm_stderr": 0.04461960433384739
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.6388888888888888,
131
+ "acc_stderr": 0.04643454608906275,
132
+ "acc_norm": 0.6388888888888888,
133
+ "acc_norm_stderr": 0.04643454608906275
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.4433497536945813,
137
+ "acc_stderr": 0.034953345821629345,
138
+ "acc_norm": 0.4433497536945813,
139
+ "acc_norm_stderr": 0.034953345821629345
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.5806451612903226,
143
+ "acc_stderr": 0.028071588901091838,
144
+ "acc_norm": 0.5806451612903226,
145
+ "acc_norm_stderr": 0.028071588901091838
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.811965811965812,
149
+ "acc_stderr": 0.025598193686652254,
150
+ "acc_norm": 0.811965811965812,
151
+ "acc_norm_stderr": 0.025598193686652254
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.5169811320754717,
155
+ "acc_stderr": 0.030755120364119898,
156
+ "acc_norm": 0.5169811320754717,
157
+ "acc_norm_stderr": 0.030755120364119898
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.5818181818181818,
161
+ "acc_stderr": 0.04724577405731573,
162
+ "acc_norm": 0.5818181818181818,
163
+ "acc_norm_stderr": 0.04724577405731573
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.3888888888888889,
167
+ "acc_stderr": 0.029723278961476664,
168
+ "acc_norm": 0.3888888888888889,
169
+ "acc_norm_stderr": 0.029723278961476664
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.3708609271523179,
173
+ "acc_stderr": 0.03943966699183629,
174
+ "acc_norm": 0.3708609271523179,
175
+ "acc_norm_stderr": 0.03943966699183629
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.6666666666666666,
179
+ "acc_stderr": 0.033333333333333326,
180
+ "acc_norm": 0.6666666666666666,
181
+ "acc_norm_stderr": 0.033333333333333326
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.47398843930635837,
185
+ "acc_stderr": 0.038073017265045125,
186
+ "acc_norm": 0.47398843930635837,
187
+ "acc_norm_stderr": 0.038073017265045125
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.42328042328042326,
191
+ "acc_stderr": 0.025446365634406793,
192
+ "acc_norm": 0.42328042328042326,
193
+ "acc_norm_stderr": 0.025446365634406793
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.5625,
197
+ "acc_stderr": 0.04148415739394154,
198
+ "acc_norm": 0.5625,
199
+ "acc_norm_stderr": 0.04148415739394154
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.39,
203
+ "acc_stderr": 0.04902071300001975,
204
+ "acc_norm": 0.39,
205
+ "acc_norm_stderr": 0.04902071300001975
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.78,
209
+ "acc_stderr": 0.04163331998932263,
210
+ "acc_norm": 0.78,
211
+ "acc_norm_stderr": 0.04163331998932263
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.5491329479768786,
215
+ "acc_stderr": 0.026788811931562767,
216
+ "acc_norm": 0.5491329479768786,
217
+ "acc_norm_stderr": 0.026788811931562767
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.6319018404907976,
221
+ "acc_stderr": 0.03789213935838396,
222
+ "acc_norm": 0.6319018404907976,
223
+ "acc_norm_stderr": 0.03789213935838396
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.5925925925925926,
227
+ "acc_stderr": 0.02733954664066273,
228
+ "acc_norm": 0.5925925925925926,
229
+ "acc_norm_stderr": 0.02733954664066273
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.4,
233
+ "acc_stderr": 0.049236596391733084,
234
+ "acc_norm": 0.4,
235
+ "acc_norm_stderr": 0.049236596391733084
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.7668393782383419,
239
+ "acc_stderr": 0.03051611137147601,
240
+ "acc_norm": 0.7668393782383419,
241
+ "acc_norm_stderr": 0.03051611137147601
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.4473684210526316,
245
+ "acc_stderr": 0.046774730044912,
246
+ "acc_norm": 0.4473684210526316,
247
+ "acc_norm_stderr": 0.046774730044912
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.726605504587156,
251
+ "acc_stderr": 0.01910929984609827,
252
+ "acc_norm": 0.726605504587156,
253
+ "acc_norm_stderr": 0.01910929984609827
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.3968253968253968,
257
+ "acc_stderr": 0.04375888492727061,
258
+ "acc_norm": 0.3968253968253968,
259
+ "acc_norm_stderr": 0.04375888492727061
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.6078431372549019,
263
+ "acc_stderr": 0.027956046165424516,
264
+ "acc_norm": 0.6078431372549019,
265
+ "acc_norm_stderr": 0.027956046165424516
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.55,
269
+ "acc_stderr": 0.05,
270
+ "acc_norm": 0.55,
271
+ "acc_norm_stderr": 0.05
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.6942148760330579,
275
+ "acc_stderr": 0.04205953933884122,
276
+ "acc_norm": 0.6942148760330579,
277
+ "acc_norm_stderr": 0.04205953933884122
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.618421052631579,
281
+ "acc_stderr": 0.03953173377749194,
282
+ "acc_norm": 0.618421052631579,
283
+ "acc_norm_stderr": 0.03953173377749194
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.5669934640522876,
287
+ "acc_stderr": 0.02004544247332422,
288
+ "acc_norm": 0.5669934640522876,
289
+ "acc_norm_stderr": 0.02004544247332422
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.4219858156028369,
293
+ "acc_stderr": 0.029462189233370586,
294
+ "acc_norm": 0.4219858156028369,
295
+ "acc_norm_stderr": 0.029462189233370586
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.5089285714285714,
299
+ "acc_stderr": 0.04745033255489123,
300
+ "acc_norm": 0.5089285714285714,
301
+ "acc_norm_stderr": 0.04745033255489123
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.4351851851851852,
305
+ "acc_stderr": 0.03381200005643526,
306
+ "acc_norm": 0.4351851851851852,
307
+ "acc_norm_stderr": 0.03381200005643526
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.3787709497206704,
311
+ "acc_stderr": 0.016223533510365117,
312
+ "acc_norm": 0.3787709497206704,
313
+ "acc_norm_stderr": 0.016223533510365117
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.47,
317
+ "acc_stderr": 0.05016135580465919,
318
+ "acc_norm": 0.47,
319
+ "acc_norm_stderr": 0.05016135580465919
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.66,
323
+ "acc_stderr": 0.04760952285695238,
324
+ "acc_norm": 0.66,
325
+ "acc_norm_stderr": 0.04760952285695238
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.48161764705882354,
329
+ "acc_stderr": 0.03035230339535196,
330
+ "acc_norm": 0.48161764705882354,
331
+ "acc_norm_stderr": 0.03035230339535196
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.6448979591836734,
335
+ "acc_stderr": 0.030635655150387634,
336
+ "acc_norm": 0.6448979591836734,
337
+ "acc_norm_stderr": 0.030635655150387634
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.729957805907173,
341
+ "acc_stderr": 0.028900721906293426,
342
+ "acc_norm": 0.729957805907173,
343
+ "acc_norm_stderr": 0.028900721906293426
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.41460234680573665,
347
+ "acc_stderr": 0.012582597058908284,
348
+ "acc_norm": 0.41460234680573665,
349
+ "acc_norm_stderr": 0.012582597058908284
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.6421568627450981,
353
+ "acc_stderr": 0.03364487286088298,
354
+ "acc_norm": 0.6421568627450981,
355
+ "acc_norm_stderr": 0.03364487286088298
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.6181818181818182,
359
+ "acc_stderr": 0.03793713171165635,
360
+ "acc_norm": 0.6181818181818182,
361
+ "acc_norm_stderr": 0.03793713171165635
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.6328029375764994,
365
+ "mc1_stderr": 0.01687480500145318,
366
+ "mc2": 0.7522925779273922,
367
+ "mc2_stderr": 0.014568927682929578
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.45218417945690675,
371
+ "acc_stderr": 0.017111567130916785,
372
+ "acc_norm": 0.45454545454545453,
373
+ "acc_norm_stderr": 0.017119172208061504
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "x2bee/POLAR-14B-DPO-v1.3",
442
+ "model_sha": "337edbed4c86db2da27e3b0e07086134f8d27a09",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
eval-results/x2bee/POLAR-14B-DPO-v1.4/result.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 7
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 7
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.7363481228668942,
11
+ "acc_stderr": 0.012875929151297058,
12
+ "acc_norm": 0.7491467576791809,
13
+ "acc_norm_stderr": 0.012668198621315433
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.7228639713204541,
17
+ "acc_stderr": 0.004466695023677848,
18
+ "acc_norm": 0.7422824138617805,
19
+ "acc_norm_stderr": 0.004364838000335614
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.6140350877192983,
23
+ "acc_stderr": 0.03733756969066164,
24
+ "acc_norm": 0.6140350877192983,
25
+ "acc_norm_stderr": 0.03733756969066164
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.6893203883495146,
29
+ "acc_stderr": 0.045821241601615506,
30
+ "acc_norm": 0.6893203883495146,
31
+ "acc_norm_stderr": 0.045821241601615506
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.6526181353767561,
35
+ "acc_stderr": 0.017026671748655728,
36
+ "acc_norm": 0.6526181353767561,
37
+ "acc_norm_stderr": 0.017026671748655728
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.5037037037037037,
41
+ "acc_stderr": 0.043192236258113324,
42
+ "acc_norm": 0.5037037037037037,
43
+ "acc_norm_stderr": 0.043192236258113324
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.37,
47
+ "acc_stderr": 0.048523658709391,
48
+ "acc_norm": 0.37,
49
+ "acc_norm_stderr": 0.048523658709391
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.451063829787234,
53
+ "acc_stderr": 0.032529096196131965,
54
+ "acc_norm": 0.451063829787234,
55
+ "acc_norm_stderr": 0.032529096196131965
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.4939759036144578,
59
+ "acc_stderr": 0.03892212195333045,
60
+ "acc_norm": 0.4939759036144578,
61
+ "acc_norm_stderr": 0.03892212195333045
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.5852090032154341,
65
+ "acc_stderr": 0.02798268045975956,
66
+ "acc_norm": 0.5852090032154341,
67
+ "acc_norm_stderr": 0.02798268045975956
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.6412556053811659,
71
+ "acc_stderr": 0.032190792004199956,
72
+ "acc_norm": 0.6412556053811659,
73
+ "acc_norm_stderr": 0.032190792004199956
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.5954198473282443,
77
+ "acc_stderr": 0.043046937953806645,
78
+ "acc_norm": 0.5954198473282443,
79
+ "acc_norm_stderr": 0.043046937953806645
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.47,
83
+ "acc_stderr": 0.05016135580465919,
84
+ "acc_norm": 0.47,
85
+ "acc_norm_stderr": 0.05016135580465919
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.6616161616161617,
89
+ "acc_stderr": 0.033711241426263014,
90
+ "acc_norm": 0.6616161616161617,
91
+ "acc_norm_stderr": 0.033711241426263014
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.4827586206896552,
95
+ "acc_stderr": 0.041641887201693775,
96
+ "acc_norm": 0.4827586206896552,
97
+ "acc_norm_stderr": 0.041641887201693775
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.2549019607843137,
101
+ "acc_stderr": 0.04336432707993178,
102
+ "acc_norm": 0.2549019607843137,
103
+ "acc_norm_stderr": 0.04336432707993178
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.5882352941176471,
107
+ "acc_stderr": 0.031968769891957786,
108
+ "acc_norm": 0.5882352941176471,
109
+ "acc_norm_stderr": 0.031968769891957786
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.6025641025641025,
113
+ "acc_stderr": 0.024811920017903836,
114
+ "acc_norm": 0.6025641025641025,
115
+ "acc_norm_stderr": 0.024811920017903836
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.66,
119
+ "acc_stderr": 0.04760952285695237,
120
+ "acc_norm": 0.66,
121
+ "acc_norm_stderr": 0.04760952285695237
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.35,
125
+ "acc_stderr": 0.047937248544110196,
126
+ "acc_norm": 0.35,
127
+ "acc_norm_stderr": 0.047937248544110196
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.5925925925925926,
131
+ "acc_stderr": 0.04750077341199984,
132
+ "acc_norm": 0.5925925925925926,
133
+ "acc_norm_stderr": 0.04750077341199984
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.43842364532019706,
137
+ "acc_stderr": 0.03491207857486518,
138
+ "acc_norm": 0.43842364532019706,
139
+ "acc_norm_stderr": 0.03491207857486518
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.567741935483871,
143
+ "acc_stderr": 0.028181739720019413,
144
+ "acc_norm": 0.567741935483871,
145
+ "acc_norm_stderr": 0.028181739720019413
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.7948717948717948,
149
+ "acc_stderr": 0.026453508054040356,
150
+ "acc_norm": 0.7948717948717948,
151
+ "acc_norm_stderr": 0.026453508054040356
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.5169811320754717,
155
+ "acc_stderr": 0.030755120364119905,
156
+ "acc_norm": 0.5169811320754717,
157
+ "acc_norm_stderr": 0.030755120364119905
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.5727272727272728,
161
+ "acc_stderr": 0.047381987035454834,
162
+ "acc_norm": 0.5727272727272728,
163
+ "acc_norm_stderr": 0.047381987035454834
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.3962962962962963,
167
+ "acc_stderr": 0.029822619458533997,
168
+ "acc_norm": 0.3962962962962963,
169
+ "acc_norm_stderr": 0.029822619458533997
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.3708609271523179,
173
+ "acc_stderr": 0.03943966699183629,
174
+ "acc_norm": 0.3708609271523179,
175
+ "acc_norm_stderr": 0.03943966699183629
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.6766169154228856,
179
+ "acc_stderr": 0.03307615947979035,
180
+ "acc_norm": 0.6766169154228856,
181
+ "acc_norm_stderr": 0.03307615947979035
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.49710982658959535,
185
+ "acc_stderr": 0.038124005659748335,
186
+ "acc_norm": 0.49710982658959535,
187
+ "acc_norm_stderr": 0.038124005659748335
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.42592592592592593,
191
+ "acc_stderr": 0.02546714904546955,
192
+ "acc_norm": 0.42592592592592593,
193
+ "acc_norm_stderr": 0.02546714904546955
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.5555555555555556,
197
+ "acc_stderr": 0.04155319955593146,
198
+ "acc_norm": 0.5555555555555556,
199
+ "acc_norm_stderr": 0.04155319955593146
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.4,
203
+ "acc_stderr": 0.04923659639173309,
204
+ "acc_norm": 0.4,
205
+ "acc_norm_stderr": 0.04923659639173309
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.73,
209
+ "acc_stderr": 0.044619604333847394,
210
+ "acc_norm": 0.73,
211
+ "acc_norm_stderr": 0.044619604333847394
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.5549132947976878,
215
+ "acc_stderr": 0.02675625512966377,
216
+ "acc_norm": 0.5549132947976878,
217
+ "acc_norm_stderr": 0.02675625512966377
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.588957055214724,
221
+ "acc_stderr": 0.038656978537853624,
222
+ "acc_norm": 0.588957055214724,
223
+ "acc_norm_stderr": 0.038656978537853624
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.5771604938271605,
227
+ "acc_stderr": 0.027487472980871595,
228
+ "acc_norm": 0.5771604938271605,
229
+ "acc_norm_stderr": 0.027487472980871595
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.4,
233
+ "acc_stderr": 0.049236596391733084,
234
+ "acc_norm": 0.4,
235
+ "acc_norm_stderr": 0.049236596391733084
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.7305699481865285,
239
+ "acc_stderr": 0.032018671228777947,
240
+ "acc_norm": 0.7305699481865285,
241
+ "acc_norm_stderr": 0.032018671228777947
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.42105263157894735,
245
+ "acc_stderr": 0.046446020912223177,
246
+ "acc_norm": 0.42105263157894735,
247
+ "acc_norm_stderr": 0.046446020912223177
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.7064220183486238,
251
+ "acc_stderr": 0.019525151122639663,
252
+ "acc_norm": 0.7064220183486238,
253
+ "acc_norm_stderr": 0.019525151122639663
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.3968253968253968,
257
+ "acc_stderr": 0.04375888492727061,
258
+ "acc_norm": 0.3968253968253968,
259
+ "acc_norm_stderr": 0.04375888492727061
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.545751633986928,
263
+ "acc_stderr": 0.02850980780262659,
264
+ "acc_norm": 0.545751633986928,
265
+ "acc_norm_stderr": 0.02850980780262659
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.55,
269
+ "acc_stderr": 0.05000000000000001,
270
+ "acc_norm": 0.55,
271
+ "acc_norm_stderr": 0.05000000000000001
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.6859504132231405,
275
+ "acc_stderr": 0.04236964753041019,
276
+ "acc_norm": 0.6859504132231405,
277
+ "acc_norm_stderr": 0.04236964753041019
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.6052631578947368,
281
+ "acc_stderr": 0.039777499346220734,
282
+ "acc_norm": 0.6052631578947368,
283
+ "acc_norm_stderr": 0.039777499346220734
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.5392156862745098,
287
+ "acc_stderr": 0.02016552331390791,
288
+ "acc_norm": 0.5392156862745098,
289
+ "acc_norm_stderr": 0.02016552331390791
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.35815602836879434,
293
+ "acc_stderr": 0.02860208586275942,
294
+ "acc_norm": 0.35815602836879434,
295
+ "acc_norm_stderr": 0.02860208586275942
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.4107142857142857,
299
+ "acc_stderr": 0.04669510663875192,
300
+ "acc_norm": 0.4107142857142857,
301
+ "acc_norm_stderr": 0.04669510663875192
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.44907407407407407,
305
+ "acc_stderr": 0.03392238405321617,
306
+ "acc_norm": 0.44907407407407407,
307
+ "acc_norm_stderr": 0.03392238405321617
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.3452513966480447,
311
+ "acc_stderr": 0.015901432608930354,
312
+ "acc_norm": 0.3452513966480447,
313
+ "acc_norm_stderr": 0.015901432608930354
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.43,
317
+ "acc_stderr": 0.049756985195624284,
318
+ "acc_norm": 0.43,
319
+ "acc_norm_stderr": 0.049756985195624284
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.66,
323
+ "acc_stderr": 0.04760952285695238,
324
+ "acc_norm": 0.66,
325
+ "acc_norm_stderr": 0.04760952285695238
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.45588235294117646,
329
+ "acc_stderr": 0.030254372573976694,
330
+ "acc_norm": 0.45588235294117646,
331
+ "acc_norm_stderr": 0.030254372573976694
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.6204081632653061,
335
+ "acc_stderr": 0.031067211262872457,
336
+ "acc_norm": 0.6204081632653061,
337
+ "acc_norm_stderr": 0.031067211262872457
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.6582278481012658,
341
+ "acc_stderr": 0.030874537537553617,
342
+ "acc_norm": 0.6582278481012658,
343
+ "acc_norm_stderr": 0.030874537537553617
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.4152542372881356,
347
+ "acc_stderr": 0.012585471793400667,
348
+ "acc_norm": 0.4152542372881356,
349
+ "acc_norm_stderr": 0.012585471793400667
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.5343137254901961,
353
+ "acc_stderr": 0.03501038327635896,
354
+ "acc_norm": 0.5343137254901961,
355
+ "acc_norm_stderr": 0.03501038327635896
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.5454545454545454,
359
+ "acc_stderr": 0.038881769216741004,
360
+ "acc_norm": 0.5454545454545454,
361
+ "acc_norm_stderr": 0.038881769216741004
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.4663402692778458,
365
+ "mc1_stderr": 0.01746379386716811,
366
+ "mc2": NaN,
367
+ "mc2_stderr": NaN
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.44037780401416765,
371
+ "acc_stderr": 0.01706769977431298,
372
+ "acc_norm": 0.44510035419126326,
373
+ "acc_norm_stderr": 0.01708641743100547
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "x2bee/POLAR-14B-DPO-v1.4",
442
+ "model_sha": "a6e64075fafaa3d5e393ff89c3cb26f9615e6de9",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
eval-results/x2bee/POLAR-14B-HES-DPO-v1.5/result.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 5
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 5
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.6638225255972696,
11
+ "acc_stderr": 0.013804855026205756,
12
+ "acc_norm": 0.7278156996587031,
13
+ "acc_norm_stderr": 0.013006600406423709
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.45648277235610435,
17
+ "acc_stderr": 0.004970846697552306,
18
+ "acc_norm": 0.6349332802230632,
19
+ "acc_norm_stderr": 0.004804649197163697
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.7309941520467836,
23
+ "acc_stderr": 0.0340105262010409,
24
+ "acc_norm": 0.7309941520467836,
25
+ "acc_norm_stderr": 0.0340105262010409
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.7766990291262136,
29
+ "acc_stderr": 0.04123553189891431,
30
+ "acc_norm": 0.7766990291262136,
31
+ "acc_norm_stderr": 0.04123553189891431
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.7343550446998723,
35
+ "acc_stderr": 0.01579430248788872,
36
+ "acc_norm": 0.7343550446998723,
37
+ "acc_norm_stderr": 0.01579430248788872
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.45185185185185184,
41
+ "acc_stderr": 0.04299268905480863,
42
+ "acc_norm": 0.45185185185185184,
43
+ "acc_norm_stderr": 0.04299268905480863
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.35,
47
+ "acc_stderr": 0.04793724854411019,
48
+ "acc_norm": 0.35,
49
+ "acc_norm_stderr": 0.04793724854411019
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.5276595744680851,
53
+ "acc_stderr": 0.03263597118409769,
54
+ "acc_norm": 0.5276595744680851,
55
+ "acc_norm_stderr": 0.03263597118409769
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.4759036144578313,
59
+ "acc_stderr": 0.03887971849597264,
60
+ "acc_norm": 0.4759036144578313,
61
+ "acc_norm_stderr": 0.03887971849597264
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.6559485530546624,
65
+ "acc_stderr": 0.026981478043648043,
66
+ "acc_norm": 0.6559485530546624,
67
+ "acc_norm_stderr": 0.026981478043648043
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.6412556053811659,
71
+ "acc_stderr": 0.032190792004199956,
72
+ "acc_norm": 0.6412556053811659,
73
+ "acc_norm_stderr": 0.032190792004199956
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.648854961832061,
77
+ "acc_stderr": 0.04186445163013751,
78
+ "acc_norm": 0.648854961832061,
79
+ "acc_norm_stderr": 0.04186445163013751
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.54,
83
+ "acc_stderr": 0.05009082659620333,
84
+ "acc_norm": 0.54,
85
+ "acc_norm_stderr": 0.05009082659620333
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.7777777777777778,
89
+ "acc_stderr": 0.029620227874790465,
90
+ "acc_norm": 0.7777777777777778,
91
+ "acc_norm_stderr": 0.029620227874790465
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.5103448275862069,
95
+ "acc_stderr": 0.04165774775728762,
96
+ "acc_norm": 0.5103448275862069,
97
+ "acc_norm_stderr": 0.04165774775728762
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.3627450980392157,
101
+ "acc_stderr": 0.04784060704105655,
102
+ "acc_norm": 0.3627450980392157,
103
+ "acc_norm_stderr": 0.04784060704105655
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.6680672268907563,
107
+ "acc_stderr": 0.03058869701378364,
108
+ "acc_norm": 0.6680672268907563,
109
+ "acc_norm_stderr": 0.03058869701378364
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.6384615384615384,
113
+ "acc_stderr": 0.024359581465397,
114
+ "acc_norm": 0.6384615384615384,
115
+ "acc_norm_stderr": 0.024359581465397
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.65,
119
+ "acc_stderr": 0.04793724854411021,
120
+ "acc_norm": 0.65,
121
+ "acc_norm_stderr": 0.04793724854411021
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.37,
125
+ "acc_stderr": 0.04852365870939099,
126
+ "acc_norm": 0.37,
127
+ "acc_norm_stderr": 0.04852365870939099
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.6851851851851852,
131
+ "acc_stderr": 0.04489931073591312,
132
+ "acc_norm": 0.6851851851851852,
133
+ "acc_norm_stderr": 0.04489931073591312
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.46798029556650245,
137
+ "acc_stderr": 0.035107665979592154,
138
+ "acc_norm": 0.46798029556650245,
139
+ "acc_norm_stderr": 0.035107665979592154
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.6548387096774193,
143
+ "acc_stderr": 0.02704574657353432,
144
+ "acc_norm": 0.6548387096774193,
145
+ "acc_norm_stderr": 0.02704574657353432
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.8162393162393162,
149
+ "acc_stderr": 0.025372139671722933,
150
+ "acc_norm": 0.8162393162393162,
151
+ "acc_norm_stderr": 0.025372139671722933
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.5773584905660377,
155
+ "acc_stderr": 0.03040233144576954,
156
+ "acc_norm": 0.5773584905660377,
157
+ "acc_norm_stderr": 0.03040233144576954
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.6454545454545455,
161
+ "acc_stderr": 0.045820048415054174,
162
+ "acc_norm": 0.6454545454545455,
163
+ "acc_norm_stderr": 0.045820048415054174
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.4074074074074074,
167
+ "acc_stderr": 0.029958249250082118,
168
+ "acc_norm": 0.4074074074074074,
169
+ "acc_norm_stderr": 0.029958249250082118
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.3509933774834437,
173
+ "acc_stderr": 0.03896981964257375,
174
+ "acc_norm": 0.3509933774834437,
175
+ "acc_norm_stderr": 0.03896981964257375
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.7263681592039801,
179
+ "acc_stderr": 0.03152439186555404,
180
+ "acc_norm": 0.7263681592039801,
181
+ "acc_norm_stderr": 0.03152439186555404
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.5375722543352601,
185
+ "acc_stderr": 0.0380168510452446,
186
+ "acc_norm": 0.5375722543352601,
187
+ "acc_norm_stderr": 0.0380168510452446
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.4365079365079365,
191
+ "acc_stderr": 0.025542846817400496,
192
+ "acc_norm": 0.4365079365079365,
193
+ "acc_norm_stderr": 0.025542846817400496
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.5694444444444444,
197
+ "acc_stderr": 0.04140685639111503,
198
+ "acc_norm": 0.5694444444444444,
199
+ "acc_norm_stderr": 0.04140685639111503
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.43,
203
+ "acc_stderr": 0.049756985195624284,
204
+ "acc_norm": 0.43,
205
+ "acc_norm_stderr": 0.049756985195624284
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.78,
209
+ "acc_stderr": 0.04163331998932263,
210
+ "acc_norm": 0.78,
211
+ "acc_norm_stderr": 0.04163331998932263
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.6098265895953757,
215
+ "acc_stderr": 0.026261677607806642,
216
+ "acc_norm": 0.6098265895953757,
217
+ "acc_norm_stderr": 0.026261677607806642
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.656441717791411,
221
+ "acc_stderr": 0.03731133519673893,
222
+ "acc_norm": 0.656441717791411,
223
+ "acc_norm_stderr": 0.03731133519673893
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.6574074074074074,
227
+ "acc_stderr": 0.02640614597362568,
228
+ "acc_norm": 0.6574074074074074,
229
+ "acc_norm_stderr": 0.02640614597362568
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.38,
233
+ "acc_stderr": 0.04878317312145632,
234
+ "acc_norm": 0.38,
235
+ "acc_norm_stderr": 0.04878317312145632
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.7668393782383419,
239
+ "acc_stderr": 0.03051611137147601,
240
+ "acc_norm": 0.7668393782383419,
241
+ "acc_norm_stderr": 0.03051611137147601
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.45614035087719296,
245
+ "acc_stderr": 0.046854730419077895,
246
+ "acc_norm": 0.45614035087719296,
247
+ "acc_norm_stderr": 0.046854730419077895
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.7853211009174312,
251
+ "acc_stderr": 0.017604304149256494,
252
+ "acc_norm": 0.7853211009174312,
253
+ "acc_norm_stderr": 0.017604304149256494
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.4523809523809524,
257
+ "acc_stderr": 0.044518079590553275,
258
+ "acc_norm": 0.4523809523809524,
259
+ "acc_norm_stderr": 0.044518079590553275
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.6405228758169934,
263
+ "acc_stderr": 0.027475969910660952,
264
+ "acc_norm": 0.6405228758169934,
265
+ "acc_norm_stderr": 0.027475969910660952
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.66,
269
+ "acc_stderr": 0.04760952285695237,
270
+ "acc_norm": 0.66,
271
+ "acc_norm_stderr": 0.04760952285695237
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.7933884297520661,
275
+ "acc_stderr": 0.03695980128098824,
276
+ "acc_norm": 0.7933884297520661,
277
+ "acc_norm_stderr": 0.03695980128098824
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.6842105263157895,
281
+ "acc_stderr": 0.0378272898086547,
282
+ "acc_norm": 0.6842105263157895,
283
+ "acc_norm_stderr": 0.0378272898086547
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.5964052287581699,
287
+ "acc_stderr": 0.019848280168401164,
288
+ "acc_norm": 0.5964052287581699,
289
+ "acc_norm_stderr": 0.019848280168401164
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.4397163120567376,
293
+ "acc_stderr": 0.02960991207559411,
294
+ "acc_norm": 0.4397163120567376,
295
+ "acc_norm_stderr": 0.02960991207559411
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.39285714285714285,
299
+ "acc_stderr": 0.04635550135609976,
300
+ "acc_norm": 0.39285714285714285,
301
+ "acc_norm_stderr": 0.04635550135609976
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.5787037037037037,
305
+ "acc_stderr": 0.03367462138896078,
306
+ "acc_norm": 0.5787037037037037,
307
+ "acc_norm_stderr": 0.03367462138896078
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.264804469273743,
311
+ "acc_stderr": 0.01475690648326066,
312
+ "acc_norm": 0.264804469273743,
313
+ "acc_norm_stderr": 0.01475690648326066
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.52,
317
+ "acc_stderr": 0.050211673156867795,
318
+ "acc_norm": 0.52,
319
+ "acc_norm_stderr": 0.050211673156867795
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.7,
323
+ "acc_stderr": 0.04605661864718381,
324
+ "acc_norm": 0.7,
325
+ "acc_norm_stderr": 0.04605661864718381
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.5588235294117647,
329
+ "acc_stderr": 0.03016191193076711,
330
+ "acc_norm": 0.5588235294117647,
331
+ "acc_norm_stderr": 0.03016191193076711
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.6448979591836734,
335
+ "acc_stderr": 0.030635655150387634,
336
+ "acc_norm": 0.6448979591836734,
337
+ "acc_norm_stderr": 0.030635655150387634
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.7426160337552743,
341
+ "acc_stderr": 0.028458820991460302,
342
+ "acc_norm": 0.7426160337552743,
343
+ "acc_norm_stderr": 0.028458820991460302
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.44654498044328556,
347
+ "acc_stderr": 0.012697046024399661,
348
+ "acc_norm": 0.44654498044328556,
349
+ "acc_norm_stderr": 0.012697046024399661
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.6225490196078431,
353
+ "acc_stderr": 0.03402272044340703,
354
+ "acc_norm": 0.6225490196078431,
355
+ "acc_norm_stderr": 0.03402272044340703
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.6303030303030303,
359
+ "acc_stderr": 0.03769430314512569,
360
+ "acc_norm": 0.6303030303030303,
361
+ "acc_norm_stderr": 0.03769430314512569
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.6634026927784578,
365
+ "mc1_stderr": 0.0165424128094949,
366
+ "mc2": 0.7515104740134964,
367
+ "mc2_stderr": 0.014200593490054807
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.5147579693034239,
371
+ "acc_stderr": 0.01718286443499856,
372
+ "acc_norm": 0.526564344746163,
373
+ "acc_norm_stderr": 0.017166075717577747
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "x2bee/POLAR-14B-HES-DPO-v1.5",
442
+ "model_sha": "f0bc8e2566ba28c8232d7c690098e634ea894e8d",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
eval-results/x2bee/POLAR-14B-SON-SFT-v0.1/result.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 3
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 3
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.6646757679180887,
11
+ "acc_stderr": 0.013796182947785564,
12
+ "acc_norm": 0.7244027303754266,
13
+ "acc_norm_stderr": 0.01305716965576184
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.46036646086436966,
17
+ "acc_stderr": 0.004974080638364276,
18
+ "acc_norm": 0.6195976897032464,
19
+ "acc_norm_stderr": 0.004844935327599196
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.7602339181286549,
23
+ "acc_stderr": 0.03274485211946956,
24
+ "acc_norm": 0.7602339181286549,
25
+ "acc_norm_stderr": 0.03274485211946956
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.7766990291262136,
29
+ "acc_stderr": 0.04123553189891431,
30
+ "acc_norm": 0.7766990291262136,
31
+ "acc_norm_stderr": 0.04123553189891431
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.7381864623243933,
35
+ "acc_stderr": 0.01572083867844526,
36
+ "acc_norm": 0.7381864623243933,
37
+ "acc_norm_stderr": 0.01572083867844526
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.5037037037037037,
41
+ "acc_stderr": 0.04319223625811331,
42
+ "acc_norm": 0.5037037037037037,
43
+ "acc_norm_stderr": 0.04319223625811331
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.35,
47
+ "acc_stderr": 0.04793724854411019,
48
+ "acc_norm": 0.35,
49
+ "acc_norm_stderr": 0.04793724854411019
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.5404255319148936,
53
+ "acc_stderr": 0.032579014820998335,
54
+ "acc_norm": 0.5404255319148936,
55
+ "acc_norm_stderr": 0.032579014820998335
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.5180722891566265,
59
+ "acc_stderr": 0.038899512528272166,
60
+ "acc_norm": 0.5180722891566265,
61
+ "acc_norm_stderr": 0.038899512528272166
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.6559485530546624,
65
+ "acc_stderr": 0.026981478043648043,
66
+ "acc_norm": 0.6559485530546624,
67
+ "acc_norm_stderr": 0.026981478043648043
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.6591928251121076,
71
+ "acc_stderr": 0.0318114974705536,
72
+ "acc_norm": 0.6591928251121076,
73
+ "acc_norm_stderr": 0.0318114974705536
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.6564885496183206,
77
+ "acc_stderr": 0.041649760719448786,
78
+ "acc_norm": 0.6564885496183206,
79
+ "acc_norm_stderr": 0.041649760719448786
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.52,
83
+ "acc_stderr": 0.050211673156867795,
84
+ "acc_norm": 0.52,
85
+ "acc_norm_stderr": 0.050211673156867795
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.7575757575757576,
89
+ "acc_stderr": 0.030532892233932036,
90
+ "acc_norm": 0.7575757575757576,
91
+ "acc_norm_stderr": 0.030532892233932036
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.5586206896551724,
95
+ "acc_stderr": 0.04137931034482757,
96
+ "acc_norm": 0.5586206896551724,
97
+ "acc_norm_stderr": 0.04137931034482757
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.3137254901960784,
101
+ "acc_stderr": 0.04617034827006717,
102
+ "acc_norm": 0.3137254901960784,
103
+ "acc_norm_stderr": 0.04617034827006717
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.6512605042016807,
107
+ "acc_stderr": 0.03095663632856655,
108
+ "acc_norm": 0.6512605042016807,
109
+ "acc_norm_stderr": 0.03095663632856655
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.6230769230769231,
113
+ "acc_stderr": 0.024570975364225995,
114
+ "acc_norm": 0.6230769230769231,
115
+ "acc_norm_stderr": 0.024570975364225995
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.73,
119
+ "acc_stderr": 0.04461960433384739,
120
+ "acc_norm": 0.73,
121
+ "acc_norm_stderr": 0.04461960433384739
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.32,
125
+ "acc_stderr": 0.04688261722621505,
126
+ "acc_norm": 0.32,
127
+ "acc_norm_stderr": 0.04688261722621505
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.7037037037037037,
131
+ "acc_stderr": 0.04414343666854933,
132
+ "acc_norm": 0.7037037037037037,
133
+ "acc_norm_stderr": 0.04414343666854933
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.4630541871921182,
137
+ "acc_stderr": 0.035083705204426656,
138
+ "acc_norm": 0.4630541871921182,
139
+ "acc_norm_stderr": 0.035083705204426656
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.603225806451613,
143
+ "acc_stderr": 0.027831231605767944,
144
+ "acc_norm": 0.603225806451613,
145
+ "acc_norm_stderr": 0.027831231605767944
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.8205128205128205,
149
+ "acc_stderr": 0.025140935950335435,
150
+ "acc_norm": 0.8205128205128205,
151
+ "acc_norm_stderr": 0.025140935950335435
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.5962264150943396,
155
+ "acc_stderr": 0.03019761160019795,
156
+ "acc_norm": 0.5962264150943396,
157
+ "acc_norm_stderr": 0.03019761160019795
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.6181818181818182,
161
+ "acc_stderr": 0.046534298079135075,
162
+ "acc_norm": 0.6181818181818182,
163
+ "acc_norm_stderr": 0.046534298079135075
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.37407407407407406,
167
+ "acc_stderr": 0.029502861128955293,
168
+ "acc_norm": 0.37407407407407406,
169
+ "acc_norm_stderr": 0.029502861128955293
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.32450331125827814,
173
+ "acc_stderr": 0.038227469376587525,
174
+ "acc_norm": 0.32450331125827814,
175
+ "acc_norm_stderr": 0.038227469376587525
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.7164179104477612,
179
+ "acc_stderr": 0.03187187537919796,
180
+ "acc_norm": 0.7164179104477612,
181
+ "acc_norm_stderr": 0.03187187537919796
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.5375722543352601,
185
+ "acc_stderr": 0.03801685104524458,
186
+ "acc_norm": 0.5375722543352601,
187
+ "acc_norm_stderr": 0.03801685104524458
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.42857142857142855,
191
+ "acc_stderr": 0.025487187147859372,
192
+ "acc_norm": 0.42857142857142855,
193
+ "acc_norm_stderr": 0.025487187147859372
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.5902777777777778,
197
+ "acc_stderr": 0.04112490974670787,
198
+ "acc_norm": 0.5902777777777778,
199
+ "acc_norm_stderr": 0.04112490974670787
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.45,
203
+ "acc_stderr": 0.049999999999999996,
204
+ "acc_norm": 0.45,
205
+ "acc_norm_stderr": 0.049999999999999996
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.78,
209
+ "acc_stderr": 0.04163331998932263,
210
+ "acc_norm": 0.78,
211
+ "acc_norm_stderr": 0.04163331998932263
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.6184971098265896,
215
+ "acc_stderr": 0.026152198619726803,
216
+ "acc_norm": 0.6184971098265896,
217
+ "acc_norm_stderr": 0.026152198619726803
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.6441717791411042,
221
+ "acc_stderr": 0.03761521380046734,
222
+ "acc_norm": 0.6441717791411042,
223
+ "acc_norm_stderr": 0.03761521380046734
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.6944444444444444,
227
+ "acc_stderr": 0.025630824975621365,
228
+ "acc_norm": 0.6944444444444444,
229
+ "acc_norm_stderr": 0.025630824975621365
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.43,
233
+ "acc_stderr": 0.049756985195624284,
234
+ "acc_norm": 0.43,
235
+ "acc_norm_stderr": 0.049756985195624284
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.7927461139896373,
239
+ "acc_stderr": 0.029252823291803638,
240
+ "acc_norm": 0.7927461139896373,
241
+ "acc_norm_stderr": 0.029252823291803638
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.43859649122807015,
245
+ "acc_stderr": 0.04668000738510455,
246
+ "acc_norm": 0.43859649122807015,
247
+ "acc_norm_stderr": 0.04668000738510455
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.7853211009174312,
251
+ "acc_stderr": 0.017604304149256494,
252
+ "acc_norm": 0.7853211009174312,
253
+ "acc_norm_stderr": 0.017604304149256494
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.3968253968253968,
257
+ "acc_stderr": 0.04375888492727062,
258
+ "acc_norm": 0.3968253968253968,
259
+ "acc_norm_stderr": 0.04375888492727062
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.6437908496732027,
263
+ "acc_stderr": 0.027420477662629245,
264
+ "acc_norm": 0.6437908496732027,
265
+ "acc_norm_stderr": 0.027420477662629245
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.63,
269
+ "acc_stderr": 0.04852365870939099,
270
+ "acc_norm": 0.63,
271
+ "acc_norm_stderr": 0.04852365870939099
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.7603305785123967,
275
+ "acc_stderr": 0.03896878985070415,
276
+ "acc_norm": 0.7603305785123967,
277
+ "acc_norm_stderr": 0.03896878985070415
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.625,
281
+ "acc_stderr": 0.039397364351956274,
282
+ "acc_norm": 0.625,
283
+ "acc_norm_stderr": 0.039397364351956274
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.619281045751634,
287
+ "acc_stderr": 0.019643801557924806,
288
+ "acc_norm": 0.619281045751634,
289
+ "acc_norm_stderr": 0.019643801557924806
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.45390070921985815,
293
+ "acc_stderr": 0.029700453247291467,
294
+ "acc_norm": 0.45390070921985815,
295
+ "acc_norm_stderr": 0.029700453247291467
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.41964285714285715,
299
+ "acc_stderr": 0.04684099321077106,
300
+ "acc_norm": 0.41964285714285715,
301
+ "acc_norm_stderr": 0.04684099321077106
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.5555555555555556,
305
+ "acc_stderr": 0.03388857118502326,
306
+ "acc_norm": 0.5555555555555556,
307
+ "acc_norm_stderr": 0.03388857118502326
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.3575418994413408,
311
+ "acc_stderr": 0.016029394474894893,
312
+ "acc_norm": 0.3575418994413408,
313
+ "acc_norm_stderr": 0.016029394474894893
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.52,
317
+ "acc_stderr": 0.050211673156867795,
318
+ "acc_norm": 0.52,
319
+ "acc_norm_stderr": 0.050211673156867795
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.75,
323
+ "acc_stderr": 0.04351941398892446,
324
+ "acc_norm": 0.75,
325
+ "acc_norm_stderr": 0.04351941398892446
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.5735294117647058,
329
+ "acc_stderr": 0.03004261583271486,
330
+ "acc_norm": 0.5735294117647058,
331
+ "acc_norm_stderr": 0.03004261583271486
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.6816326530612244,
335
+ "acc_stderr": 0.02982253379398204,
336
+ "acc_norm": 0.6816326530612244,
337
+ "acc_norm_stderr": 0.02982253379398204
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.7468354430379747,
341
+ "acc_stderr": 0.028304657943035293,
342
+ "acc_norm": 0.7468354430379747,
343
+ "acc_norm_stderr": 0.028304657943035293
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.455019556714472,
347
+ "acc_stderr": 0.012718456618701789,
348
+ "acc_norm": 0.455019556714472,
349
+ "acc_norm_stderr": 0.012718456618701789
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.6666666666666666,
353
+ "acc_stderr": 0.033086111132364364,
354
+ "acc_norm": 0.6666666666666666,
355
+ "acc_norm_stderr": 0.033086111132364364
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.6484848484848484,
359
+ "acc_stderr": 0.037282069986826503,
360
+ "acc_norm": 0.6484848484848484,
361
+ "acc_norm_stderr": 0.037282069986826503
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.605875152998776,
365
+ "mc1_stderr": 0.017106588140700332,
366
+ "mc2": 0.7254831072808595,
367
+ "mc2_stderr": 0.014162522228042162
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.5926800472255017,
371
+ "acc_stderr": 0.01689245669519127,
372
+ "acc_norm": 0.6269185360094451,
373
+ "acc_norm_stderr": 0.016627318275137453
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "x2bee/POLAR-14B-SON-SFT-v0.1",
442
+ "model_sha": "01286a13088332c1eda4279b5bcfa7a0a33e145f",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
eval-results/x2bee/POLAR-14B-v0.2/result.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 2
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 2
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.7465870307167235,
11
+ "acc_stderr": 0.012710896778378602,
12
+ "acc_norm": 0.7687713310580204,
13
+ "acc_norm_stderr": 0.012320858834772264
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.681736705835491,
17
+ "acc_stderr": 0.004648503177353952,
18
+ "acc_norm": 0.7999402509460267,
19
+ "acc_norm_stderr": 0.003992272261659531
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.6549707602339181,
23
+ "acc_stderr": 0.036459813773888065,
24
+ "acc_norm": 0.6549707602339181,
25
+ "acc_norm_stderr": 0.036459813773888065
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.7378640776699029,
29
+ "acc_stderr": 0.043546310772605956,
30
+ "acc_norm": 0.7378640776699029,
31
+ "acc_norm_stderr": 0.043546310772605956
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.6922094508301405,
35
+ "acc_stderr": 0.016506045045155633,
36
+ "acc_norm": 0.6922094508301405,
37
+ "acc_norm_stderr": 0.016506045045155633
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.4666666666666667,
41
+ "acc_stderr": 0.043097329010363554,
42
+ "acc_norm": 0.4666666666666667,
43
+ "acc_norm_stderr": 0.043097329010363554
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.35,
47
+ "acc_stderr": 0.047937248544110196,
48
+ "acc_norm": 0.35,
49
+ "acc_norm_stderr": 0.047937248544110196
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.4595744680851064,
53
+ "acc_stderr": 0.03257901482099836,
54
+ "acc_norm": 0.4595744680851064,
55
+ "acc_norm_stderr": 0.03257901482099836
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.4879518072289157,
59
+ "acc_stderr": 0.03891364495835821,
60
+ "acc_norm": 0.4879518072289157,
61
+ "acc_norm_stderr": 0.03891364495835821
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.6045016077170418,
65
+ "acc_stderr": 0.027770918531427834,
66
+ "acc_norm": 0.6045016077170418,
67
+ "acc_norm_stderr": 0.027770918531427834
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.6233183856502242,
71
+ "acc_stderr": 0.03252113489929188,
72
+ "acc_norm": 0.6233183856502242,
73
+ "acc_norm_stderr": 0.03252113489929188
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.6412213740458015,
77
+ "acc_stderr": 0.04206739313864908,
78
+ "acc_norm": 0.6412213740458015,
79
+ "acc_norm_stderr": 0.04206739313864908
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.51,
83
+ "acc_stderr": 0.05024183937956911,
84
+ "acc_norm": 0.51,
85
+ "acc_norm_stderr": 0.05024183937956911
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.7222222222222222,
89
+ "acc_stderr": 0.03191178226713547,
90
+ "acc_norm": 0.7222222222222222,
91
+ "acc_norm_stderr": 0.03191178226713547
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.5241379310344828,
95
+ "acc_stderr": 0.0416180850350153,
96
+ "acc_norm": 0.5241379310344828,
97
+ "acc_norm_stderr": 0.0416180850350153
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.3235294117647059,
101
+ "acc_stderr": 0.046550104113196177,
102
+ "acc_norm": 0.3235294117647059,
103
+ "acc_norm_stderr": 0.046550104113196177
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.6764705882352942,
107
+ "acc_stderr": 0.030388353551886793,
108
+ "acc_norm": 0.6764705882352942,
109
+ "acc_norm_stderr": 0.030388353551886793
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.6384615384615384,
113
+ "acc_stderr": 0.024359581465397,
114
+ "acc_norm": 0.6384615384615384,
115
+ "acc_norm_stderr": 0.024359581465397
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.65,
119
+ "acc_stderr": 0.0479372485441102,
120
+ "acc_norm": 0.65,
121
+ "acc_norm_stderr": 0.0479372485441102
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.31,
125
+ "acc_stderr": 0.04648231987117316,
126
+ "acc_norm": 0.31,
127
+ "acc_norm_stderr": 0.04648231987117316
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.6296296296296297,
131
+ "acc_stderr": 0.04668408033024931,
132
+ "acc_norm": 0.6296296296296297,
133
+ "acc_norm_stderr": 0.04668408033024931
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.4729064039408867,
137
+ "acc_stderr": 0.03512819077876105,
138
+ "acc_norm": 0.4729064039408867,
139
+ "acc_norm_stderr": 0.03512819077876105
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.5709677419354838,
143
+ "acc_stderr": 0.028156036538233193,
144
+ "acc_norm": 0.5709677419354838,
145
+ "acc_norm_stderr": 0.028156036538233193
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.8034188034188035,
149
+ "acc_stderr": 0.026035386098951292,
150
+ "acc_norm": 0.8034188034188035,
151
+ "acc_norm_stderr": 0.026035386098951292
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.5547169811320755,
155
+ "acc_stderr": 0.030588052974270655,
156
+ "acc_norm": 0.5547169811320755,
157
+ "acc_norm_stderr": 0.030588052974270655
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.6363636363636364,
161
+ "acc_stderr": 0.04607582090719976,
162
+ "acc_norm": 0.6363636363636364,
163
+ "acc_norm_stderr": 0.04607582090719976
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.3592592592592593,
167
+ "acc_stderr": 0.029252905927251976,
168
+ "acc_norm": 0.3592592592592593,
169
+ "acc_norm_stderr": 0.029252905927251976
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.3576158940397351,
173
+ "acc_stderr": 0.03913453431177258,
174
+ "acc_norm": 0.3576158940397351,
175
+ "acc_norm_stderr": 0.03913453431177258
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.6268656716417911,
179
+ "acc_stderr": 0.034198326081760065,
180
+ "acc_norm": 0.6268656716417911,
181
+ "acc_norm_stderr": 0.034198326081760065
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.48554913294797686,
185
+ "acc_stderr": 0.03810871630454764,
186
+ "acc_norm": 0.48554913294797686,
187
+ "acc_norm_stderr": 0.03810871630454764
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.4497354497354497,
191
+ "acc_stderr": 0.025620857042936648,
192
+ "acc_norm": 0.4497354497354497,
193
+ "acc_norm_stderr": 0.025620857042936648
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.6041666666666666,
197
+ "acc_stderr": 0.04089465449325582,
198
+ "acc_norm": 0.6041666666666666,
199
+ "acc_norm_stderr": 0.04089465449325582
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.32,
203
+ "acc_stderr": 0.046882617226215034,
204
+ "acc_norm": 0.32,
205
+ "acc_norm_stderr": 0.046882617226215034
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.71,
209
+ "acc_stderr": 0.045604802157206824,
210
+ "acc_norm": 0.71,
211
+ "acc_norm_stderr": 0.045604802157206824
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.5664739884393064,
215
+ "acc_stderr": 0.026680134761679217,
216
+ "acc_norm": 0.5664739884393064,
217
+ "acc_norm_stderr": 0.026680134761679217
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.6196319018404908,
221
+ "acc_stderr": 0.038142698932618374,
222
+ "acc_norm": 0.6196319018404908,
223
+ "acc_norm_stderr": 0.038142698932618374
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.6574074074074074,
227
+ "acc_stderr": 0.026406145973625686,
228
+ "acc_norm": 0.6574074074074074,
229
+ "acc_norm_stderr": 0.026406145973625686
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.37,
233
+ "acc_stderr": 0.04852365870939098,
234
+ "acc_norm": 0.37,
235
+ "acc_norm_stderr": 0.04852365870939098
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.7616580310880829,
239
+ "acc_stderr": 0.030748905363909895,
240
+ "acc_norm": 0.7616580310880829,
241
+ "acc_norm_stderr": 0.030748905363909895
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.5,
245
+ "acc_stderr": 0.047036043419179864,
246
+ "acc_norm": 0.5,
247
+ "acc_norm_stderr": 0.047036043419179864
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.7211009174311926,
251
+ "acc_stderr": 0.01922746887646353,
252
+ "acc_norm": 0.7211009174311926,
253
+ "acc_norm_stderr": 0.01922746887646353
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.42857142857142855,
257
+ "acc_stderr": 0.0442626668137991,
258
+ "acc_norm": 0.42857142857142855,
259
+ "acc_norm_stderr": 0.0442626668137991
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.5816993464052288,
263
+ "acc_stderr": 0.0282451340243873,
264
+ "acc_norm": 0.5816993464052288,
265
+ "acc_norm_stderr": 0.0282451340243873
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.73,
269
+ "acc_stderr": 0.044619604333847394,
270
+ "acc_norm": 0.73,
271
+ "acc_norm_stderr": 0.044619604333847394
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.7107438016528925,
275
+ "acc_stderr": 0.041391127276354626,
276
+ "acc_norm": 0.7107438016528925,
277
+ "acc_norm_stderr": 0.041391127276354626
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.6513157894736842,
281
+ "acc_stderr": 0.038781398887976104,
282
+ "acc_norm": 0.6513157894736842,
283
+ "acc_norm_stderr": 0.038781398887976104
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.5686274509803921,
287
+ "acc_stderr": 0.020036393768352624,
288
+ "acc_norm": 0.5686274509803921,
289
+ "acc_norm_stderr": 0.020036393768352624
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.45390070921985815,
293
+ "acc_stderr": 0.029700453247291477,
294
+ "acc_norm": 0.45390070921985815,
295
+ "acc_norm_stderr": 0.029700453247291477
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.4642857142857143,
299
+ "acc_stderr": 0.04733667890053756,
300
+ "acc_norm": 0.4642857142857143,
301
+ "acc_norm_stderr": 0.04733667890053756
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.5092592592592593,
305
+ "acc_stderr": 0.034093869469927006,
306
+ "acc_norm": 0.5092592592592593,
307
+ "acc_norm_stderr": 0.034093869469927006
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.37206703910614525,
311
+ "acc_stderr": 0.016165847583563295,
312
+ "acc_norm": 0.37206703910614525,
313
+ "acc_norm_stderr": 0.016165847583563295
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.43,
317
+ "acc_stderr": 0.049756985195624284,
318
+ "acc_norm": 0.43,
319
+ "acc_norm_stderr": 0.049756985195624284
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.71,
323
+ "acc_stderr": 0.045604802157206845,
324
+ "acc_norm": 0.71,
325
+ "acc_norm_stderr": 0.045604802157206845
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.5404411764705882,
329
+ "acc_stderr": 0.030273325077345755,
330
+ "acc_norm": 0.5404411764705882,
331
+ "acc_norm_stderr": 0.030273325077345755
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.6122448979591837,
335
+ "acc_stderr": 0.03119223072679566,
336
+ "acc_norm": 0.6122448979591837,
337
+ "acc_norm_stderr": 0.03119223072679566
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.7257383966244726,
341
+ "acc_stderr": 0.029041333510598025,
342
+ "acc_norm": 0.7257383966244726,
343
+ "acc_norm_stderr": 0.029041333510598025
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.4641460234680574,
347
+ "acc_stderr": 0.01273736131873058,
348
+ "acc_norm": 0.4641460234680574,
349
+ "acc_norm_stderr": 0.01273736131873058
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.6568627450980392,
353
+ "acc_stderr": 0.03332139944668086,
354
+ "acc_norm": 0.6568627450980392,
355
+ "acc_norm_stderr": 0.03332139944668086
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.6,
359
+ "acc_stderr": 0.03825460278380025,
360
+ "acc_norm": 0.6,
361
+ "acc_norm_stderr": 0.03825460278380025
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.7246022031823746,
365
+ "mc1_stderr": 0.01563813566777552,
366
+ "mc2": 0.8107575910195236,
367
+ "mc2_stderr": 0.013335029489665237
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.525383707201889,
371
+ "acc_stderr": 0.017168187201429253,
372
+ "acc_norm": 0.5442739079102715,
373
+ "acc_norm_stderr": 0.017122829143292655
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "x2bee/POLAR-14B-v0.2",
442
+ "model_sha": "8d905623a3972e11260420130039c62e115cbbaa",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
eval-results/x2bee/POLAR-14B-v0.5/result.json ADDED
@@ -0,0 +1,450 @@
1
+ {
2
+ "results": {
3
+ "daily": {
4
+ "daily": 1
5
+ },
6
+ "quarterly": {
7
+ "quarterly": 1
8
+ },
9
+ "harness|arc_challenge|25": {
10
+ "acc": 0.75,
11
+ "acc_stderr": 0.012653835621466646,
12
+ "acc_norm": 0.7798634812286689,
13
+ "acc_norm_stderr": 0.012108124883460988
14
+ },
15
+ "harness|hellaswag|10": {
16
+ "acc": 0.6500697072296355,
17
+ "acc_stderr": 0.004759729267943182,
18
+ "acc_norm": 0.775542720573591,
19
+ "acc_norm_stderr": 0.004163717220873764
20
+ },
21
+ "harness|mmlu_world_religions|5": {
22
+ "acc": 0.6374269005847953,
23
+ "acc_stderr": 0.036871306155620606,
24
+ "acc_norm": 0.6374269005847953,
25
+ "acc_norm_stderr": 0.036871306155620606
26
+ },
27
+ "harness|mmlu_management|5": {
28
+ "acc": 0.7087378640776699,
29
+ "acc_stderr": 0.044986763205729224,
30
+ "acc_norm": 0.7087378640776699,
31
+ "acc_norm_stderr": 0.044986763205729224
32
+ },
33
+ "harness|mmlu_miscellaneous|5": {
34
+ "acc": 0.6730523627075351,
35
+ "acc_stderr": 0.016774908180131484,
36
+ "acc_norm": 0.6730523627075351,
37
+ "acc_norm_stderr": 0.016774908180131484
38
+ },
39
+ "harness|mmlu_anatomy|5": {
40
+ "acc": 0.45185185185185184,
41
+ "acc_stderr": 0.04299268905480864,
42
+ "acc_norm": 0.45185185185185184,
43
+ "acc_norm_stderr": 0.04299268905480864
44
+ },
45
+ "harness|mmlu_abstract_algebra|5": {
46
+ "acc": 0.36,
47
+ "acc_stderr": 0.048241815132442176,
48
+ "acc_norm": 0.36,
49
+ "acc_norm_stderr": 0.048241815132442176
50
+ },
51
+ "harness|mmlu_conceptual_physics|5": {
52
+ "acc": 0.4723404255319149,
53
+ "acc_stderr": 0.03263597118409769,
54
+ "acc_norm": 0.4723404255319149,
55
+ "acc_norm_stderr": 0.03263597118409769
56
+ },
57
+ "harness|mmlu_virology|5": {
58
+ "acc": 0.46987951807228917,
59
+ "acc_stderr": 0.03885425420866766,
60
+ "acc_norm": 0.46987951807228917,
61
+ "acc_norm_stderr": 0.03885425420866766
62
+ },
63
+ "harness|mmlu_philosophy|5": {
64
+ "acc": 0.594855305466238,
65
+ "acc_stderr": 0.027882383791325963,
66
+ "acc_norm": 0.594855305466238,
67
+ "acc_norm_stderr": 0.027882383791325963
68
+ },
69
+ "harness|mmlu_human_aging|5": {
70
+ "acc": 0.6412556053811659,
71
+ "acc_stderr": 0.032190792004199956,
72
+ "acc_norm": 0.6412556053811659,
73
+ "acc_norm_stderr": 0.032190792004199956
74
+ },
75
+ "harness|mmlu_human_sexuality|5": {
76
+ "acc": 0.5954198473282443,
77
+ "acc_stderr": 0.043046937953806645,
78
+ "acc_norm": 0.5954198473282443,
79
+ "acc_norm_stderr": 0.043046937953806645
80
+ },
81
+ "harness|mmlu_medical_genetics|5": {
82
+ "acc": 0.5,
83
+ "acc_stderr": 0.050251890762960605,
84
+ "acc_norm": 0.5,
85
+ "acc_norm_stderr": 0.050251890762960605
86
+ },
87
+ "harness|mmlu_high_school_geography|5": {
88
+ "acc": 0.7272727272727273,
89
+ "acc_stderr": 0.03173071239071724,
90
+ "acc_norm": 0.7272727272727273,
91
+ "acc_norm_stderr": 0.03173071239071724
92
+ },
93
+ "harness|mmlu_electrical_engineering|5": {
94
+ "acc": 0.503448275862069,
95
+ "acc_stderr": 0.0416656757710158,
96
+ "acc_norm": 0.503448275862069,
97
+ "acc_norm_stderr": 0.0416656757710158
98
+ },
99
+ "harness|mmlu_college_physics|5": {
100
+ "acc": 0.3431372549019608,
101
+ "acc_stderr": 0.04724007352383888,
102
+ "acc_norm": 0.3431372549019608,
103
+ "acc_norm_stderr": 0.04724007352383888
104
+ },
105
+ "harness|mmlu_high_school_microeconomics|5": {
106
+ "acc": 0.6596638655462185,
107
+ "acc_stderr": 0.03077805742293167,
108
+ "acc_norm": 0.6596638655462185,
109
+ "acc_norm_stderr": 0.03077805742293167
110
+ },
111
+ "harness|mmlu_high_school_macroeconomics|5": {
112
+ "acc": 0.6102564102564103,
113
+ "acc_stderr": 0.024726967886647078,
114
+ "acc_norm": 0.6102564102564103,
115
+ "acc_norm_stderr": 0.024726967886647078
116
+ },
117
+ "harness|mmlu_computer_security|5": {
118
+ "acc": 0.67,
119
+ "acc_stderr": 0.047258156262526094,
120
+ "acc_norm": 0.67,
121
+ "acc_norm_stderr": 0.047258156262526094
122
+ },
123
+ "harness|mmlu_global_facts|5": {
124
+ "acc": 0.33,
125
+ "acc_stderr": 0.047258156262526045,
126
+ "acc_norm": 0.33,
127
+ "acc_norm_stderr": 0.047258156262526045
128
+ },
129
+ "harness|mmlu_jurisprudence|5": {
130
+ "acc": 0.6481481481481481,
131
+ "acc_stderr": 0.04616631111801714,
132
+ "acc_norm": 0.6481481481481481,
133
+ "acc_norm_stderr": 0.04616631111801714
134
+ },
135
+ "harness|mmlu_high_school_chemistry|5": {
136
+ "acc": 0.4729064039408867,
137
+ "acc_stderr": 0.03512819077876105,
138
+ "acc_norm": 0.4729064039408867,
139
+ "acc_norm_stderr": 0.03512819077876105
140
+ },
141
+ "harness|mmlu_high_school_biology|5": {
142
+ "acc": 0.5709677419354838,
143
+ "acc_stderr": 0.028156036538233193,
144
+ "acc_norm": 0.5709677419354838,
145
+ "acc_norm_stderr": 0.028156036538233193
146
+ },
147
+ "harness|mmlu_marketing|5": {
148
+ "acc": 0.7735042735042735,
149
+ "acc_stderr": 0.027421007295392943,
150
+ "acc_norm": 0.7735042735042735,
151
+ "acc_norm_stderr": 0.027421007295392943
152
+ },
153
+ "harness|mmlu_clinical_knowledge|5": {
154
+ "acc": 0.5660377358490566,
155
+ "acc_stderr": 0.030503292013342596,
156
+ "acc_norm": 0.5660377358490566,
157
+ "acc_norm_stderr": 0.030503292013342596
158
+ },
159
+ "harness|mmlu_public_relations|5": {
160
+ "acc": 0.6272727272727273,
161
+ "acc_stderr": 0.04631381319425465,
162
+ "acc_norm": 0.6272727272727273,
163
+ "acc_norm_stderr": 0.04631381319425465
164
+ },
165
+ "harness|mmlu_high_school_mathematics|5": {
166
+ "acc": 0.3333333333333333,
167
+ "acc_stderr": 0.0287420409039485,
168
+ "acc_norm": 0.3333333333333333,
169
+ "acc_norm_stderr": 0.0287420409039485
170
+ },
171
+ "harness|mmlu_high_school_physics|5": {
172
+ "acc": 0.39072847682119205,
173
+ "acc_stderr": 0.039837983066598075,
174
+ "acc_norm": 0.39072847682119205,
175
+ "acc_norm_stderr": 0.039837983066598075
176
+ },
177
+ "harness|mmlu_sociology|5": {
178
+ "acc": 0.6417910447761194,
179
+ "acc_stderr": 0.03390393042268814,
180
+ "acc_norm": 0.6417910447761194,
181
+ "acc_norm_stderr": 0.03390393042268814
182
+ },
183
+ "harness|mmlu_college_medicine|5": {
184
+ "acc": 0.5028901734104047,
185
+ "acc_stderr": 0.038124005659748335,
186
+ "acc_norm": 0.5028901734104047,
187
+ "acc_norm_stderr": 0.038124005659748335
188
+ },
189
+ "harness|mmlu_elementary_mathematics|5": {
190
+ "acc": 0.42857142857142855,
191
+ "acc_stderr": 0.025487187147859372,
192
+ "acc_norm": 0.42857142857142855,
193
+ "acc_norm_stderr": 0.025487187147859372
194
+ },
195
+ "harness|mmlu_college_biology|5": {
196
+ "acc": 0.6180555555555556,
197
+ "acc_stderr": 0.040629907841466674,
198
+ "acc_norm": 0.6180555555555556,
199
+ "acc_norm_stderr": 0.040629907841466674
200
+ },
201
+ "harness|mmlu_college_chemistry|5": {
202
+ "acc": 0.3,
203
+ "acc_stderr": 0.046056618647183814,
204
+ "acc_norm": 0.3,
205
+ "acc_norm_stderr": 0.046056618647183814
206
+ },
207
+ "harness|mmlu_us_foreign_policy|5": {
208
+ "acc": 0.72,
209
+ "acc_stderr": 0.04512608598542127,
210
+ "acc_norm": 0.72,
211
+ "acc_norm_stderr": 0.04512608598542127
212
+ },
213
+ "harness|mmlu_moral_disputes|5": {
214
+ "acc": 0.5809248554913294,
215
+ "acc_stderr": 0.026564178111422622,
216
+ "acc_norm": 0.5809248554913294,
217
+ "acc_norm_stderr": 0.026564178111422622
218
+ },
219
+ "harness|mmlu_logical_fallacies|5": {
220
+ "acc": 0.6257668711656442,
221
+ "acc_stderr": 0.03802068102899615,
222
+ "acc_norm": 0.6257668711656442,
223
+ "acc_norm_stderr": 0.03802068102899615
224
+ },
225
+ "harness|mmlu_prehistory|5": {
226
+ "acc": 0.5987654320987654,
227
+ "acc_stderr": 0.027272582849839803,
228
+ "acc_norm": 0.5987654320987654,
229
+ "acc_norm_stderr": 0.027272582849839803
230
+ },
231
+ "harness|mmlu_college_mathematics|5": {
232
+ "acc": 0.34,
233
+ "acc_stderr": 0.04760952285695235,
234
+ "acc_norm": 0.34,
235
+ "acc_norm_stderr": 0.04760952285695235
236
+ },
237
+ "harness|mmlu_high_school_government_and_politics|5": {
238
+ "acc": 0.7512953367875648,
239
+ "acc_stderr": 0.031195840877700304,
240
+ "acc_norm": 0.7512953367875648,
241
+ "acc_norm_stderr": 0.031195840877700304
242
+ },
243
+ "harness|mmlu_econometrics|5": {
244
+ "acc": 0.47368421052631576,
245
+ "acc_stderr": 0.046970851366478626,
246
+ "acc_norm": 0.47368421052631576,
247
+ "acc_norm_stderr": 0.046970851366478626
248
+ },
249
+ "harness|mmlu_high_school_psychology|5": {
250
+ "acc": 0.7229357798165138,
251
+ "acc_stderr": 0.019188482590169538,
252
+ "acc_norm": 0.7229357798165138,
253
+ "acc_norm_stderr": 0.019188482590169538
254
+ },
255
+ "harness|mmlu_formal_logic|5": {
256
+ "acc": 0.4523809523809524,
257
+ "acc_stderr": 0.044518079590553275,
258
+ "acc_norm": 0.4523809523809524,
259
+ "acc_norm_stderr": 0.044518079590553275
260
+ },
261
+ "harness|mmlu_nutrition|5": {
262
+ "acc": 0.5718954248366013,
263
+ "acc_stderr": 0.028332397483664278,
264
+ "acc_norm": 0.5718954248366013,
265
+ "acc_norm_stderr": 0.028332397483664278
266
+ },
267
+ "harness|mmlu_business_ethics|5": {
268
+ "acc": 0.68,
269
+ "acc_stderr": 0.04688261722621504,
270
+ "acc_norm": 0.68,
271
+ "acc_norm_stderr": 0.04688261722621504
272
+ },
273
+ "harness|mmlu_international_law|5": {
274
+ "acc": 0.7520661157024794,
275
+ "acc_stderr": 0.039418975265163025,
276
+ "acc_norm": 0.7520661157024794,
277
+ "acc_norm_stderr": 0.039418975265163025
278
+ },
279
+ "harness|mmlu_astronomy|5": {
280
+ "acc": 0.618421052631579,
281
+ "acc_stderr": 0.03953173377749194,
282
+ "acc_norm": 0.618421052631579,
283
+ "acc_norm_stderr": 0.03953173377749194
284
+ },
285
+ "harness|mmlu_professional_psychology|5": {
286
+ "acc": 0.5408496732026143,
287
+ "acc_stderr": 0.020160213617222516,
288
+ "acc_norm": 0.5408496732026143,
289
+ "acc_norm_stderr": 0.020160213617222516
290
+ },
291
+ "harness|mmlu_professional_accounting|5": {
292
+ "acc": 0.45390070921985815,
293
+ "acc_stderr": 0.029700453247291463,
294
+ "acc_norm": 0.45390070921985815,
295
+ "acc_norm_stderr": 0.029700453247291463
296
+ },
297
+ "harness|mmlu_machine_learning|5": {
298
+ "acc": 0.44642857142857145,
299
+ "acc_stderr": 0.04718471485219588,
300
+ "acc_norm": 0.44642857142857145,
301
+ "acc_norm_stderr": 0.04718471485219588
302
+ },
303
+ "harness|mmlu_high_school_statistics|5": {
304
+ "acc": 0.5416666666666666,
305
+ "acc_stderr": 0.03398110890294636,
306
+ "acc_norm": 0.5416666666666666,
307
+ "acc_norm_stderr": 0.03398110890294636
308
+ },
309
+ "harness|mmlu_moral_scenarios|5": {
310
+ "acc": 0.35195530726256985,
311
+ "acc_stderr": 0.01597266852368907,
312
+ "acc_norm": 0.35195530726256985,
313
+ "acc_norm_stderr": 0.01597266852368907
314
+ },
315
+ "harness|mmlu_college_computer_science|5": {
316
+ "acc": 0.44,
317
+ "acc_stderr": 0.0498887651569859,
318
+ "acc_norm": 0.44,
319
+ "acc_norm_stderr": 0.0498887651569859
320
+ },
321
+ "harness|mmlu_high_school_computer_science|5": {
322
+ "acc": 0.68,
323
+ "acc_stderr": 0.04688261722621503,
324
+ "acc_norm": 0.68,
325
+ "acc_norm_stderr": 0.04688261722621503
326
+ },
327
+ "harness|mmlu_professional_medicine|5": {
328
+ "acc": 0.5147058823529411,
329
+ "acc_stderr": 0.03035969707904612,
330
+ "acc_norm": 0.5147058823529411,
331
+ "acc_norm_stderr": 0.03035969707904612
332
+ },
333
+ "harness|mmlu_security_studies|5": {
334
+ "acc": 0.6122448979591837,
335
+ "acc_stderr": 0.031192230726795656,
336
+ "acc_norm": 0.6122448979591837,
337
+ "acc_norm_stderr": 0.031192230726795656
338
+ },
339
+ "harness|mmlu_high_school_world_history|5": {
340
+ "acc": 0.7215189873417721,
341
+ "acc_stderr": 0.029178682304842538,
342
+ "acc_norm": 0.7215189873417721,
343
+ "acc_norm_stderr": 0.029178682304842538
344
+ },
345
+ "harness|mmlu_professional_law|5": {
346
+ "acc": 0.4634941329856584,
347
+ "acc_stderr": 0.012736153390214963,
348
+ "acc_norm": 0.4634941329856584,
349
+ "acc_norm_stderr": 0.012736153390214963
350
+ },
351
+ "harness|mmlu_high_school_us_history|5": {
352
+ "acc": 0.6568627450980392,
353
+ "acc_stderr": 0.03332139944668086,
354
+ "acc_norm": 0.6568627450980392,
355
+ "acc_norm_stderr": 0.03332139944668086
356
+ },
357
+ "harness|mmlu_high_school_european_history|5": {
358
+ "acc": 0.5818181818181818,
359
+ "acc_stderr": 0.03851716319398393,
360
+ "acc_norm": 0.5818181818181818,
361
+ "acc_norm_stderr": 0.03851716319398393
362
+ },
363
+ "harness|truthfulqa_mc|0": {
364
+ "mc1": 0.7833537331701347,
365
+ "mc1_stderr": 0.014421468452506978,
366
+ "mc2": 0.8572574997405501,
367
+ "mc2_stderr": 0.01200311225898601
368
+ },
369
+ "harness|commongen_v2|2": {
370
+ "acc": 0.5159386068476978,
371
+ "acc_stderr": 0.017181617837190195,
372
+ "acc_norm": 0.5301062573789846,
373
+ "acc_norm_stderr": 0.01715916359017022
374
+ }
375
+ },
376
+ "versions": {
377
+ "all": 0,
378
+ "harness|arc_challenge|25": 0,
379
+ "harness|hellaswag|10": 0,
380
+ "harness|mmlu_world_religions|5": 1,
381
+ "harness|mmlu_management|5": 1,
382
+ "harness|mmlu_miscellaneous|5": 1,
383
+ "harness|mmlu_anatomy|5": 1,
384
+ "harness|mmlu_abstract_algebra|5": 1,
385
+ "harness|mmlu_conceptual_physics|5": 1,
386
+ "harness|mmlu_virology|5": 1,
387
+ "harness|mmlu_philosophy|5": 1,
388
+ "harness|mmlu_human_aging|5": 1,
389
+ "harness|mmlu_human_sexuality|5": 1,
390
+ "harness|mmlu_medical_genetics|5": 1,
391
+ "harness|mmlu_high_school_geography|5": 1,
392
+ "harness|mmlu_electrical_engineering|5": 1,
393
+ "harness|mmlu_college_physics|5": 1,
394
+ "harness|mmlu_high_school_microeconomics|5": 1,
395
+ "harness|mmlu_high_school_macroeconomics|5": 1,
396
+ "harness|mmlu_computer_security|5": 1,
397
+ "harness|mmlu_global_facts|5": 1,
398
+ "harness|mmlu_jurisprudence|5": 1,
399
+ "harness|mmlu_high_school_chemistry|5": 1,
400
+ "harness|mmlu_high_school_biology|5": 1,
401
+ "harness|mmlu_marketing|5": 1,
402
+ "harness|mmlu_clinical_knowledge|5": 1,
403
+ "harness|mmlu_public_relations|5": 1,
404
+ "harness|mmlu_high_school_mathematics|5": 1,
405
+ "harness|mmlu_high_school_physics|5": 1,
406
+ "harness|mmlu_sociology|5": 1,
407
+ "harness|mmlu_college_medicine|5": 1,
408
+ "harness|mmlu_elementary_mathematics|5": 1,
409
+ "harness|mmlu_college_biology|5": 1,
410
+ "harness|mmlu_college_chemistry|5": 1,
411
+ "harness|mmlu_us_foreign_policy|5": 1,
412
+ "harness|mmlu_moral_disputes|5": 1,
413
+ "harness|mmlu_logical_fallacies|5": 1,
414
+ "harness|mmlu_prehistory|5": 1,
415
+ "harness|mmlu_college_mathematics|5": 1,
416
+ "harness|mmlu_high_school_government_and_politics|5": 1,
417
+ "harness|mmlu_econometrics|5": 1,
418
+ "harness|mmlu_high_school_psychology|5": 1,
419
+ "harness|mmlu_formal_logic|5": 1,
420
+ "harness|mmlu_nutrition|5": 1,
421
+ "harness|mmlu_business_ethics|5": 1,
422
+ "harness|mmlu_international_law|5": 1,
423
+ "harness|mmlu_astronomy|5": 1,
424
+ "harness|mmlu_professional_psychology|5": 1,
425
+ "harness|mmlu_professional_accounting|5": 1,
426
+ "harness|mmlu_machine_learning|5": 1,
427
+ "harness|mmlu_high_school_statistics|5": 1,
428
+ "harness|mmlu_moral_scenarios|5": 1,
429
+ "harness|mmlu_college_computer_science|5": 1,
430
+ "harness|mmlu_high_school_computer_science|5": 1,
431
+ "harness|mmlu_professional_medicine|5": 1,
432
+ "harness|mmlu_security_studies|5": 1,
433
+ "harness|mmlu_high_school_world_history|5": 1,
434
+ "harness|mmlu_professional_law|5": 1,
435
+ "harness|mmlu_high_school_us_history|5": 1,
436
+ "harness|mmlu_high_school_european_history|5": 1,
437
+ "harness|truthfulqa_mc|0": 0,
438
+ "harness|commongen_v2|2": 1
439
+ },
440
+ "config_general": {
441
+ "model_name": "x2bee/POLAR-14B-v0.5",
442
+ "model_sha": "74a1ef65a8d650e5358be229def31688738d8c6a",
443
+ "model_dtype": "torch.float16",
444
+ "lighteval_sha": "",
445
+ "num_few_shot_default": 0,
446
+ "num_fewshot_seeds": 1,
447
+ "override_batch_size": 1,
448
+ "max_samples": null
449
+ }
450
+ }
requirements.txt ADDED
@@ -0,0 +1,19 @@
1
+ APScheduler==3.10.1
2
+ black==23.11.0
3
+ click==8.1.3
4
+ datasets==2.14.5
5
+ gradio==4.19.2
6
+ gradio_client==0.10.1
7
+ huggingface-hub>=0.18.0
8
+ matplotlib==3.7.1
9
+ numpy==1.24.2
10
+ pandas==2.0.0
11
+ plotly==5.14.1
12
+ python-dateutil==2.8.2
13
+ requests==2.28.2
14
+ sentencepiece
15
+ tqdm==4.65.0
16
+ transformers==4.38.2
17
+ tokenizers>=0.15.0
18
+ gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/[email protected] # CI !!!
19
+ torch
scripts/create_request_file.py ADDED
@@ -0,0 +1,107 @@
1
+ import json
2
+ import os
3
+ import pprint
4
+ import re
5
+ from datetime import datetime, timezone
6
+
7
+ import click
8
+ from colorama import Fore
9
+ from huggingface_hub import HfApi, snapshot_download
10
+
11
+ EVAL_REQUESTS_PATH = "eval-queue"
12
+ QUEUE_REPO = "open-ko-llm-leaderboard/requests"
13
+
14
+ precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
15
+ model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
16
+ weight_types = ("Original", "Delta", "Adapter")
17
+
18
+
19
+ def get_model_size(model_info, precision: str):
20
+ size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
21
+ try:
22
+ model_size = round(model_info.safetensors["total"] / 1e9, 3)
23
+ except (AttributeError, TypeError):
24
+ try:
25
+ size_match = re.search(size_pattern, model_info.modelId.lower())
26
+ model_size = size_match.group(0)
27
+ model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
28
+ except AttributeError:
29
+ return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
30
+
31
+ size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
32
+ model_size = size_factor * model_size
33
+ return model_size
34
+
35
+
36
+ def main():
37
+ api = HfApi()
38
+ current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
39
+ snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset")
40
+
41
+ model_name = click.prompt("Enter model name")
42
+ revision = click.prompt("Enter revision", default="main")
43
+ precision = click.prompt("Enter precision", default="float16", type=click.Choice(precisions))
44
+ model_type = click.prompt("Enter model type", type=click.Choice(model_types))
45
+ weight_type = click.prompt("Enter weight type", default="Original", type=click.Choice(weight_types))
46
+ base_model = click.prompt("Enter base model", default="")
47
+ status = click.prompt("Enter status", default="FINISHED")
48
+
49
+ try:
50
+ model_info = api.model_info(repo_id=model_name, revision=revision)
51
+ except Exception as e:
52
+ print(f"{Fore.RED}Could not find model info for {model_name} on the Hub\n{e}{Fore.RESET}")
53
+ return 1
54
+
55
+ model_size = get_model_size(model_info=model_info, precision=precision)
56
+
57
+ try:
58
+ license = model_info.cardData["license"]
59
+ except Exception:
60
+ license = "?"
61
+
62
+ eval_entry = {
63
+ "model": model_name,
64
+ "base_model": base_model,
65
+ "revision": revision,
66
+ "private": False,
67
+ "precision": precision,
68
+ "weight_type": weight_type,
69
+ "status": status,
70
+ "submitted_time": current_time,
71
+ "model_type": model_type,
72
+ "likes": model_info.likes,
73
+ "params": model_size,
74
+ "license": license,
75
+ }
76
+
77
+ user_name = ""
78
+ model_path = model_name
79
+ if "/" in model_name:
80
+ user_name = model_name.split("/")[0]
81
+ model_path = model_name.split("/")[1]
82
+
83
+ pprint.pprint(eval_entry)
84
+
85
+ if click.confirm("Do you want to continue? This request file will be pushed to the hub"):
86
+ click.echo("continuing...")
87
+
88
+ out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
89
+ os.makedirs(out_dir, exist_ok=True)
90
+ out_path = f"{out_dir}/{model_path}_eval_request_{False}_{precision}_{weight_type}.json"
91
+
92
+ with open(out_path, "w") as f:
93
+ f.write(json.dumps(eval_entry))
94
+
95
+ api.upload_file(
96
+ path_or_fileobj=out_path,
97
+ path_in_repo=out_path.split(f"{EVAL_REQUESTS_PATH}/")[1],
98
+ repo_id=QUEUE_REPO,
99
+ repo_type="dataset",
100
+ commit_message=f"Add {model_name} to eval queue",
101
+ )
102
+ else:
103
+ click.echo("aborting...")
104
+
105
+
106
+ if __name__ == "__main__":
107
+ main()
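The `get_model_size` helper above falls back to parsing the parameter count out of the repo id when no safetensors metadata is available. A minimal offline sketch of that fallback (not part of this commit; the model id is invented and the import assumes the repository root is on `PYTHONPATH`):

```python
# Hypothetical check of get_model_size's fallback path: no safetensors
# metadata, so the size is parsed out of the (made-up) repo id.
from types import SimpleNamespace

from scripts.create_request_file import get_model_size

fake_info = SimpleNamespace(safetensors=None, modelId="my-org/My-Model-7B")
print(get_model_size(fake_info, precision="float16"))  # expected: 7.0
```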
scripts/update_request_files.py ADDED
@@ -0,0 +1,82 @@
1
+ import json
2
+ import os
3
+ import glob
4
+ import pprint
5
+ import re
6
+ from datetime import datetime, timezone
7
+
8
+ import click
9
+ from colorama import Fore
10
+ from huggingface_hub import HfApi, snapshot_download
11
+ from huggingface_hub.hf_api import ModelInfo
12
+
13
+ API = HfApi()
14
+
15
+
16
+ def get_model_size(model_info: ModelInfo, precision: str):
17
+ size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
18
+ try:
19
+ model_size = round(model_info.safetensors["total"] / 1e9, 3)
20
+ except (AttributeError, TypeError):
21
+ try:
22
+ size_match = re.search(size_pattern, model_info.modelId.split("/")[-1].lower())
23
+ model_size = size_match.group(0)
24
+ model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
25
+ except AttributeError:
26
+ return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
27
+
28
+ size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.split("/")[-1].lower()) else 1
29
+ model_size = size_factor * model_size
30
+ return model_size
31
+
32
+
33
+ def update_request_files(requests_path):
34
+ request_files = os.path.join(
35
+ requests_path, "*/*.json"
36
+ )
37
+ request_files = glob.glob(request_files)
38
+
39
+ request_files = sorted(request_files, reverse=True)
40
+ for tmp_request_file in request_files:
41
+ with open(tmp_request_file, "r") as f:
42
+ req_content = json.load(f)
43
+ new_req_content = add_model_info(req_content)
44
+
45
+ # if new content is different, update the file
46
+ if new_req_content != req_content:
47
+ with open(tmp_request_file, "w") as f:
48
+ f.write(json.dumps(new_req_content, indent=4))
49
+
50
+ def add_model_info(entry):
51
+
52
+ model = entry["model"]
53
+ revision = entry["revision"]
54
+
55
+ try:
56
+ model_info = API.model_info(repo_id=model, revision=revision)
57
+ except Exception:
58
+ print(f"Could not get model information for {model} revision {revision}")
59
+ return entry
60
+
61
+ new_entry = entry.copy()
62
+
63
+ model_size = get_model_size(model_info=model_info, precision='float16')
64
+ new_entry["params"] = model_size
65
+
66
+ new_entry["likes"] = model_info.likes
67
+
68
+ # Were the model card and license filled?
69
+ try:
70
+ license = model_info.cardData["license"]
71
+ new_entry["license"] = license
72
+ except Exception:
73
+ print(f"No license for {model} revision {revision}")
74
+
75
+ print(json.dumps(new_entry, indent=4))
76
+ return new_entry
77
+
78
+
79
+ if __name__ == "__main__":
80
+ # update_request_files("/Users/sean/workspace/leaderboard/leaderboard-test-requests")
81
+ update_request_files("/Volumes/Data-case-sensitive/requests")
82
+
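To see what `add_model_info` fills into a request entry without touching the Hub, the module-level `API` client can be stubbed out. A hedged sketch (every value below is invented):

```python
# Hypothetical offline run of add_model_info with a stubbed Hub client.
from types import SimpleNamespace

import scripts.update_request_files as urf

urf.API = SimpleNamespace(
    model_info=lambda repo_id, revision: SimpleNamespace(
        safetensors={"total": 14_000_000_000},  # pretend 14B parameters
        modelId=repo_id,
        likes=3,
        cardData={"license": "apache-2.0"},
    )
)

entry = {"model": "my-org/my-model-14b", "revision": "main"}
updated = urf.add_model_info(entry)  # adds params (14.0), likes and license
```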
src/__pycache__/envs.cpython-310.pyc ADDED
Binary file (1.07 kB)

src/__pycache__/populate.cpython-310.pyc ADDED
Binary file (2.93 kB)

src/display/__pycache__/about.cpython-310.pyc ADDED
Binary file (5.36 kB)

src/display/__pycache__/css_html_js.cpython-310.pyc ADDED
Binary file (1.69 kB)

src/display/__pycache__/formatting.cpython-310.pyc ADDED
Binary file (1.78 kB)

src/display/__pycache__/utils.cpython-310.pyc ADDED
Binary file (5.94 kB)
 
src/display/about.py ADDED
@@ -0,0 +1,84 @@
1
+ from src.display.utils import ModelType
2
+
3
+
4
+ TITLE = """<img src="https://i.postimg.cc/250G53CJ/src-display-SIL-logo.png" style="width:30%;display:block;margin-left:auto;margin-right:auto">"""
5
+
6
+ INTRODUCTION_TEXT = f"""
7
+ Welcome to the Self-Improving Leaderboard (SIL) - A Revolutionary Platform for Evaluating Large Language Models
8
+ The SIL offers a dynamic approach to assessing and ranking open-source LLMs and chatbots. Our innovative system continuously updates test datasets and recalculates rankings daily, ensuring evaluations reflect the rapid evolution of language processing capabilities.
9
+ Key Features:
10
+ • Daily-refreshed test datasets
11
+ • Adaptive ranking system
12
+ • Real-world language processing challenges
13
+ • Comprehensive model performance insights
14
+ Explore our cutting-edge evaluation process, gain deep insights into model capabilities, and see how different LLMs compare in this ever-changing landscape.
15
+ Ready to participate? Submit your model for evaluation on the 'Submit' page and join the forefront of LLM advancement. For a detailed look at our methodology, visit the 'About' page.
16
+ The SIL is proudly developed and maintained by [Your Organization/Team Name]. Together, let's push the boundaries of language AI!
17
+ """
18
+
19
+ LLM_BENCHMARKS_TEXT = f"""
20
+ # How it works
21
+ 🔄 The Self-Improving Leaderboard (SIL) operates on a dynamic evaluation system that continuously evolves to reflect real-world language processing challenges. Here's an overview of our process:
22
+ ### Daily Dataset Refresh
23
+ - Our system generates new test data daily from diverse, reputable sources.
24
+ - Advanced Large Language Models (LLMs) are utilized to synthesize additional relevant content.
25
+ - The dataset is divided into two sections:
26
+   - A primary dataset maintaining the integrity of sourced data
27
+   - A noise-injected dataset simulating real-world data complexities
28
+ ### Model Evaluation
29
+ - Participating models are rigorously evaluated against the refreshed dataset every 24 hours.
30
+ - We employ a comprehensive set of metrics aligned with industry-standard benchmarks.
31
+ - Our evaluation framework is built on the Eleuther AI Language Model Evaluation Harness, ensuring a robust and consistent assessment.
32
+ ### Ranking System
33
+ - Model rankings are updated daily based on their performance across various tasks.
34
+ - The leaderboard reflects not only the latest scores but also tracks consistency and adaptability over time.
35
+ ### Quarterly Comprehensive Evaluation
36
+ - Every three months, we conduct an in-depth analysis of model performance.
37
+ - This evaluation considers long-term trends, adaptability to evolving data, and overall efficacy.
38
+ - Special recognition (e.g., medals or badges) may be awarded based on sustained excellence.
39
+ By continuously refreshing our test data and evaluation criteria, SIL aims to provide a more accurate representation of model performance in real-world scenarios, driving innovation in the field of Natural Language Processing.
40
+
41
+ ## Icons
42
+ {ModelType.PT.to_str(" : ")} model
43
+ {ModelType.IFT.to_str(" : ")} model
44
+ {ModelType.RL.to_str(" : ")} model
45
+ If there is no icon, it indicates that there is insufficient information about the model.
46
+ Please provide information about the model through an issue! 🤩
47
+
48
+ ## Details and Logs
49
+ - Detailed numerical results in the `results` dataset: https://huggingface.co/datasets/junkim100/SIL_results
50
+ - Community queries and running status in the `requests` dataset: https://huggingface.co/datasets/junkim100/SIL_requests
51
+ """
52
+
53
+ EVALUATION_QUEUE_TEXT = f"""
54
+ # Evaluation Queue for the 🔄 Self-Improving Leaderboard
55
+
56
+ ## Some good practices before submitting a model
57
+
58
+ ### 1️⃣ Make sure you can load your model and tokenizer using AutoClasses
59
+ ```python
60
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
61
+ config = AutoConfig.from_pretrained("your model name", revision=revision)
62
+ model = AutoModel.from_pretrained("your model name", revision=revision)
63
+ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
64
+ ```
65
+
66
+ If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
67
+
68
+ ⚠️ Make sure your model is public!
69
+
70
+ ⚠️ Make sure your model runs with the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness)
71
+
72
+
73
+ ### 2️⃣ Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
74
+ It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
75
+
76
+ ### 3️⃣ Make sure your model has an open license!
77
+ We'd love for as many people as possible to know they can use your model.
78
+
79
+ ### 4️⃣ Fill up your model card
80
+ When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
81
+
82
+ ## In case of model failure
83
+ If your model is displayed in the `FAILED` category, its execution stopped. Make sure you have followed the above steps first. If everything is done, check that you can run the Eleuther AI Language Model Evaluation Harness on your model locally (you can add `--limit` to limit the number of examples per task).
84
+ """
src/display/css_html_js.py ADDED
@@ -0,0 +1,84 @@
1
+ custom_css = """
2
+ /* Hides the final AutoEvalColumn */
3
+ #llm-benchmark-tab-table table td:last-child,
4
+ #llm-benchmark-tab-table table th:last-child {
5
+ display: none;
6
+ }
7
+
8
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
9
+ table td:first-child,
10
+ table th:first-child {
11
+ max-width: 400px;
12
+ overflow: auto;
13
+ white-space: nowrap;
14
+ }
15
+
16
+ /* Full width space */
17
+ .gradio-container {
18
+ max-width: 95%!important;
19
+ }
20
+
21
+ /* Text style and margins */
22
+ .markdown-text {
23
+ font-size: 16px !important;
24
+ }
25
+
26
+ #models-to-add-text {
27
+ font-size: 18px !important;
28
+ }
29
+
30
+ #search-bar-table-box > div:first-child {
31
+ background: none;
32
+ border: none;
33
+ }
34
+
35
+ #search-bar {
36
+ padding: 0px;
37
+ }
38
+
39
+ .tab-buttons button {
40
+ font-size: 20px;
41
+ }
42
+
43
+ /* Filters style */
44
+ #filter_type{
45
+ border: 0;
46
+ padding-left: 0;
47
+ padding-top: 0;
48
+ }
49
+ #filter_type label {
50
+ display: flex;
51
+ }
52
+ #filter_type label > span{
53
+ margin-top: var(--spacing-lg);
54
+ margin-right: 0.5em;
55
+ }
56
+ #filter_type label > .wrap{
57
+ width: 103px;
58
+ }
59
+ #filter_type label > .wrap .wrap-inner{
60
+ padding: 2px;
61
+ }
62
+ #filter_type label > .wrap .wrap-inner input{
63
+ width: 1px
64
+ }
65
+ #filter-columns-type{
66
+ border:0;
67
+ padding: 0.5em;
68
+ }
69
+ #filter-columns-size{
70
+ border:0;
71
+ padding: 0.5em;
72
+ }
73
+ #box-filter > .form{
74
+ border: 0
75
+ }
76
+ """
77
+
78
+ get_window_url_params = """
79
+ function(url_params) {
80
+ const params = new URLSearchParams(window.location.search);
81
+ url_params = Object.fromEntries(params);
82
+ return url_params;
83
+ }
84
+ """
src/display/formatting.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datetime import datetime, timezone
3
+
4
+ from huggingface_hub import HfApi
5
+ from huggingface_hub.hf_api import ModelInfo
6
+
7
+
8
+ API = HfApi()
9
+
10
+ def model_hyperlink(link, model_name):
11
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
12
+
13
+
14
+ def make_clickable_model(model_name):
15
+ link = f"https://huggingface.co/{model_name}"
16
+
17
+ details_model_name = model_name.replace("/", "__")
18
+ details_link = f"https://huggingface.co/datasets/open-ko-llm-leaderboard/details_{details_model_name}"
19
+
20
+ return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
21
+
22
+
23
+ def styled_error(error):
24
+ return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
25
+
26
+
27
+ def styled_warning(warn):
28
+ return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
29
+
30
+
31
+ def styled_message(message):
32
+ return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
33
+
34
+
35
+ def has_no_nan_values(df, columns):
36
+ return df[columns].notna().all(axis=1)
37
+
38
+
39
+ def has_nan_values(df, columns):
40
+ return df[columns].isna().any(axis=1)
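
A quick, illustrative look at what these helpers produce (the values are made up):

```python
import pandas as pd

from src.display.formatting import has_no_nan_values, make_clickable_model

print(make_clickable_model("org/model"))  # hub link plus a 📑 details link

df = pd.DataFrame({"ARC": [0.5, None], "MMLU": [0.4, 0.6]})
print(has_no_nan_values(df, ["ARC", "MMLU"]).tolist())  # [True, False]
```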
src/display/utils.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, make_dataclass
2
+ from enum import Enum
3
+
4
+ import pandas as pd
5
+
6
+ def fields(raw_class):
7
+ return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
8
+
9
+
10
+ @dataclass
11
+ class Task:
12
+ benchmark: str
13
+ metric: str
14
+ col_name: str
15
+
16
+ class Tasks(Enum):
17
+ arc = Task("arc_challenge", "acc_norm", "ARC")
18
+ hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
19
+ mmlu = Task("mmlu", "acc", "MMLU")
20
+ truthfulqa = Task("truthfulqa_mc", "mc2", "TruthfulQA")
21
+ # winogrande = Task("winogrande", "acc_norm", "Winogrande")
22
+ # gsm8k = Task("gsm8k", "acc_norm", "GSM8k")
23
+ commongen_v2 = Task("commongen_v2", "acc_norm", "CommonGen V2")
24
+ # eqBench = Task("eq_bench", "acc_norm", "EQ Bench")
25
+ # instFollow = Task("inst_follow", "acc_norm", "InstFollow")
26
+ # harmlessness = Task("harmlessness", "acc_norm", "Harmlessness")
27
+ # helpfulness = Task("helpfulness", "acc_norm", "Helpfulness")
28
+
29
+ class Ranks(Enum):
30
+ daily = Task("daily", "daily", "Daily Rank")
31
+ quarterly = Task("quarterly", "quarterly", "Quarterly Rank")
32
+
33
+
34
+ # These classes are for user facing column names,
35
+ # to avoid having to change them all around the code
36
+ # when a modif is needed
37
+ @dataclass
38
+ class ColumnContent:
39
+ name: str
40
+ type: str
41
+ displayed_by_default: bool
42
+ hidden: bool = False
43
+ never_hidden: bool = False
44
+ dummy: bool = False
45
+
46
+ auto_eval_column_dict = []
47
+ # Init
48
+ auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
49
+ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
50
+ # Ranks
51
+ auto_eval_column_dict.append(["daily", ColumnContent, ColumnContent("Daily Rank", "number", True)])
52
+ auto_eval_column_dict.append(["quarterly", ColumnContent, ColumnContent("Quarterly Rank", "number", True)])
53
+ # Scores
54
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
55
+ for task in Tasks:
56
+ auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
57
+ # Model information
58
+ auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
59
+ auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
60
+ auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
61
+ auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
62
+ auto_eval_column_dict.append(["merged", ColumnContent, ColumnContent("Merged", "bool", False)])
63
+ auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
64
+ auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
65
+ auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
66
+ auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
67
+ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
68
+ auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, False)])
69
+ # Dummy column for the search bar (hidden by the custom CSS)
70
+ auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
71
+
72
+ # We use make dataclass to dynamically fill the scores from Tasks
73
+ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
74
+
75
+
76
+ @dataclass(frozen=True)
77
+ class EvalQueueColumn: # Queue column
78
+ model = ColumnContent("model", "markdown", True)
79
+ revision = ColumnContent("revision", "str", True)
80
+ private = ColumnContent("private", "bool", True)
81
+ precision = ColumnContent("precision", "str", True)
82
+ weight_type = ColumnContent("weight_type", "str", "Original")
83
+ status = ColumnContent("status", "str", True)
84
+
85
+ # Define the human baselines
86
+ human_baseline_row = {
87
+ AutoEvalColumn.model.name: "<p>Human performance</p>",
88
+ }
89
+
90
+ @dataclass
91
+ class ModelDetails:
92
+ name: str
93
+ symbol: str = "" # emoji, only for the model type
94
+
95
+
96
+ class ModelType(Enum):
97
+ PT = ModelDetails(name="pretrained", symbol="🟢")
98
+ # FT = ModelDetails(name="fine-tuned", symbol="🔶")
99
+ IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
100
+ RL = ModelDetails(name="RL-tuned", symbol="🟦")
101
+ Unknown = ModelDetails(name="", symbol="?")
102
+
103
+ def to_str(self, separator=" "):
104
+ return f"{self.value.symbol}{separator}{self.value.name}"
105
+
106
+ @staticmethod
107
+ def from_str(type):
108
+ # if "fine-tuned" in type or "🔶" in type:
109
+ # return ModelType.FT
110
+ if "pretrained" in type or "🟢" in type:
111
+ return ModelType.PT
112
+ if "RL-tuned" in type or "🟦" in type:
113
+ return ModelType.RL
114
+ if "instruction-tuned" in type or "⭕" in type:
115
+ return ModelType.IFT
116
+ return ModelType.Unknown
117
+
118
+ class WeightType(Enum):
119
+ Adapter = ModelDetails("Adapter")
120
+ Original = ModelDetails("Original")
121
+ Delta = ModelDetails("Delta")
122
+
123
+ class Precision(Enum):
124
+ float16 = ModelDetails("float16")
125
+ # bfloat16 = ModelDetails("bfloat16")
126
+ # qt_8bit = ModelDetails("8bit")
127
+ # qt_4bit = ModelDetails("4bit")
128
+ # qt_GPTQ = ModelDetails("GPTQ")
129
+ Unknown = ModelDetails("?")
130
+
131
+ def from_str(precision):
132
+ if precision in ["torch.float16", "float16"]:
133
+ return Precision.float16
134
+ if precision in ["8bit"]:
135
+ return Precision.Unknown  # the qt_8bit member is commented out above
136
+ if precision in ["4bit"]:
137
+ return Precision.Unknown  # the qt_4bit member is commented out above
138
+ if precision in ["GPTQ", "None"]:
139
+ return Precision.Unknown  # the qt_GPTQ member is commented out above
140
+ return Precision.Unknown
141
+
142
+
143
+
144
+
145
+ # Column selection
146
+ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
147
+ TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
148
+ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
149
+ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
150
+
151
+ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
152
+ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
153
+
154
+ BENCHMARK_COLS = [t.value.col_name for t in Tasks]
155
+
156
+ NUMERIC_INTERVALS = {
157
+ "Unknown": pd.Interval(-1, 0, closed="right"),
158
+ "0~3B": pd.Interval(0, 3, closed="right"),
159
+ "3~7B": pd.Interval(3, 7.3, closed="right"),
160
+ "7~13B": pd.Interval(7.3, 13, closed="right"),
161
+ "13~35B": pd.Interval(13, 35, closed="right"),
162
+ "35~60B": pd.Interval(35, 60, closed="right"),
163
+ "60B+": pd.Interval(60, 10000, closed="right"),
164
+ }
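
`NUMERIC_INTERVALS` backs the parameter-size filter. A small sketch of the membership test it supports, with illustrative sizes in billions of parameters:

```python
import pandas as pd

from src.display.utils import NUMERIC_INTERVALS

sizes = pd.Series([1.3, 7.0, 13.8, 70.0])  # #Params (B), made-up values
selected = [NUMERIC_INTERVALS["3~7B"], NUMERIC_INTERVALS["60B+"]]

# pd.Interval supports `in`, so each size can be tested against the chosen buckets
mask = sizes.apply(lambda s: any(s in interval for interval in selected))
print(mask.tolist())  # [False, True, False, True]
```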
src/envs.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from huggingface_hub import HfApi
4
+
5
+ # clone / pull the lmeh eval data
6
+ H4_TOKEN = os.environ.get("H4_TOKEN", None)
7
+
8
+ REPO_ID = "junkim100/self-improving-leaderboard"
9
+ QUEUE_REPO = "junkim100/SIL_requests"
10
+ RESULTS_REPO = "junkim100/SIL_results"
11
+
12
+ PRIVATE_QUEUE_REPO = "open-ko-llm-leaderboard/private-requests"
13
+ PRIVATE_RESULTS_REPO = "open-ko-llm-leaderboard/private-results"
14
+
15
+ IS_PUBLIC = os.environ.get("IS_PUBLIC", "True").lower() in ("1", "true", "yes")  # env vars are strings, so compare explicitly
16
+
17
+ CACHE_PATH=os.getenv("HF_HOME", ".")
18
+
19
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
20
+ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
21
+
22
+ EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
23
+ EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
24
+
25
+ PATH_TO_COLLECTION = "open-ko-llm-leaderboard/ko-llm-leaderboard-best-models-659c7e45a481ceea4c883506"
26
+
27
+ # Rate limit variables
28
+ RATE_LIMIT_PERIOD = 7
29
+ RATE_LIMIT_QUOTA = 5
30
+ HAS_HIGHER_RATE_LIMIT = []
31
+
32
+ API = HfApi(token=H4_TOKEN)
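
These constants point at the queue and results datasets; presumably the app mirrors them into the local cache paths before building any tables. A sketch of that sync step, assuming `H4_TOKEN` grants read access:

```python
from huggingface_hub import snapshot_download

from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, QUEUE_REPO, RESULTS_REPO

# Mirror the request queue and the results dataset into the local cache paths
snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=H4_TOKEN)
snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", token=H4_TOKEN)
```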
src/leaderboard/__pycache__/filter_models.cpython-310.pyc ADDED
Binary file (1.47 kB). View file
 
src/leaderboard/__pycache__/read_evals.cpython-310.pyc ADDED
Binary file (7.78 kB). View file
 
src/leaderboard/filter_models.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.display.formatting import model_hyperlink
2
+ from src.display.utils import AutoEvalColumn
3
+
4
+ # Models which have been flagged by users as being problematic for a reason or another
5
+ # (Model name to forum discussion link)
6
+ FLAGGED_MODELS = {
7
+ "merged": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
8
+ "TeamUNIVA/Komodo_7B_v0.1.0": "https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/44",
9
+ }
10
+
11
+ # Models which have been requested by orgs to not be submitted on the leaderboard
12
+ DO_NOT_SUBMIT_MODELS = [
13
+ ]
14
+
15
+
16
+ def flag_models(leaderboard_data: list[dict]):
17
+ for model_data in leaderboard_data:
18
+ # Merges are flagged automatically
19
+ if model_data[AutoEvalColumn.flagged.name]:
20
+ flag_key = "merged"
21
+ else:
22
+ flag_key = model_data["model_name_for_query"]
23
+
24
+ if flag_key in FLAGGED_MODELS:
25
+ issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
26
+ issue_link = model_hyperlink(
27
+ FLAGGED_MODELS[flag_key],
28
+ f"See discussion #{issue_num}",
29
+ )
30
+ model_data[
31
+ AutoEvalColumn.model.name
32
+ ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
33
+ model_data[AutoEvalColumn.flagged.name] = True
34
+ else:
35
+ model_data[AutoEvalColumn.flagged.name] = False
36
+
37
+
38
+ def remove_forbidden_models(leaderboard_data: list[dict]):
39
+ indices_to_remove = []
40
+ for ix, model in enumerate(leaderboard_data):
41
+ if model["model_name_for_query"] in DO_NOT_SUBMIT_MODELS:
42
+ indices_to_remove.append(ix)
43
+
44
+ for ix in reversed(indices_to_remove):
45
+ leaderboard_data.pop(ix)
46
+ return leaderboard_data
47
+
48
+
49
+ def filter_models(leaderboard_data: list[dict]):
50
+ leaderboard_data = remove_forbidden_models(leaderboard_data)
51
+ flag_models(leaderboard_data)
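
A toy example of how `filter_models` mutates leaderboard rows in place (the row below is made up):

```python
from src.display.utils import AutoEvalColumn
from src.leaderboard.filter_models import filter_models

rows = [{
    AutoEvalColumn.model.name: "org/model",  # hypothetical entry
    "model_name_for_query": "org/model",
    AutoEvalColumn.flagged.name: False,
}]

filter_models(rows)
# Stays False unless the model is a merge or appears in FLAGGED_MODELS
print(rows[0][AutoEvalColumn.flagged.name])
```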
src/leaderboard/read_evals.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import json
3
+ import math
4
+ import os
5
+ from dataclasses import dataclass
6
+
7
+ import dateutil
8
+ import numpy as np
9
+
10
+ from huggingface_hub import ModelCard
11
+
12
+ from src.display.formatting import make_clickable_model
13
+ from src.display.utils import AutoEvalColumn, ModelType, Tasks, Ranks, Precision, WeightType
14
+ from src.submission.check_validity import is_model_on_hub, check_model_card
15
+
16
+
17
+ @dataclass
18
+ class EvalResult:
19
+ # Also see src.display.utils.AutoEvalColumn for what will be displayed.
20
+ eval_name: str # org_model_precision (uid)
21
+ full_model: str # org/model (path on hub)
22
+ org: str
23
+ model: str
24
+ revision: str # commit hash, "" if main
25
+ results: dict
26
+ precision: Precision = Precision.Unknown
27
+ model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
28
+ weight_type: WeightType = WeightType.Original # Original or Adapter
29
+ architecture: str = "Unknown" # From config file
30
+ license: str = "?"
31
+ likes: int = 0
32
+ num_params: int = 0
33
+ date: str = "" # submission date of request file
34
+ still_on_hub: bool = False
35
+ is_merge: bool = False
36
+ flagged: bool = False
37
+
38
+ @classmethod
39
+ def init_from_json_file(self, json_filepath):
40
+ """Inits the result from the specific model result file"""
41
+ with open(json_filepath) as fp:
42
+ data = json.load(fp)
43
+
44
+ # We manage the legacy config format
45
+ config = data.get("config", data.get("config_general", None))
46
+
47
+ # Precision
48
+ precision = Precision.from_str(config.get("model_dtype"))
49
+
50
+ # Get model and org
51
+ org_and_model = config.get("model_name", config.get("model_args", None))
52
+ org_and_model = org_and_model.split("/", 1)
53
+
54
+ if len(org_and_model) == 1:
55
+ org = None
56
+ model = org_and_model[0]
57
+ result_key = f"{model}_{precision.value.name}"
58
+ else:
59
+ org = org_and_model[0]
60
+ model = org_and_model[1]
61
+ result_key = f"{org}_{model}_{precision.value.name}"
62
+ full_model = "/".join(org_and_model)
63
+
64
+ still_on_hub, error, model_config = is_model_on_hub(
65
+ full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
66
+ )
67
+ architecture = "?"
68
+ if model_config is not None:
69
+ architectures = getattr(model_config, "architectures", None)
70
+ if architectures:
71
+ architecture = ";".join(architectures)
72
+
73
+ # If the model doesn't have a model card or a license, we consider it deleted
74
+ if still_on_hub:
75
+ try:
76
+ if check_model_card(full_model)[0] is False:
77
+ still_on_hub = False
78
+ except Exception:
79
+ still_on_hub = False
80
+
81
+ # Check if the model is a merge
82
+ is_merge_from_metadata = False
83
+ flagged = False
84
+ if still_on_hub:
85
+ model_card = ModelCard.load(full_model)
86
+
87
+ if model_card.data.tags:
88
+ is_merge_from_metadata = "merge" in model_card.data.tags
89
+ merge_keywords = ["mergekit", "merged model", "merge model", "merging", "Carbon"]
90
+ # If the model is a merge but not saying it in the metadata, we flag it
91
+ is_merge_from_model_card = any(keyword in model_card.text.lower() for keyword in merge_keywords)
92
+ flagged = is_merge_from_model_card and not is_merge_from_metadata
93
+
94
+
95
+ # Extract results available in this file (some results are split in several files)
96
+ results = {}
97
+ for rank in Ranks:
98
+ rank = rank.value
99
+ if rank.benchmark in data["results"]:
100
+ results[rank.benchmark] = data["results"][rank.benchmark][rank.metric]
101
+ for task in Tasks:
102
+ task = task.value
103
+
104
+ # Some truthfulQA values are NaNs
105
+ if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
106
+ if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
107
+ results[task.benchmark] = 0.0
108
+ continue
109
+
110
+ # New tasks have been added, we need to skip them if not exists
111
+ if task.benchmark in ["winogrande", "gsm8k", "eq_bench", "inst_follow", "harmlessness", "helpfulness"]:
112
+ accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
113
+ if accs.size == 0 or any([acc is None for acc in accs]):
114
+ results[task.benchmark] = 0.0
115
+ continue
116
+
117
+ # We average all scores of a given metric (mostly for mmlu)
118
+ accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
119
+ if accs.size == 0 or any([acc is None for acc in accs]):
120
+ continue
121
+
122
+ mean_acc = np.mean(accs) * 100.0
123
+ results[task.benchmark] = mean_acc
124
+
125
+ return self(
126
+ eval_name=result_key,
127
+ full_model=full_model,
128
+ org=org,
129
+ model=model,
130
+ results=results,
131
+ precision=precision,
132
+ revision= config.get("model_sha", ""),
133
+ still_on_hub=still_on_hub,
134
+ architecture=architecture,
135
+ is_merge=is_merge_from_metadata,
136
+ flagged=flagged,
137
+ )
138
+
139
+ def update_with_request_file(self, requests_path):
140
+ """Finds the relevant request file for the current model and updates info with it"""
141
+ request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
142
+
143
+ try:
144
+ with open(request_file, "r") as f:
145
+ request = json.load(f)
146
+ self.model_type = ModelType.from_str(request.get("model_type", ""))
147
+ self.weight_type = WeightType[request.get("weight_type", "Original")]
148
+ self.license = request.get("license", "?")
149
+ self.likes = request.get("likes", 0)
150
+ self.num_params = request.get("params", 0)
151
+ self.date = request.get("submitted_time", "")
152
+ except Exception:
153
+ print(f"Could not find request file for {self.org}/{self.model}")
154
+
155
+ def to_dict(self):
156
+ """Converts the Eval Result to a dict compatible with our dataframe display"""
157
+
158
+ # Skip the new tasks for now
159
+ # TODO: safely remove this code when the task results are all added
160
+ skip_avg_len = 0
161
+ # if self.results['winogrande'] == 0.0:
162
+ # skip_avg_len += 1
163
+ # if self.results['gsm8k'] == 0.0:
164
+ # skip_avg_len += 1
165
+ # if self.results['eq_bench'] == 0.0:
166
+ # skip_avg_len += 1
167
+ # if self.results['inst_follow'] == 0.0:
168
+ # skip_avg_len += 1
169
+ # if self.results['harmlessness'] == 0.0:
170
+ # skip_avg_len += 1
171
+ # if self.results['helpfulness'] == 0.0:
172
+ # skip_avg_len += 1
173
+
174
+ average = sum([v for k, v in self.results.items() if v is not None and k not in ("daily", "quarterly")]) / (len(Tasks) - skip_avg_len)  # exclude rank columns, which are positions rather than scores
175
+
176
+ data_dict = {
177
+ "eval_name": self.eval_name, # not a column, just a save name,
178
+ AutoEvalColumn.precision.name: self.precision.value.name,
179
+ AutoEvalColumn.model_type.name: self.model_type.value.name,
180
+ AutoEvalColumn.merged.name: self.is_merge,
181
+ AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol, # + "🥦" if self.is_merge,
182
+ AutoEvalColumn.weight_type.name: self.weight_type.value.name,
183
+ AutoEvalColumn.architecture.name: self.architecture,
184
+ AutoEvalColumn.model.name: make_clickable_model(self.full_model),
185
+ AutoEvalColumn.dummy.name: self.full_model,
186
+ AutoEvalColumn.revision.name: self.revision,
187
+ AutoEvalColumn.average.name: average,
188
+ AutoEvalColumn.license.name: self.license,
189
+ AutoEvalColumn.likes.name: self.likes,
190
+ AutoEvalColumn.params.name: self.num_params,
191
+ AutoEvalColumn.still_on_hub.name: self.still_on_hub,
192
+ AutoEvalColumn.flagged.name: self.flagged
193
+ }
194
+
195
+ AllColumns = []
196
+ for task in Tasks:
197
+ AllColumns.append(task.value)
198
+ for rank in Ranks:
199
+ AllColumns.append(rank.value)
200
+
201
+ for a in AllColumns:
202
+ if a.benchmark in ["daily", "quarterly"]:
203
+ data_dict[a.col_name] = self.results[a.benchmark]
204
+ print(a.benchmark, self.results[a.benchmark], a.col_name)
205
+ else:
206
+ data_dict[a.col_name] = self.results[a.benchmark]
207
+
208
+ return data_dict
209
+
210
+
211
+ def get_request_file_for_model(requests_path, model_name, precision):
212
+ """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
213
+ request_files = os.path.join(
214
+ requests_path,
215
+ f"{model_name}_eval_request_*.json",
216
+ )
217
+ request_files = glob.glob(request_files)
218
+
219
+ # Select correct request file (precision)
220
+ request_file = ""
221
+ request_files = sorted(request_files, reverse=True)
222
+ for tmp_request_file in request_files:
223
+ with open(tmp_request_file, "r") as f:
224
+ req_content = json.load(f)
225
+ if (
226
+ req_content["status"] in ["FINISHED"]
227
+ and req_content["precision"] == precision.split(".")[-1]
228
+ ):
229
+ request_file = tmp_request_file
230
+ return request_file
231
+
232
+
233
+ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
234
+ """From the path of the results folder root, extract all needed info for results"""
235
+ model_result_filepaths = []
236
+
237
+ for root, _, files in os.walk(results_path):
238
+ # We should only have json files in model results
239
+ if len(files) == 0 or any([not f.endswith(".json") for f in files]):
240
+ continue
241
+
242
+ # Sort the files by date
243
+ try:
244
+ files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
245
+ except dateutil.parser._parser.ParserError:
246
+ files = [files[-1]]
247
+
248
+ for file in files:
249
+ model_result_filepaths.append(os.path.join(root, file))
250
+
251
+ eval_results = {}
252
+ for model_result_filepath in model_result_filepaths:
253
+ # Creation of result
254
+ eval_result = EvalResult.init_from_json_file(model_result_filepath)
255
+ eval_result.update_with_request_file(requests_path)
256
+
257
+ # Store results of same eval together
258
+ eval_name = eval_result.eval_name
259
+ if eval_name in eval_results.keys():
260
+ eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
261
+ else:
262
+ eval_results[eval_name] = eval_result
263
+
264
+ results = []
265
+ for v in eval_results.values():
266
+ try:
267
+ v.to_dict() # we test if the dict version is complete
268
+ results.append(v)
269
+ except KeyError: # not all eval values present
270
+ continue
271
+
272
+ return results
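
A sketch of exercising the parser on one of the result files bundled in this commit. It assumes the repo layout above, network access for the hub checks, and that the result JSON contains the rank and score keys the parser expects:

```python
from src.leaderboard.read_evals import EvalResult

result = EvalResult.init_from_json_file(
    "eval-results/HuggingFaceH4/zephyr-7b-beta/result.json"
)
result.update_with_request_file("eval-queue")  # prints a warning if no FINISHED request is found

row = result.to_dict()  # raises KeyError if expected benchmarks are missing
print(row["Model"], row["Average ⬆️"])
```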
src/populate.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+ import pandas as pd
5
+
6
+ from src.display.formatting import has_no_nan_values, make_clickable_model
7
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
+ from src.leaderboard.filter_models import filter_models
9
+ from src.leaderboard.read_evals import get_raw_eval_results
10
+
11
+
12
+ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
13
+ raw_data = get_raw_eval_results(results_path, requests_path)
14
+ all_data_json = [v.to_dict() for v in raw_data]
15
+ # all_data_json.append(baseline_row)
16
+ filter_models(all_data_json)
17
+
18
+ df = pd.DataFrame.from_records(all_data_json)
19
+ # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
20
+ df = df.sort_values(by=["Daily Rank"], ascending=True)
21
+
22
+ # print(df[AutoEvalColumn.average.name])
23
+ try:
24
+ df = df[cols].round(decimals=2)
25
+ except Exception:
26
+ pass
27
+
28
+ # filter out if any of the benchmarks have not been produced
29
+ try:
30
+ df = df[has_no_nan_values(df, benchmark_cols)]
31
+ except Exception:
32
+ pass
33
+ return raw_data, df
34
+
35
+
36
+ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
37
+ entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
38
+ all_evals = []
39
+
40
+ for entry in entries:
41
+ if ".json" in entry:
42
+ file_path = os.path.join(save_path, entry)
43
+ with open(file_path) as fp:
44
+ data = json.load(fp)
45
+
46
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
47
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
48
+
49
+ all_evals.append(data)
50
+ elif ".md" not in entry:
51
+ # this is a folder
52
+ sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
53
+ for sub_entry in sub_entries:
54
+ file_path = os.path.join(save_path, entry, sub_entry)
55
+ with open(file_path) as fp:
56
+ data = json.load(fp)
57
+
58
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
59
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
60
+ all_evals.append(data)
61
+
62
+ pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
63
+ running_list = [e for e in all_evals if e["status"] == "RUNNING"]
64
+ finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
65
+ failed_list = [e for e in all_evals if e["status"] == "FAILED"]
66
+ df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
67
+ df_running = pd.DataFrame.from_records(running_list, columns=cols)
68
+ df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
69
+ df_failed = pd.DataFrame.from_records(failed_list, columns=cols)
70
+ return df_finished[cols], df_running[cols], df_pending[cols], df_failed[cols]
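
And, roughly, how the app presumably ties these helpers together (a sketch under the same assumptions as above, not the actual `app.py` entry point):

```python
from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS
from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
from src.populate import get_evaluation_queue_df, get_leaderboard_df

raw_data, leaderboard_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
finished_df, running_df, pending_df, failed_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
print(leaderboard_df.head())
```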
src/submission/__pycache__/check_validity.cpython-310.pyc ADDED
Binary file (4.64 kB). View file
 
src/submission/__pycache__/submit.cpython-310.pyc ADDED
Binary file (3.57 kB). View file