Spaces:

sparse-generative-ai
/

open-moe-llm-leaderboard

Running

App Files Community

Add GSM8k dataset

#29

by AppleSwing - opened May 4

base: refs/heads/main

←

from: refs/pr/29

Discussion Files changed

+156

-2590

Files changed (26) hide show

LICENSE +0 -201
README.md +1 -1
app.py +17 -20
backend-cli.py +42 -71
requirements.txt +3 -6
src/backend/envs.py +0 -3
src/backend/hflm_with_measurement.py +14 -163
src/backend/moe_infinity.py +4 -9
src/backend/run_eval_suite.py +0 -8
src/backend/tasks/arena_hard/__init__.py +0 -0
src/backend/tasks/arena_hard/arena_hard.yaml +0 -2
src/backend/tasks/arena_hard/arena_judgment.py +0 -256
src/backend/tasks/arena_hard/arena_utils.py +0 -349
src/backend/tasks/arena_hard/configs/api_config.yaml +0 -17
src/backend/tasks/arena_hard/configs/judge_config.yaml +0 -26
src/backend/tasks/arena_hard/model_answer/gpt-4-0314.jsonl +0 -0
src/backend/tasks/arena_hard/question.jsonl +0 -0
src/backend/tasks/arena_hard/task.py +0 -220
src/backend/tasks/gsm8k/gsm8k-custom.yaml +0 -47
src/backend/tasks/measurement_task_utils.py +0 -9
src/backend/tasks/selfcheckgpt/task.py +2 -2
src/display/about.py +1 -4
src/display/utils.py +42 -57
src/leaderboard/read_evals.py +24 -36
src/submission/check_validity.py +2 -3
src/utils.py +4 -80

LICENSE DELETED Viewed

@@ -1,201 +0,0 @@
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-   1. Definitions.
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-   END OF TERMS AND CONDITIONS
-   APPENDIX: How to apply the Apache License to your work.
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-   Copyright [yyyy] [name of copyright owner]
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-       http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 🔥
 colorFrom: green
 colorTo: indigo
 sdk: gradio
-sdk_version: 4.26.0
 app_file: app.py
 pinned: true
 license: apache-2.0

 colorFrom: green
 colorTo: indigo
 sdk: gradio
+sdk_version: 4.9.0
 app_file: app.py
 pinned: true
 license: apache-2.0

app.py CHANGED Viewed

@@ -11,7 +11,6 @@ import time
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
-from pytz import utc
 from src.display.about import (
     CITATION_BUTTON_LABEL,
@@ -76,7 +75,7 @@ def restart_space():
 def init_space():
-    # dataset_df = get_dataset_summary_table(file_path="blog/Hallucination-Leaderboard-Summary.csv")
     if socket.gethostname() not in {"neuromancer"}:
         # sync model_type with open-llm-leaderboard
@@ -91,8 +90,7 @@ def init_space():
     finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
         EVAL_REQUESTS_PATH, EVAL_COLS
     )
-    # return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
-    return None, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
 def add_benchmark_columns(shown_columns):
@@ -160,7 +158,6 @@ def filter_models(df: pd.DataFrame, type_query: list, size_query: list, precisio
     type_emoji = [t[0] for t in type_query]
     filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
     filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.inference_framework.name].isin(size_query)]
     # numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
     # params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
@@ -259,7 +256,7 @@ with demo:
                                 for c in fields(AutoEvalColumn)
                                 if c.displayed_by_default and not c.hidden and not c.never_hidden
                             ],
-                            label="Tasks",
                             elem_id="column-select",
                             interactive=True,
                         )
@@ -356,21 +353,21 @@ with demo:
                     queue=True,
                 )
-        # with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
-        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-        #     dataset_table = gr.components.Dataframe(
-        #         value=dataset_df,
-        #         headers=list(dataset_df.columns),
-        #         datatype=["str", "markdown", "str", "str", "str"],
-        #         elem_id="dataset-table",
-        #         interactive=False,
-        #         visible=True,
-        #         column_widths=["15%", "20%"],
-        #     )
-        #     gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
-        #     gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
         with gr.TabItem("Submit a model ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Column():
@@ -479,7 +476,7 @@ with demo:
                 show_copy_button=True,
             )
-scheduler = BackgroundScheduler(timezone=utc)
 scheduler.add_job(restart_space, "interval", hours=6)

 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 from src.display.about import (
     CITATION_BUTTON_LABEL,
 def init_space():
+    dataset_df = get_dataset_summary_table(file_path="blog/Hallucination-Leaderboard-Summary.csv")
     if socket.gethostname() not in {"neuromancer"}:
         # sync model_type with open-llm-leaderboard
     finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
         EVAL_REQUESTS_PATH, EVAL_COLS
     )
+    return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
 def add_benchmark_columns(shown_columns):
     type_emoji = [t[0] for t in type_query]
     filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
     filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
     # numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
     # params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
                                 for c in fields(AutoEvalColumn)
                                 if c.displayed_by_default and not c.hidden and not c.never_hidden
                             ],
+                            label="Select columns to show",
                             elem_id="column-select",
                             interactive=True,
                         )
                     queue=True,
                 )
+        with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+            dataset_table = gr.components.Dataframe(
+                value=dataset_df,
+                headers=list(dataset_df.columns),
+                datatype=["str", "markdown", "str", "str", "str"],
+                elem_id="dataset-table",
+                interactive=False,
+                visible=True,
+                column_widths=["15%", "20%"],
+            )
+            gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
+            gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
         with gr.TabItem("Submit a model ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Column():
                 show_copy_button=True,
             )
+scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", hours=6)

backend-cli.py CHANGED Viewed

@@ -17,7 +17,7 @@ from src.backend.manage_requests import EvalRequest
 from src.leaderboard.read_evals import EvalResult
 from src.envs import QUEUE_REPO, RESULTS_REPO, API, DEBUG_QUEUE_REPO, DEBUG_RESULTS_REPO
-from src.utils import my_snapshot_download, analyze_gpu_stats, parse_nvidia_smi, monitor_gpus, get_gpu_details
 from src.leaderboard.read_evals import get_raw_eval_results
@@ -28,8 +28,6 @@ import time
 import pprint
 import logging
-from lm_eval.filters.extraction import RegexFilter
 # Configure the root logger
 logging.basicConfig(
@@ -44,20 +42,6 @@ eval_logger = logging.getLogger("lm-eval")
 # Explicitly set the level for 'lm-eval' logger to WARNING
 eval_logger.setLevel(logging.WARNING)
-def tuple_input_decorator(func):
-    def wrapper(self, resps, docs):
-        stripped_resps = [[resp_data[0] for resp_data in group] for group in resps]
-        filtered_resps = func(self, stripped_resps, docs)
-        combined_resps = []
-        for original_group, new_group in zip(resps, filtered_resps):
-            combined_group = [(new_resp,) + rest_of_data[1:] for new_resp, rest_of_data in zip(new_group, original_group)]
-            combined_resps.append(combined_group)
-        return combined_resps
-    return wrapper
 def my_set_eval_request(api, eval_request, set_to_status, hf_repo, local_dir):
     for i in range(10):
@@ -142,6 +126,9 @@ def request_to_result_name(request: EvalRequest) -> str:
 def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[int] = None) -> dict:
     batch_size = 1
     batch_size = eval_request.batch_size
     init_gpu_info = analyze_gpu_stats(parse_nvidia_smi())
     # if init_gpu_info['Mem(M)'] > 500:
@@ -150,12 +137,6 @@ def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[in
     stop_event = threading.Event()
     monitor_thread = threading.Thread(target=monitor_gpus, args=(stop_event, 5, gpu_stats_list))
     monitor_thread.start()
-    original_apply = RegexFilter.apply
-    if task.benchmark in ["gsm8k", "gsm8k_cot", "gsm8k_cot_self_consistency", "gsm8k_custom"]:
-        RegexFilter.apply = tuple_input_decorator(RegexFilter.apply)
-    else:
-        RegexFilter.apply = original_apply
     try:
         results = run_evaluation(
@@ -217,8 +198,6 @@ def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[in
         repo_id=RESULTS_REPO,
         repo_type="dataset",
     )
-    RegexFilter.apply = original_apply
     return results
@@ -387,7 +366,21 @@ def maybe_refresh_results(thr: int, hard_task_lst: Optional[list[str]] = None) -
     return False
 def process_pending_requests() -> bool:
     sanity_checks()
     print("Processing pending requests")
     current_pending_status = [PENDING_STATUS]
@@ -450,15 +443,13 @@ def get_args():
     parser = argparse.ArgumentParser(description="Run the backend")
     parser.add_argument("--debug", action="store_true", help="Run in debug mode")
     # debug parameters
-    parser.add_argument("--task", type=str, default="selfcheckgpt,mmlu, gsm8k", help="Task to debug")
     parser.add_argument("--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1,mistralai/Mixtral-8x7B-v0.1", help="Model to debug")
     parser.add_argument("--precision", type=str, default="float32,float16,8bit,4bit", help="Precision to debug")
     parser.add_argument("--inference-framework", type=str, default="hf-chat", help="Inference framework to debug")
     parser.add_argument("--limit", type=int, default=None, help="Limit for the number of samples")
     parser.add_argument("--gpu-type", type=str, default="NVIDIA-A100-PCIe-80GB",
                         help="GPU type. NVIDIA-A100-PCIe-80GB; NVIDIA-RTX-A5000-24GB; NVIDIA-H100-PCIe-80GB")
-    parser.add_argument("--debug_repo", action="store_true", help="Use debug repo")
-    parser.add_argument("--model_type", type=str, default="chat", help="Model type")
     return parser.parse_args()
@@ -466,7 +457,7 @@ if __name__ == "__main__":
     args = get_args()
     local_debug = args.debug
     # debug specific task by ping
-    if local_debug and not args.debug_repo:
         # debug_model_names = [args.model]  # Use model from arguments
         # debug_task_name = [args.task]  # Use task from arguments
         debug_model_names = args.model.split(",")
@@ -474,68 +465,48 @@ if __name__ == "__main__":
         precisions = args.precision.split(",")
         print(f"debug_model_names: {debug_model_names}, debug_task_name: {debug_task_name}, precisions: {precisions}")
         task_lst = TASKS_HARNESS.copy()
-        RESULTS_REPO = DEBUG_RESULTS_REPO
         for precision in precisions:
             for debug_model_name in debug_model_names:
                 for task in task_lst:
                     task_name = task.benchmark
                     if task_name not in debug_task_name:
                         continue
-                    # try:
-                    eval_request = EvalRequest(
-                        model=debug_model_name,
-                        private=False,
-                        status="",
-                        json_filepath="",
-                        precision=precision,  # Use precision from arguments
-                        inference_framework=args.inference_framework,  # Use inference framework from arguments
-                        gpu_type=args.gpu_type,
-                        model_type=args.model_type,
-                    )
-                    curr_gpu_type = get_gpu_details()
-                    if eval_request.gpu_type != curr_gpu_type:
-                        print(f"GPU type mismatch: {eval_request.gpu_type} vs {curr_gpu_type}")
-                        raise Exception("GPU type mismatch")
-                    results = process_evaluation(task, eval_request, limit=args.limit)
-                    # except Exception as e:
-                    #     print(f"debug running error: {e}")
-    elif local_debug and args.debug_repo:
-        QUEUE_REPO = DEBUG_QUEUE_REPO
-        RESULTS_REPO = DEBUG_RESULTS_REPO
         while True:
             res = False
             # if random.randint(0, 10) == 0:
             res = process_pending_requests()
             print(f"waiting for 60 seconds")
             time.sleep(60)
             # if res is False:
             #     if random.randint(0, 5) == 0:
             #         res = maybe_refresh_results(100)
             #     else:
             #         res = process_finished_requests(100)
             # time.sleep(60)
             # if res is False:
             #     if random.randint(0, 5) == 0:
             #         res = maybe_refresh_results(0)
             #     else:
             #         res = process_finished_requests(0)
-    elif not local_debug and not args.debug_repo:
-        while True:
-           res = False
-           # if random.randint(0, 10) == 0:
-           res = process_pending_requests()
-           print(f"waiting for 60 seconds")
-           time.sleep(60)
-           # if res is False:
-           #     if random.randint(0, 5) == 0:
-           #         res = maybe_refresh_results(100)
-           #     else:
-           #         res = process_finished_requests(100)
-           # time.sleep(60)
-           # if res is False:
-           #     if random.randint(0, 5) == 0:
-           #         res = maybe_refresh_results(0)
-           #     else:
-           #         res = process_finished_requests(0)
-    else:
-        raise Exception("Cannot use debug_repo without local debug flag")

 from src.leaderboard.read_evals import EvalResult
 from src.envs import QUEUE_REPO, RESULTS_REPO, API, DEBUG_QUEUE_REPO, DEBUG_RESULTS_REPO
+from src.utils import my_snapshot_download, analyze_gpu_stats, parse_nvidia_smi, monitor_gpus
 from src.leaderboard.read_evals import get_raw_eval_results
 import pprint
 import logging
 # Configure the root logger
 logging.basicConfig(
 # Explicitly set the level for 'lm-eval' logger to WARNING
 eval_logger.setLevel(logging.WARNING)
 def my_set_eval_request(api, eval_request, set_to_status, hf_repo, local_dir):
     for i in range(10):
 def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[int] = None) -> dict:
     batch_size = 1
     batch_size = eval_request.batch_size
+    if args.debug:
+        RESULTS_REPO = DEBUG_RESULTS_REPO
     init_gpu_info = analyze_gpu_stats(parse_nvidia_smi())
     # if init_gpu_info['Mem(M)'] > 500:
     stop_event = threading.Event()
     monitor_thread = threading.Thread(target=monitor_gpus, args=(stop_event, 5, gpu_stats_list))
     monitor_thread.start()
     try:
         results = run_evaluation(
         repo_id=RESULTS_REPO,
         repo_type="dataset",
     )
     return results
     return False
+def get_gpu_details():
+    gpus = GPUtil.getGPUs()
+    gpu = gpus[0]
+    name = gpu.name.replace(" ", "-")
+    # Convert memory from MB to GB and round to nearest whole number
+    memory_gb = round(gpu.memoryTotal / 1024)
+    memory = f"{memory_gb}GB"
+    formatted_name = f"{name}-{memory}"
+    return formatted_name
 def process_pending_requests() -> bool:
+    if args.debug:
+        QUEUE_REPO = DEBUG_QUEUE_REPO
     sanity_checks()
     print("Processing pending requests")
     current_pending_status = [PENDING_STATUS]
     parser = argparse.ArgumentParser(description="Run the backend")
     parser.add_argument("--debug", action="store_true", help="Run in debug mode")
     # debug parameters
+    parser.add_argument("--task", type=str, default="selfcheckgpt,mmlu", help="Task to debug")
     parser.add_argument("--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1,mistralai/Mixtral-8x7B-v0.1", help="Model to debug")
     parser.add_argument("--precision", type=str, default="float32,float16,8bit,4bit", help="Precision to debug")
     parser.add_argument("--inference-framework", type=str, default="hf-chat", help="Inference framework to debug")
     parser.add_argument("--limit", type=int, default=None, help="Limit for the number of samples")
     parser.add_argument("--gpu-type", type=str, default="NVIDIA-A100-PCIe-80GB",
                         help="GPU type. NVIDIA-A100-PCIe-80GB; NVIDIA-RTX-A5000-24GB; NVIDIA-H100-PCIe-80GB")
     return parser.parse_args()
     args = get_args()
     local_debug = args.debug
     # debug specific task by ping
+    if local_debug:
         # debug_model_names = [args.model]  # Use model from arguments
         # debug_task_name = [args.task]  # Use task from arguments
         debug_model_names = args.model.split(",")
         precisions = args.precision.split(",")
         print(f"debug_model_names: {debug_model_names}, debug_task_name: {debug_task_name}, precisions: {precisions}")
         task_lst = TASKS_HARNESS.copy()
         for precision in precisions:
             for debug_model_name in debug_model_names:
                 for task in task_lst:
                     task_name = task.benchmark
                     if task_name not in debug_task_name:
                         continue
+                    try:
+                        eval_request = EvalRequest(
+                            model=debug_model_name,
+                            private=False,
+                            status="",
+                            json_filepath="",
+                            precision=precision,  # Use precision from arguments
+                            inference_framework=args.inference_framework,  # Use inference framework from arguments
+                            gpu_type=args.gpu_type
+                        )
+                        curr_gpu_type = get_gpu_details()
+                        if eval_request.gpu_type != curr_gpu_type:
+                            print(f"GPU type mismatch: {eval_request.gpu_type} vs {curr_gpu_type}")
+                            raise Exception("GPU type mismatch")
+                        results = process_evaluation(task, eval_request, limit=args.limit)
+                    except Exception as e:
+                        print(f"debug running error: {e}")
+    else:
         while True:
             res = False
             # if random.randint(0, 10) == 0:
             res = process_pending_requests()
             print(f"waiting for 60 seconds")
             time.sleep(60)
             # if res is False:
             #     if random.randint(0, 5) == 0:
             #         res = maybe_refresh_results(100)
             #     else:
             #         res = process_finished_requests(100)
             # time.sleep(60)
             # if res is False:
             #     if random.randint(0, 5) == 0:
             #         res = maybe_refresh_results(0)
             #     else:
             #         res = process_finished_requests(0)

requirements.txt CHANGED Viewed

@@ -4,7 +4,7 @@ APScheduler
 black
 click
 datasets
-gradio==4.26.0
 gradio_client
 huggingface-hub
 matplotlib
@@ -16,7 +16,7 @@ requests
 semantic-version
 tqdm
 wandb
-transformers
 tokenizers>=0.15.0
 lm_eval[ifeval] @ git+https://github.com/EleutherAI/[email protected]
 accelerate
@@ -30,7 +30,4 @@ evaluate
 spacy==3.7.4
 selfcheckgpt
 immutabledict
-gputil
-bitsandbytes
-openai
-scikit-learn

 black
 click
 datasets
+gradio
 gradio_client
 huggingface-hub
 matplotlib
 semantic-version
 tqdm
 wandb
+transformers>=4.36.0
 tokenizers>=0.15.0
 lm_eval[ifeval] @ git+https://github.com/EleutherAI/[email protected]
 accelerate
 spacy==3.7.4
 selfcheckgpt
 immutabledict
+gputil

src/backend/envs.py CHANGED Viewed

@@ -57,9 +57,6 @@ class Tasks(Enum):
     # task20 = Task("race", "acc", "RACE", 0)
     task21 = Task("mmlu", "acc", "MMLU", 5)
-    task22 = Task("gsm8k_custom", "em", "GSM8K", 5)
-    # task23 = Task("gsm8k_cot", "em", "GSM8K", 8)
-    task24 = Task("arena_hard", "score", "Arena Hard", 0)
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")

     # task20 = Task("race", "acc", "RACE", 0)
     task21 = Task("mmlu", "acc", "MMLU", 5)
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")

src/backend/hflm_with_measurement.py CHANGED Viewed

@@ -24,7 +24,7 @@ from transformers.models.auto.modeling_auto import (
     MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
 )
 from transformers import TextStreamer
-from transformers.models.dbrx.modeling_dbrx import DbrxExpertGLU
 from lm_eval import utils
 from lm_eval.api.instance import Instance
 from lm_eval.api.model import TemplateLM
@@ -37,9 +37,6 @@ from lm_eval.models.utils import (
     stop_sequences_criteria,
 )
 from lm_eval.models.huggingface import HFLM
-from src.utils import get_gpu_details, get_peak_bw, transfer_precision2bytes, get_peak_flops
-from src.submission.check_validity import get_model_size
-from src.envs import API
 class StopWatch(TextStreamer):
@@ -70,21 +67,6 @@ class StopWatch(TextStreamer):
 class HFLMWithMeasurement(HFLM):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        self.pretrained = kwargs.get("pretrained", None)
-        self.revision = kwargs.get("revision", None)
-        self.precision = kwargs.get("dtype", None)
-        self.num_gpus = None
-    def _detect_num_gpus_used(self):
-        if self.num_gpus is not None:
-            return self.num_gpus
-        gpus = []
-        for p in self.model.parameters():
-            if p.device.type == "cuda":
-                gpus.append(p.device.index)
-        self.num_gpus = len(set(gpus))
-        return self.num_gpus
     def _loglikelihood_tokens(
         self,
@@ -297,7 +279,7 @@ class HFLMWithMeasurement(HFLM):
                     # Answer: (log prob, is-exact-match)
                     answer = (float(logits.sum()), bool(max_equal))
-                    res.append((answer, per_sample_time, 0, 0, 0, 0))
                     self.cache_hook.add_partial("loglikelihood", request_str, answer)
                     pbar.update(1)
@@ -306,15 +288,13 @@ class HFLMWithMeasurement(HFLM):
         return re_ord.get_original(res)
-    def _model_generate(self, context, max_tokens, stop, **generation_kwargs):
         # temperature = 0.0 if not set
         # if do_sample is false and temp==0.0:
         # remove temperature, as do_sample=False takes care of this
         # and we don't want a warning from HF
         generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
         do_sample = generation_kwargs.get("do_sample", None)
-        # is_gsm8k = generation_kwargs.get("is_gsm8k", False)
         # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
         if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
@@ -322,52 +302,7 @@ class HFLMWithMeasurement(HFLM):
         if do_sample is False and generation_kwargs.get("temperature") == 0.0:
             generation_kwargs.pop("temperature")
-        # if is_gsm8k:
-        #     generation_kwargs.pop("is_gsm8k")
-        context_length = context.shape[1]
-        if self.model.__class__.__name__ == "MoE":
-            model_config = self.model.model.config
-        else:
-            model_config = self.model.config
-        if not self.precision:
-            if model_config.quantization_config._load_in_4bit:
-                self.precision = "4bit"
-            elif model_config.quantization_config._load_in_8bit:
-                self.precision = "8bit"
-            else:
-                raise ValueError("Unknown precision")
-        # print(self.model)
-        linear_count = 0
-        element_wise_mul = 0
-        for name, module in self.model.named_modules():
-            if ('layers.0.' in name or "transformer.blocks.0" in name) and ('attn' not in name):
-                if 'experts.0.' in name or "ffn.experts" in name:
-                    if "linear_v" in name:
-                        element_wise_mul = 1
-                    if isinstance(module, torch.nn.Linear):
-                        # print(name, module)
-                        linear_count += 1
-                    elif isinstance(module, DbrxExpertGLU):
-                        linear_count = 3
-                        element_wise_mul = 1
-                # elif 'experts' not in name:
-                #     if ("gate" not in name and "router" not in name) or "gate_proj" in name:
-                #         if "gate_proj" in name:
-                #             element_wise_mul = 1
-                #         if isinstance(module, torch.nn.Linear):
-                #             # print(name, module)
-                #             linear_count += 1
-                else:
-                    continue
-        print(f"linear_count: {linear_count}")
-        print(f"element_wise_mul: {element_wise_mul}")
-        print(f"GPU usage: {self._detect_num_gpus_used()}")
         stopping_criteria = stop_sequences_criteria(
             self.tokenizer, stop, context.shape[1], context.shape[0]
         )
@@ -375,7 +310,7 @@ class HFLMWithMeasurement(HFLM):
         start = time()
         res = self.model.generate(
             input_ids=context,
-            max_new_tokens=max_tokens,
             stopping_criteria=stopping_criteria,
             pad_token_id=self.tokenizer.pad_token_id,
             use_cache=True,
@@ -386,83 +321,12 @@ class HFLMWithMeasurement(HFLM):
         batch_size = context.shape[0]
         output_length = stop_watch.decoding_iterations
-        precision_bytes = transfer_precision2bytes(self.precision)
-        model_size_param = sum(p.numel() for p in self.model.parameters())
-        n_layers = model_config.num_hidden_layers if hasattr(model_config, "num_hidden_layers") else \
-            (model_config.num_layers if hasattr(model_config, "num_layers") else model_config.n_layers)
-        d_model = model_config.hidden_size if hasattr(model_config, "hidden_size") else model_config.d_model
-        if hasattr(model_config, "num_experts_per_tok"):
-            n_experts_per_tok = model_config.num_experts_per_tok
-        elif hasattr(model_config, "num_selected_experts"):
-            n_experts_per_tok = model_config.num_selected_experts
-        elif hasattr(model_config, "ffn_config"):
-            n_experts_per_tok = model_config.ffn_config.moe_top_k
-        else:
-            n_experts_per_tok = 1
-        if hasattr(model_config, "ffn_dim"):
-            d_ff = model_config.ffn_dim
-        elif hasattr(model_config, "intermediate_size"):
-            d_ff = model_config.intermediate_size
-        elif hasattr(model_config, "d_ff"):
-            d_ff = model_config.d_ff
-        elif hasattr(model_config, "ff_ratio"):
-            d_ff = d_model * model_config.ff_ratio
-        elif hasattr(model_config, "ffn_config"):
-            d_ff = model_config.ffn_config.ffn_hidden_size
-        else:
-            raise ValueError("Unknown FFN dimension")
-        if hasattr(model_config, "num_local_experts"):
-            num_experts = model_config.num_local_experts
-        elif hasattr(model_config, "num_experts"):
-            num_experts = model_config.num_experts
-        elif hasattr(model_config, "ffn_config"):
-            num_experts = model_config.ffn_config.moe_num_experts
-        else:
-            num_experts = 1
-        ffn_params = n_layers * d_ff * linear_count * d_model
-        shared_params = model_size_param - num_experts * ffn_params
-        model_size = shared_params + n_experts_per_tok * ffn_params
-        per_token_kv_size = 2 * n_layers * d_model * precision_bytes
-        peak_bw_single = get_peak_bw(get_gpu_details())
-        peak_bw = peak_bw_single * self._detect_num_gpus_used()
-        context_prefill_size = context_length
-        kv_size = context_prefill_size * per_token_kv_size + (output_length - 1) * per_token_kv_size / 2
-        kv_size = kv_size / 1e9
-        n_vocab = model_config.vocab_size
         end_to_end_time = (end - start) / batch_size
         prefilling_time = stop_watch.prefilling_time / batch_size
         decoding_time = stop_watch.decoding_time / batch_size
         token_per_sec = output_length / decoding_time
-        achieve_mem_bw = (model_size * precision_bytes / 1e9 + kv_size) * token_per_sec
-        avg_context_length = context_length + (output_length - 1) / 2
-        flops_per_token = 2 * model_size + ((linear_count + element_wise_mul) * n_layers * avg_context_length * d_model) + 4 * d_model + 2 * d_model * n_vocab
-        peak_flops_single = get_peak_flops(get_gpu_details(), self.precision)
-        peak_flops = peak_flops_single * self._detect_num_gpus_used()
-        ## TODO only support llama-type decoder only models and moe models of switch transformer and mixtrial
-        mfu = token_per_sec * flops_per_token / peak_flops
-        mbu = achieve_mem_bw / peak_bw
-        print(f"mfu: {mfu}, mbu: {mbu}")
-        return res, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu
     def generate_until(
         self, requests: List[Instance], disable_tqdm: bool = False
@@ -539,19 +403,11 @@ class HFLMWithMeasurement(HFLM):
                     f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
                 )
             # add EOS token to stop sequences
-            eos = "<|eot_id|>"
             if not until:
                 until = [eos]
             else:
                 until.append(eos)
-            # is_gsm8k = kwargs.get("is_gsm8k", False)
-            # if is_gsm8k:
-            #     until = ["Question:", "Question", "</s>"]
-            #     eos_ids = [self.tokenizer.eos_token_id,
-            #              self.tokenizer.convert_tokens_to_ids("<|eot_id|>")]
             if "max_gen_toks" in kwargs.keys():
                 max_gen_toks = kwargs.pop("max_gen_toks")
             else:
@@ -571,16 +427,14 @@ class HFLMWithMeasurement(HFLM):
                 left_truncate_len=max_ctx_len,
                 truncation=self.truncation,
             )
-            # print("context: ", self.tok_decode(context_enc[0]))
             context_enc = context_enc.to(self.device)
             attn_masks = attn_masks.to(self.device)
-            if "max_tokens" not in kwargs:
-                kwargs["max_tokens"] = max_gen_toks
             # perform batched generation
-            cont, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu = self._model_generate(
                 context=context_enc,
                 attention_mask=attn_masks,
                 stop=until,
@@ -591,21 +445,18 @@ class HFLMWithMeasurement(HFLM):
             for cont_toks, context in zip(cont_toks_list, contexts):
                 # discard context + left-padding toks if using causal decoder-only LM
                 if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
-                    # print("After Generation: ", self.tok_decode(cont_toks))
                     cont_toks = cont_toks[context_enc.shape[1] :]
                 s = self.tok_decode(cont_toks)
-                # # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
-                # if not is_gsm8k:
                 for term in until:
                     if len(term) > 0:
                         # ignore '' separator,
                         # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
                         s = s.split(term)[0]
-                # print(s)
-                res.append((s, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu))
                 self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
                 pbar.update(1)

     MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
 )
 from transformers import TextStreamer
 from lm_eval import utils
 from lm_eval.api.instance import Instance
 from lm_eval.api.model import TemplateLM
     stop_sequences_criteria,
 )
 from lm_eval.models.huggingface import HFLM
 class StopWatch(TextStreamer):
 class HFLMWithMeasurement(HFLM):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
     def _loglikelihood_tokens(
         self,
                     # Answer: (log prob, is-exact-match)
                     answer = (float(logits.sum()), bool(max_equal))
+                    res.append((answer, per_sample_time, 0, 0))
                     self.cache_hook.add_partial("loglikelihood", request_str, answer)
                     pbar.update(1)
         return re_ord.get_original(res)
+    def _model_generate(self, context, max_length, stop, **generation_kwargs):
         # temperature = 0.0 if not set
         # if do_sample is false and temp==0.0:
         # remove temperature, as do_sample=False takes care of this
         # and we don't want a warning from HF
         generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
         do_sample = generation_kwargs.get("do_sample", None)
         # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
         if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
         if do_sample is False and generation_kwargs.get("temperature") == 0.0:
             generation_kwargs.pop("temperature")
+        # build stopping criteria
         stopping_criteria = stop_sequences_criteria(
             self.tokenizer, stop, context.shape[1], context.shape[0]
         )
         start = time()
         res = self.model.generate(
             input_ids=context,
+            max_length=max_length,
             stopping_criteria=stopping_criteria,
             pad_token_id=self.tokenizer.pad_token_id,
             use_cache=True,
         batch_size = context.shape[0]
         output_length = stop_watch.decoding_iterations
         end_to_end_time = (end - start) / batch_size
         prefilling_time = stop_watch.prefilling_time / batch_size
         decoding_time = stop_watch.decoding_time / batch_size
         token_per_sec = output_length / decoding_time
+        return res, end_to_end_time, prefilling_time, token_per_sec
     def generate_until(
         self, requests: List[Instance], disable_tqdm: bool = False
                     f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
                 )
             # add EOS token to stop sequences
+            eos = self.tok_decode(self.eot_token_id)
             if not until:
                 until = [eos]
             else:
                 until.append(eos)
             if "max_gen_toks" in kwargs.keys():
                 max_gen_toks = kwargs.pop("max_gen_toks")
             else:
                 left_truncate_len=max_ctx_len,
                 truncation=self.truncation,
             )
             context_enc = context_enc.to(self.device)
             attn_masks = attn_masks.to(self.device)
+            if "max_length" not in kwargs:
+                kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
             # perform batched generation
+            cont, end_to_end_time, prefilling_time, token_per_sec = self._model_generate(
                 context=context_enc,
                 attention_mask=attn_masks,
                 stop=until,
             for cont_toks, context in zip(cont_toks_list, contexts):
                 # discard context + left-padding toks if using causal decoder-only LM
                 if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
                     cont_toks = cont_toks[context_enc.shape[1] :]
                 s = self.tok_decode(cont_toks)
+                # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
                 for term in until:
                     if len(term) > 0:
                         # ignore '' separator,
                         # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
                         s = s.split(term)[0]
+                res.append((s, end_to_end_time, prefilling_time, token_per_sec))
                 self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
                 pbar.update(1)

src/backend/moe_infinity.py CHANGED Viewed

@@ -31,20 +31,15 @@ class MoEHFLM(HFLMWithMeasurement):
         self.use_chat_template = use_chat_template
         if "device" in kwargs:
             kwargs.pop("device")
-        if os.path.exists(os.path.join(self.offload_path, "moe-infinity-offloads")):
-            shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads"))
-        kwargs["device_map"] = "cuda:0"
         super().__init__(
-            *args, **kwargs, pretrained=pretrained
         )  # Assuming HFLM accepts a 'pretrained' arg and handles it
         # self._create_model()
     def __del__(self):
-        self._model.engine.clean_up() # clean up hooks
-        self._model.engine.archer_engine.clean_up_resources() # clean up resources
-        if os.path.exists(os.path.join(self.offload_path, "moe-infinity-offloads")):
-            shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads")) # clean up offload model
     def _create_model(self, *args, **kwargs):
         """

         self.use_chat_template = use_chat_template
         if "device" in kwargs:
             kwargs.pop("device")
         super().__init__(
+            *args, **kwargs, pretrained=pretrained, device_map="cuda:0"
         )  # Assuming HFLM accepts a 'pretrained' arg and handles it
         # self._create_model()
+        shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads"))
     def __del__(self):
+        # Clean up offloaded models from self.offload_path
+        shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads"))
     def _create_model(self, *args, **kwargs):
         """

src/backend/run_eval_suite.py CHANGED Viewed

@@ -17,16 +17,12 @@ def process_results_decorator(func):
         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
         decoding_throughput = sum([r[3] for r in results]) / len(results)
-        mfu = sum([r[4] for r in results]) / len(results)
-        mbu = sum([r[5] for r in results]) / len(results)
         # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
         result_dict = func(self, doc, processed_results, *args, **kwargs)
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
         result_dict["decoding_throughput"] = decoding_throughput
-        result_dict["mfu"] = mfu
-        result_dict["mbu"] = mbu
         return result_dict
     return wrapper
 ConfigurableTask.process_results = process_results_decorator(orig_process_results)
@@ -37,8 +33,6 @@ def aggregation_decorator(func):
         aggregation_list["end_to_end_time"] = mean
         aggregation_list["prefilling_time"] = mean
         aggregation_list["decoding_throughput"] = mean
-        aggregation_list["mfu"] = mean
-        aggregation_list["mbu"] = mean
         return aggregation_list
     return wrapper
 ConfigurableTask.aggregation = aggregation_decorator(orig_aggregation)
@@ -49,8 +43,6 @@ def higher_is_better_decorator(func):
         higher_is_better_dict["end_to_end_time"] = False
         higher_is_better_dict["prefilling_time"] = False
         higher_is_better_dict["decoding_throughput"] = True
-        higher_is_better_dict["mfu"] = True
-        higher_is_better_dict["mbu"] = True
         return higher_is_better_dict
     return wrapper
 ConfigurableTask.higher_is_better = higher_is_better_decorator(orig_higher_is_better)

         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
         decoding_throughput = sum([r[3] for r in results]) / len(results)
         # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
         result_dict = func(self, doc, processed_results, *args, **kwargs)
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
         result_dict["decoding_throughput"] = decoding_throughput
         return result_dict
     return wrapper
 ConfigurableTask.process_results = process_results_decorator(orig_process_results)
         aggregation_list["end_to_end_time"] = mean
         aggregation_list["prefilling_time"] = mean
         aggregation_list["decoding_throughput"] = mean
         return aggregation_list
     return wrapper
 ConfigurableTask.aggregation = aggregation_decorator(orig_aggregation)
         higher_is_better_dict["end_to_end_time"] = False
         higher_is_better_dict["prefilling_time"] = False
         higher_is_better_dict["decoding_throughput"] = True
         return higher_is_better_dict
     return wrapper
 ConfigurableTask.higher_is_better = higher_is_better_decorator(orig_higher_is_better)

src/backend/tasks/arena_hard/__init__.py DELETED Viewed

File without changes

src/backend/tasks/arena_hard/arena_hard.yaml DELETED Viewed

	@@ -1,2 +0,0 @@
1	- task: arena_hard
2	- class: !function task.ArenaHard

src/backend/tasks/arena_hard/arena_judgment.py DELETED Viewed

@@ -1,256 +0,0 @@
-'''
-This file is part of Open-MoE-LLM-Leaderboard and is modified based on work
-under the Apache 2.0 License from the arena-hard project.
-(https://github.com/lm-sys/arena-hard)
-Original Copyright (c) 2024 Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica
-See the NOTICE file distributed with this work for additional
-information regarding copyright ownership.
-'''
-import pandas as pd
-from tqdm import tqdm
-import numpy as np
-from sklearn.linear_model import LogisticRegression
-import math
-from collections import defaultdict
-from tqdm import tqdm
-from src.backend.tasks.arena_hard.arena_utils import (
-    chat_completion_openai,
-    load_questions,
-    load_model_answers,
-    get_endpoint,
-    make_config,
-)
-def get_score(judgment, pattern, pairwise=True):
-    matches = pattern.findall(judgment)
-    matches = [m for m in matches if m != ""]
-    if len(set(matches)) == 0:
-        return None, True
-    elif len(set(matches)) == 1:
-        if pairwise:
-            return matches[0].strip("\n"), False
-        return int(matches[0])
-    else:
-        return None, False
-# get answer from model
-def get_answer(model, conv, temperature, max_tokens, endpoint_dict=None):
-    api_dict = get_endpoint(endpoint_dict["endpoints"])
-    # if endpoint_dict["api_type"] == "anthropic":
-    #     output = chat_completion_anthropic(model, conv, temperature, max_tokens)
-    # elif endpoint_dict["api_type"] == "azure":
-    #     output = chat_completion_openai_azure(model, conv, temperature, max_tokens, api_dict)
-    output = chat_completion_openai(model, conv, temperature, max_tokens, api_dict)
-    return output
-def judgment(**args):
-    question = args["question"]
-    answer = args["answer"]
-    reference = args["reference"]
-    baseline = args["baseline_answer"]
-    configs = args["configs"]
-    # output_file = args["output_file"]
-    model = configs["judge_model"]
-    num_games = 2 if configs["pairwise"] else 1
-    # output = {
-    #     "question_id":question["question_id"],
-    #     "judge": model,
-    #     "model": "custom_model",
-    #     "games":[]
-    #     }
-    output = [question["question_id"]]
-    for game in range(num_games):
-        conv = [{"role": "system", "content": configs["system_prompt"]}]
-        for template in configs["prompt_template"]:
-            prompt_args = {}
-            prompt_args[f"question_{1}"] = question["content"]
-            base = 1
-            if baseline:
-                if game % 2 == 1: # swap position
-                    temp = baseline
-                    baseline = answer
-                    answer = temp
-                if game == 0:
-                    for i, turn in enumerate(baseline["choices"][0]["turns"]):
-                        prompt_args[f"answer_{i+1}"] = turn["content"]
-                        base += 1
-                if game == 1:
-                    prompt_args[f"answer_{1}"] = baseline
-                    base += 1
-            if answer:
-                prompt_args[f"answer_{base}"] = answer
-            if reference:
-                for j, ref_answer in enumerate(reference):
-                    for i, turn in enumerate(ref_answer["choices"][0]["turns"]):
-                        prompt_args[f"ref_answer_{i+j+1}"] = turn["content"]
-            user_prompt = template.format(**prompt_args)
-            conv.append({"role": "user", "content": user_prompt})
-        judgment = ""
-        for _ in range(2):
-            new_judgment = get_answer(
-                model,
-                conv,
-                configs["temperature"],
-                configs["max_tokens"],
-                args["endpoint_dict"],
-            )
-            judgment += ("\n" + new_judgment)
-            score, try_again = get_score(judgment, args["regex_pattern"])
-            conv.append({"role": "assistant", "content": new_judgment})
-            if not try_again:
-                break
-            conv.append({"role": "user", "content": "continue your judgment and finish by outputting a final verdict label"})
-        print("Finish judgment!!!")
-        # result = {
-        #     "user_prompt": conv[1]["content"],
-        #     "judgment": judgment,
-        #     "score":score
-        # }
-        output.append(score)
-    return output
-def get_battles_from_scores(score_list, first_game_only=False, WEIGHT=3):
-    arena_hard_battles = pd.DataFrame()
-    print("Turning score list into battles...")
-    for scores in tqdm(score_list):
-        question_id, score1, score2 = scores
-        # Process game 1
-        output = {"question_id": question_id,
-                  "model_a": "gpt-4-0314",
-                  "model_b": f"custom_model"}  # Unique identifier for model
-        weight = 1
-        if score1 == "A=B":
-            output["winner"] = "tie"
-        elif score1 == "A>B":
-            output["winner"] = "model_a"
-        elif score1 == "A>>B":
-            output["winner"] = "model_a"
-            weight = WEIGHT
-        elif score1 == "B>A":
-            output["winner"] = "model_b"
-        elif score1 == "B>>A":
-            output["winner"] = "model_b"
-            weight = WEIGHT
-        else:
-            weight = 0
-        if weight:
-            arena_hard_battles = pd.concat([arena_hard_battles, pd.DataFrame([output] * weight)])
-        if not first_game_only:
-            # Process game 2
-            output = {"question_id": question_id,
-                      "model_a": "gpt-4-0314",
-                      "model_b": f"custom_model"}  # Unique identifier for model
-            weight = 1
-            if score2 == "A=B":
-                output["winner"] = "tie"
-            elif score2 == "A>B":
-                output["winner"] = "model_b"
-            elif score2 == "A>>B":
-                output["winner"] = "model_b"
-                weight = WEIGHT
-            elif score2 == "B>A":
-                output["winner"] = "model_a"
-            elif score2 == "B>>A":
-                output["winner"] = "model_a"
-                weight = WEIGHT
-            else:
-                weight = 0
-            if weight:
-                arena_hard_battles = pd.concat([arena_hard_battles, pd.DataFrame([output] * weight)])
-    arena_hard_battles.to_json("./arena_hard_battles.jsonl", lines=True, orient="records")
-    return arena_hard_battles
-def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
-    models = pd.concat([df["model_a"], df["model_b"]]).unique()
-    models = pd.Series(np.arange(len(models)), index=models)
-    LOW_RATING = 100
-    # duplicate battles
-    df = pd.concat([df, df], ignore_index=True)
-    p = len(models.index)
-    n = df.shape[0]
-    X = np.zeros([n, p])
-    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
-    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)
-    # one A win => two A win
-    Y = np.zeros(n)
-    Y[df["winner"] == "model_a"] = 1.0
-    # one tie => one A win + one B win
-    # find tie + tie (both bad) index
-    tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
-    tie_idx[len(tie_idx)//2:] = False
-    Y[tie_idx] = 1.0
-    if len(np.unique(Y)) == 1:
-        # If there's only one class in the data, assign default ratings
-        elo_scores = np.full(p, LOW_RATING)
-        elo_scores[models["gpt-4-0314"]] = INIT_RATING
-    else:
-        lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
-        lr.fit(X,Y)
-        elo_scores = SCALE * lr.coef_[0] + INIT_RATING
-    # set anchor as gpt-4-0314 = 1000
-    if "gpt-4-0314" in models.index:
-        elo_scores += 1000 - elo_scores[models["gpt-4-0314"]]
-    return pd.Series(elo_scores, index = models.index).sort_values(ascending=False)
-def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
-    names = sorted(list(elo_ratings.keys()))
-    wins = defaultdict(lambda: defaultdict(lambda: 0))
-    for a in names:
-        for b in names:
-            ea = 1 / (1 + BASE ** ((elo_ratings[b] - elo_ratings[a]) / SCALE))
-            wins[a][b] = ea
-            wins[b][a] = 1 - ea
-    data = {
-        a: [wins[a][b] if a != b else np.NAN for b in names]
-        for a in names
-    }
-    df = pd.DataFrame(data, index=names)
-    df.index.name = "model_a"
-    df.columns.name = "model_b"
-    return df.T
-def get_win_rate_column(df, column, baseline="gpt-4-0314"):
-    to_dict = df[["model", column]].set_index("model").to_dict()[column]
-    win_rate_table = predict_win_rate(to_dict)
-    return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x * 100, 2))

src/backend/tasks/arena_hard/arena_utils.py DELETED Viewed

@@ -1,349 +0,0 @@
-'''
-This file is part of Open-MoE-LLM-Leaderboard and is modified based on work
-under the Apache 2.0 License from the arena-hard project.
-(https://github.com/lm-sys/arena-hard)
-Original Copyright (c) 2024 Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica
-See the NOTICE file distributed with this work for additional
-information regarding copyright ownership.
-'''
-import os
-import json
-import time
-import yaml
-import random
-from typing import Optional
-from glob import glob
-# API setting constants
-# Upper bound on attempts made by the retry loop in chat_completion_openai.
-API_MAX_RETRY = 16
-# Seconds slept after a rate-limit error before the next attempt.
-API_RETRY_SLEEP = 10
-# Value chat_completion_openai returns when no attempt produced a completion.
-API_ERROR_OUTPUT = "$ERROR$"
-# OpenAI model names.
-# NOTE(review): not referenced in the visible part of this file - confirm where it is consumed.
-OPENAI_MODEL_LIST = (
-    "gpt-3.5-turbo",
-    "gpt-3.5-turbo-0301",
-    "gpt-3.5-turbo-0613",
-    "gpt-3.5-turbo-0613-verbose",
-    "gpt-3.5-turbo-1106",
-    "gpt-3.5-turbo-0125",
-    "gpt-4",
-    "gpt-4-0314",
-    "gpt-4-0613",
-    "gpt-4-turbo",
-    "gpt-4-1106-preview",
-    "gpt-4-0125-preview",
-)
-# Temperature value keyed by category name.
-# NOTE(review): not referenced in the visible part of this file - confirm where it is consumed.
-temperature_config = {
-    "writing": 0.7,
-    "roleplay": 0.7,
-    "extraction": 0.0,
-    "math": 0.0,
-    "coding": 0.0,
-    "reasoning": 0.0,
-    "stem": 0.1,
-    "humanities": 0.1,
-}
-def load_questions(question_file: str):
-    """Load questions from a JSON Lines file.
-    Args:
-        question_file: Path to a file holding one JSON document per line.
-    Returns:
-        The decoded documents as a list, in file order. Blank lines are skipped.
-    Raises:
-        json.JSONDecodeError: If a non-blank line is not valid JSON.
-    """
-    questions = []
-    with open(question_file, "r", encoding="utf-8") as ques_file:
-        for line in ques_file:
-            # Lines read from a file keep their newline, so a plain truthiness
-            # test never filters anything; strip before testing.
-            if line.strip():
-                questions.append(json.loads(line))
-    return questions
-def load_model_answers(answer_dir: str):
-    """Read every ``*.jsonl`` file in ``answer_dir`` into a nested dict.
-    The result maps each model name (the file name without its ``.jsonl``
-    suffix) to a dict from ``question_id`` to the decoded answer record.
-    Files are visited in sorted path order; when a question id repeats
-    within one file the last record wins.
-    """
-    model_answers = {}
-    for path in sorted(glob(os.path.join(answer_dir, "*.jsonl"))):
-        with open(path) as fin:
-            records = [json.loads(raw) for raw in fin]
-        # Strip the six characters of ".jsonl" to get the model name.
-        model_name = os.path.basename(path)[:-6]
-        model_answers[model_name] = {rec["question_id"]: rec for rec in records}
-    return model_answers
-def get_endpoint(endpoint_list):
-    """Return one randomly chosen entry of ``endpoint_list``, or None if it is None."""
-    # randomly pick one
-    return None if endpoint_list is None else random.choices(endpoint_list)[0]
-# load config args from config yaml files
-def make_config(config_file: str) -> dict:
-    """Parse the YAML file at ``config_file`` with the safe loader and return the result."""
-    with open(config_file, "r") as stream:
-        return yaml.safe_load(stream)
-def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None):
-    """Request a chat completion through the ``openai`` client, with retries.
-    Args:
-        model: Model name passed to the API.
-        messages: Chat messages passed to the API.
-        temperature: Sampling temperature passed to the API.
-        max_tokens: Token limit passed to the API.
-        api_dict: Optional dict providing "api_base" and "api_key"; when it
-            is falsy a default ``openai.OpenAI()`` client is created.
-    Returns:
-        The message content of the first choice, or API_ERROR_OUTPUT when
-        none of the up to API_MAX_RETRY attempts produced a completion.
-    """
-    import openai
-    if api_dict:
-        client = openai.OpenAI(
-            base_url=api_dict["api_base"],
-            api_key=api_dict["api_key"],
-        )
-    else:
-        client = openai.OpenAI()
-    output = API_ERROR_OUTPUT
-    for _ in range(API_MAX_RETRY):
-        try:
-            # print(messages)
-            completion = client.chat.completions.create(
-                model=model,
-                messages=messages,
-                temperature=temperature,
-                max_tokens=max_tokens
-                )
-            output = completion.choices[0].message.content
-            break
-        except openai.RateLimitError as e:
-            # Back off, then try again.
-            print(type(e), e)
-            time.sleep(API_RETRY_SLEEP)
-        except openai.BadRequestError as e:
-            # Logged only; the loop moves on to the next attempt.
-            print(messages)
-            print(type(e), e)
-        except KeyError as e:
-            # The exception must be bound here: without "as e" the print
-            # below raised NameError instead of reporting the KeyError.
-            print(type(e), e)
-            break
-    return output
-# def chat_completion_openai_azure(model, messages, temperature, max_tokens, api_dict=None):
-#     import openai
-#     from openai import AzureOpenAI
-#     api_base = api_dict["api_base"]
-#     client = AzureOpenAI(
-#         azure_endpoint = api_base,
-#         api_key= api_dict["api_key"],
-#         api_version=api_dict["api_version"],
-#         timeout=240,
-#         max_retries=2
-#     )
-#     output = API_ERROR_OUTPUT
-#     for _ in range(API_MAX_RETRY):
-#         try:
-#             response = client.chat.completions.create(
-#                 model=model,
-#                 messages=messages,
-#                 n=1,
-#                 temperature=temperature,
-#                 max_tokens=max_tokens,
-#                 seed=42,
-#             )
-#             output = response.choices[0].message.content
-#             break
-#         except openai.RateLimitError as e:
-#             print(type(e), e)
-#             time.sleep(API_RETRY_SLEEP)
-#         except openai.BadRequestError as e:
-#             print(type(e), e)
-#             break
-#         except KeyError:
-#             print(type(e), e)
-#             break
-#     return output
-# def chat_completion_anthropic(model, messages, temperature, max_tokens, api_dict=None):
-#     import anthropic
-#     if api_dict:
-#         api_key = api_dict["api_key"]
-#     else:
-#         api_key = os.environ["ANTHROPIC_API_KEY"]
-#     sys_msg = ""
-#     if messages[0]["role"] == "system":
-#         sys_msg = messages[0]["content"]
-#         messages = messages[1:]
-#     output = API_ERROR_OUTPUT
-#     for _ in range(API_MAX_RETRY):
-#         try:
-#             # print(sys_msg)
-#             c = anthropic.Anthropic(api_key=api_key)
-#             response = c.messages.create(
-#                 model=model,
-#                 messages=messages,
-#                 stop_sequences=[anthropic.HUMAN_PROMPT],
-#                 max_tokens=max_tokens,
-#                 temperature=temperature,
-#                 system=sys_msg
-#             )
-#             output = response.content[0].text
-#             break
-#         except anthropic.APIError as e:
-#             print(type(e), e)
-#             time.sleep(API_RETRY_SLEEP)
-#     return output
-# def chat_completion_mistral(model, messages, temperature, max_tokens):
-#     from mistralai.client import MistralClient
-#     from mistralai.models.chat_completion import ChatMessage
-#     from mistralai.exceptions import MistralException
-#     api_key = os.environ["MISTRAL_API_KEY"]
-#     client = MistralClient(api_key=api_key)
-#     prompts = [ChatMessage(role=message["role"], content=message["content"]) for message in messages]
-#     output = API_ERROR_OUTPUT
-#     for _ in range(API_MAX_RETRY):
-#         try:
-#             chat_response = client.chat(
-#                 model=model,
-#                 messages=prompts,
-#                 temperature=temperature,
-#                 max_tokens=max_tokens,
-#             )
-#             output = chat_response.choices[0].message.content
-#             break
-#         except MistralException as e:
-#             print(type(e), e)
-#             break
-#     return output
-# def chat_completion_gemini(model, messages, temperature, max_tokens):
-#     import google.generativeai as genai
-#     genai.configure(api_key=os.environ["GEMINI_API_KEY"])
-#     safety_settings = [
-#         {
-#             "category": "HARM_CATEGORY_HARASSMENT",
-#             "threshold": "BLOCK_NONE"
-#         },
-#         {
-#             "category": "HARM_CATEGORY_HATE_SPEECH",
-#             "threshold": "BLOCK_NONE"
-#         },
-#         {
-#             "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
-#             "threshold": "BLOCK_NONE"
-#         },
-#         {
-#             "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
-#             "threshold": "BLOCK_NONE"
-#         },
-#     ]
-#     # Set up the model
-#     generation_config = {
-#         "temperature": temperature,
-#         "top_p": 1,
-#         "top_k": 1,
-#         "max_output_tokens": max_tokens,
-#     }
-#     output = API_ERROR_OUTPUT
-#     for _ in range(API_MAX_RETRY):
-#         try:
-#             gemini = genai.GenerativeModel(
-#                 model_name=model,
-#                 generation_config=generation_config,
-#                 safety_settings=safety_settings)
-#             convo = gemini.start_chat(history=[])
-#             convo.send_message(messages)
-#             output = convo.last.text
-#             break
-#         except genai.types.generation_types.StopCandidateException as e:
-#             print(type(e), e)
-#             break
-#         except Exception as e:
-#             print(type(e), e)
-#             time.sleep(API_RETRY_SLEEP)
-#     return output
-# def chat_completion_cohere(model, messages, temperature, max_tokens):
-#     import cohere
-#     co = cohere.Client(os.environ["COHERE_API_KEY"])
-#     assert len(messages) > 0
-#     template_map = {"system":"SYSTEM",
-#                     "assistant":"CHATBOT",
-#                     "user":"USER"}
-#     assert messages[-1]["role"] == "user"
-#     prompt = messages[-1]["content"]
-#     if len(messages) > 1:
-#         history = []
-#         for message in messages[:-1]:
-#             history.append({"role":template_map[message["role"]], "message":message["content"]})
-#     else:
-#         history = None
-#     output = API_ERROR_OUTPUT
-#     for _ in range(API_MAX_RETRY):
-#         try:
-#             response = co.chat(
-#                 message=prompt,
-#                 model=model,
-#                 temperature=temperature,
-#                 max_tokens=max_tokens,
-#                 chat_history=history,
-#             )
-#             output = response.text
-#             break
-#         except cohere.core.api_error.ApiError as e:
-#             print(type(e), e)
-#             raise
-#         except Exception as e:
-#             print(type(e), e)
-#             break
-#     return output
-def reorg_answer_file(answer_file):
-    """Sort the records of a JSON Lines file by question id and de-duplicate them.
-    The file is rewritten in place. When a question id occurs more than once
-    the last record is kept. Blank lines are dropped and every record is
-    written with exactly one trailing newline.
-    Args:
-        answer_file: Path of the JSON Lines file; each record needs a
-            "question_id" key.
-    """
-    answers = {}
-    with open(answer_file, "r") as fin:
-        for raw in fin:
-            if not raw.strip():
-                continue
-            qid = json.loads(raw)["question_id"]
-            # A final line without a newline would otherwise be glued to
-            # whichever record follows it after sorting.
-            answers[qid] = raw if raw.endswith("\n") else raw + "\n"
-    with open(answer_file, "w") as fout:
-        fout.writelines(answers[qid] for qid in sorted(answers))

src/backend/tasks/arena_hard/configs/api_config.yaml DELETED Viewed

@@ -1,17 +0,0 @@
-# gpt-3.5-turbo:
-#     model_name: gpt-3.5-turbo
-#     endpoints: null
-#     api_type: openai
-#     parallel: 8
-gpt-4-1106-preview:
-    model_name: gpt-4-1106-preview
-    endpoints: null
-    api_type: openai
-    parallel: 8
-# llama3-7b:
-#     model_name: llama3-7b
-#     endpoints: null
-#     api_type: openai
-#     parallel: 8

src/backend/tasks/arena_hard/configs/judge_config.yaml DELETED Viewed

@@ -1,26 +0,0 @@
-name: judgment config file for Arena Hard
-bench_name: arena-hard-v0.1
-# Arena Hard default
-judge_model: gpt-4-1106-preview
-# judge_model: gpt-3.5-turbo
-reference: False # Optional
-ref_model: null
-baseline: True
-baseline_model: gpt-4-0314
-pairwise: True
-temperature: 0
-max_tokens: 4096
-regex_pattern: \[\[([AB<>=]+)\]\]
-system_prompt: "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."
-prompt_template: ["<|User Prompt|>\n{question_1}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>"]
-# Add your model below for evaluation
-# model_list:
-#   - gpt-3.5-turbo-0125

src/backend/tasks/arena_hard/model_answer/gpt-4-0314.jsonl DELETED Viewed

The diff for this file is too large to render. See raw diff

src/backend/tasks/arena_hard/question.jsonl DELETED Viewed

The diff for this file is too large to render. See raw diff

src/backend/tasks/arena_hard/task.py DELETED Viewed

@@ -1,220 +0,0 @@
-import os
-from typing import Union, List
-from lm_eval.api.task import ConfigurableTask
-from lm_eval.api.instance import Instance
-# from lm_eval.api.registry import register_task
-from lm_eval.api.metrics import mean
-from src.backend.envs import DEVICE
-import pandas as pd
-from src.backend.tasks.measurement_task_utils import measure_system_metrics
-import json
-from typing import (
-    Any,
-    Dict,
-    List,
-    Optional,
-    Union,
-)
-from datasets import Dataset
-import re
-from src.backend.tasks.arena_hard.arena_utils import (
-    load_questions,
-    load_questions,
-    load_model_answers,
-    make_config,
-)
-from src.backend.tasks.arena_hard.arena_judgment import (
-    judgment,
-    get_battles_from_scores,
-    compute_mle_elo,
-    predict_win_rate,
-    get_win_rate_column
-)
-def load_questions(question_file: str):
-    """Load questions from a JSON Lines file.
-    Args:
-        question_file: Path to a file holding one JSON document per line.
-    Returns:
-        The decoded documents as a list, in file order. Blank lines are skipped.
-    Raises:
-        json.JSONDecodeError: If a non-blank line is not valid JSON.
-    """
-    questions = []
-    with open(question_file, "r", encoding="utf-8") as ques_file:
-        for line in ques_file:
-            # Lines read from a file keep their newline, so a plain truthiness
-            # test never filters anything; strip before testing.
-            if line.strip():
-                questions.append(json.loads(line))
-    return questions
-def download_wrapper(func):
-    """Return a stand-in ``download`` method that only prints a notice.
-    ``func`` is accepted but never called by the returned function.
-    """
-    def download(self, *args, **kwargs):
-        # Accepts any arguments and does nothing apart from printing.
-        print("Using Arena Hard, No need to download")
-        return None
-    return download
-# Keep a reference to the stock download method, then replace it on the
-# ConfigurableTask class itself with the printing stub from download_wrapper.
-# The assignment targets the base class, not only ArenaHard.
-original_download = ConfigurableTask.download
-ConfigurableTask.download = download_wrapper(original_download)
-# @register_task("selfcheckgpt")
-@measure_system_metrics
-class ArenaHard(ConfigurableTask):
-    """Task that generates one answer per Arena Hard question and has it judged.
-    The questions, the judge configuration and the stored model answers are
-    read from files located next to this module when the class body runs.
-    """
-    VERSION = 0.0
-    OUTPUT_TYPE = "generate_until"
-    data_path = os.path.join(os.path.dirname(__file__), 'question.jsonl')
-    judge_config_path = os.path.join(os.path.dirname(__file__), "configs/judge_config.yaml")
-    configs = make_config(judge_config_path)
-    model_ans_dir = os.path.join(os.path.dirname(__file__), "model_answer")
-    model_answers = load_model_answers(model_ans_dir)
-    data = load_questions(data_path)
-    def __init__(self):
-        """Initialise the task with its version metadata and generation settings."""
-        super().__init__(config={"metadata": {"version": self.VERSION}})
-        # these end tokens are hard coded because of the current limitation of the llm-eval.
-        # self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
-        self.generation_kwargs = {"until": ["</s>", "<|im_end|>"], "max_gen_toks": 4096}
-        # self.generation_kwargs_sampling_number = 5  # the number of sampling for self-consistence
-        # self.generation_kwargs_sampling = {
-        #     "temperature": 0.99,
-        #     "do_sample": True,
-        #     "until": ["<im_end>", "<im_end>"],
-        #     "max_length": 1024,
-        # }
-    def transform_data(self, data):
-        """Flatten raw question records into question_id / content / model_answer dicts.
-        ``model_answer`` is the stored answer of the configured baseline model
-        for that question, or None when the "baseline" config flag is falsy.
-        """
-        transformed_data = []
-        for item in data:
-            if self.configs["baseline"]:
-                baseline_answer = self.model_answers[self.configs["baseline_model"]][item["question_id"]]
-            else:
-                baseline_answer = None
-            transformed_item = {
-                "question_id": item["question_id"],
-                "content": item["turns"][0]["content"],  # Assuming you want the first turn's content
-                "model_answer": baseline_answer
-            }
-            transformed_data.append(transformed_item)
-        return transformed_data
-    def has_training_docs(self):
-        """This task has no training split."""
-        return False
-    def has_validation_docs(self):
-        """The questions are served as the validation split."""
-        return True
-    def has_test_docs(self):
-        """This task has no test split."""
-        return False
-    def validation_docs(self):
-        """Return the questions as a Dataset with question_id, content and model_answer columns.
-        The Dataset is also stored on ``self.dataset``.
-        """
-        self.dataset = self.transform_data(self.data)
-        self.dataset = Dataset.from_dict({"question_id": [item["question_id"] for item in self.dataset],
-                             "content": [item["content"] for item in self.dataset],
-                             "model_answer": [item["model_answer"] for item in self.dataset]})
-        return self.dataset
-    def doc_to_text(self, doc):
-        """Return the question content followed by a newline."""
-        sentence = doc["content"]
-        doc_text = f"{sentence}\n"
-        return doc_text
-    def doc_to_target(self, doc):
-        """Return the document's question id."""
-        q_id = doc["question_id"]
-        return q_id
-    def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
-        """Return a one-element list holding a generate_until request for ``ctx``."""
-        arguments = (ctx, self.generation_kwargs)
-        request_list = [
-            Instance(request_type="generate_until", doc=doc, arguments=arguments, idx=0, **kwargs),
-        ]
-        # sampling_arguments = (ctx, self.generation_kwargs_sampling)
-        # request_list.extend(
-        #     [
-        #         Instance(request_type="generate_until", doc=doc, arguments=sampling_arguments, idx=idx, **kwargs)
-        #         for idx in range(1, self.generation_kwargs_sampling_number + 1)
-        #     ]
-        # )
-        return request_list
-    def process_results(self, doc, results):
-        """Send the generated answer for ``doc`` to the judge and return its score.
-        ``results[0]`` is used as the answer. The judge endpoint settings are
-        read from configs/api_config.yaml under the configured judge model.
-        Returns:
-            ``{"score": scores}`` where ``scores`` is whatever ``judgment`` returns.
-        """
-        response_temperature_0 = results[0]
-        # other_responses = results[1:]
-        api_config_path = os.path.join(os.path.dirname(__file__), "configs/api_config.yaml")
-        endpoint_list = make_config(api_config_path)
-        # Always bind ``pattern``: it is handed to judgment() below even when
-        # no regex is configured, which used to raise UnboundLocalError.
-        pattern = None
-        if self.configs["regex_pattern"]:
-            pattern = re.compile(self.configs["regex_pattern"])
-        ref_answer_dir = os.path.join(os.path.dirname(__file__), "reference_answer")
-        ref_answers = None
-        if self.configs["reference"]:
-            ref_answers = load_model_answers(ref_answer_dir)
-            ref_answers = [ref_answers[model] for model in self.configs["ref_model"]]
-        # output_files = {}
-        # models = ["custom_model"]
-        # output_dir = f"{os.path.join(os.path.dirname(__file__))}/model_judgments/{self.configs['judge_model']}"
-        # for model in models:
-        #     output_files[model] = os.path.join(
-        #         output_dir,
-        #         f"{model}.jsonl",
-        #     )
-        # for output_file in output_files.values():
-        #     os.makedirs(os.path.dirname(output_file), exist_ok=True)
-        endpoint_info = endpoint_list[self.configs["judge_model"]]
-        question = doc
-        kwargs = {}
-        kwargs["question"] = question
-        kwargs["answer"] = response_temperature_0
-        if ref_answers:
-            kwargs["reference"] = [ref_answer[doc["question_id"]] for ref_answer in ref_answers]
-            assert len(kwargs["reference"]) == len(self.configs["ref_model"])
-        else:
-            kwargs["reference"] = None
-        if self.configs["baseline"]:
-            kwargs["baseline_answer"] = doc["model_answer"]
-        else:
-            kwargs["baseline_answer"] = None
-        kwargs["configs"] = self.configs
-        kwargs["endpoint_dict"] = endpoint_info
-        # kwargs["output_file"] = output_files["custom_model"]
-        kwargs["regex_pattern"] = pattern
-        scores = judgment(**kwargs)
-        return {"score": scores}
-    def aggregation(self):
-        """
-        :returns: {str: [float] -> float}
-            A dictionary where keys are the names of submetrics and values are
-            functions that aggregate a list of metrics
-        """
-        ##TODO implement the aggregation function to calculate elo for score
-        def get_win_rate(score_list):
-            # Turn the per-document scores into battles, fit Elo ratings, then
-            # convert the ratings into win rates against gpt-4-0314.
-            battles = get_battles_from_scores(score_list)
-            bootstrap_online_elo = compute_mle_elo(battles)
-            stats = pd.DataFrame()
-            stats["results"] = None
-            stats["results"] = stats['results'].astype('object')
-            for i, model in enumerate(bootstrap_online_elo.index):
-                stats.at[i, "model"] = model
-                stats.at[i, "score"] = bootstrap_online_elo[model]
-            stats.sort_values(by="model", inplace=True)
-            stats["score"] = get_win_rate_column(stats, "score", "gpt-4-0314").tolist()
-            # NOTE(review): this selects the row labelled 1, i.e. the second
-            # entry of bootstrap_online_elo before the sort by model name.
-            # Confirm that row is always the evaluated model.
-            return stats["score"][1]
-        return {k: get_win_rate for k in ["score"]}
-    def higher_is_better(self):
-        """
-        :returns: {str: bool}
-            A dictionary where keys are the names of submetrics and values are
-            whether a higher value of the submetric is better
-        """
-        return {k: True for k in ["score"]}

src/backend/tasks/gsm8k/gsm8k-custom.yaml DELETED Viewed

@@ -1,47 +0,0 @@
-group:
-  - math_word_problems
-task: gsm8k_custom
-dataset_path: gsm8k
-dataset_name: main
-output_type: generate_until
-training_split: train
-fewshot_split: train
-test_split: test
-doc_to_text: "Question: {{question}}\nAnswer:"
-doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}"
-metric_list:
-  - metric: exact_match
-    aggregation: mean
-    higher_is_better: true
-    ignore_case: true
-    ignore_punctuation: false
-    regexes_to_ignore:
-      - ","
-      - "\\$"
-      - "(?s).*#### "
-      - "\\.$"
-generation_kwargs:
-  until:
-    - "Question:"
-    - "Question"
-    - "</s>"
-    - "<|im_end|>"
-  do_sample: false
-  temperature: 0.0
-  # is_gsm8k: true
-repeats: 1
-num_fewshot: 5
-filter_list:
-  - name: "strict-match"
-    filter:
-      - function: "regex"
-        regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
-      - function: "take_first"
-  - name: "flexible-extract"
-    filter:
-      - function: "regex"
-        group_select: -1
-        regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"
-      - function: "take_first"
-metadata:
-  version: 3.0

src/backend/tasks/measurement_task_utils.py CHANGED Viewed

@@ -12,9 +12,6 @@ def process_results_decorator(func):
         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
         decoding_throughput = sum([r[3] for r in results]) / len(results)
-        mfu = sum([r[4] for r in results]) / len(results)
-        mbu = sum([r[5] for r in results]) / len(results)
         # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
         # Now call the original process_results with the processed results
@@ -22,8 +19,6 @@ def process_results_decorator(func):
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
         result_dict["decoding_throughput"] = decoding_throughput
-        result_dict["mfu"] = mfu
-        result_dict["mbu"] = mbu
         return result_dict
     return wrapper
@@ -35,8 +30,6 @@ def aggregation_decorator(func):
         aggregation_list["end_to_end_time"] = mean
         aggregation_list["prefilling_time"] = mean
         aggregation_list["decoding_throughput"] = mean
-        aggregation_list["mfu"] = mean
-        aggregation_list["mbu"] = mean
         return aggregation_list
     return wrapper
@@ -48,8 +41,6 @@ def higher_is_better_decorator(func):
         higher_is_better_dict["end_to_end_time"] = False
         higher_is_better_dict["prefilling_time"] = False
         higher_is_better_dict["decoding_throughput"] = True
-        higher_is_better_dict["mfu"] = True
-        higher_is_better_dict["mbu"] = True
         return higher_is_better_dict
     return wrapper

         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
         decoding_throughput = sum([r[3] for r in results]) / len(results)
         # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
         # Now call the original process_results with the processed results
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
         result_dict["decoding_throughput"] = decoding_throughput
         return result_dict
     return wrapper
         aggregation_list["end_to_end_time"] = mean
         aggregation_list["prefilling_time"] = mean
         aggregation_list["decoding_throughput"] = mean
         return aggregation_list
     return wrapper
         higher_is_better_dict["end_to_end_time"] = False
         higher_is_better_dict["prefilling_time"] = False
         higher_is_better_dict["decoding_throughput"] = True
         return higher_is_better_dict
     return wrapper

src/backend/tasks/selfcheckgpt/task.py CHANGED Viewed

@@ -27,12 +27,12 @@ class SelfCheckGPT(ConfigurableTask):
         super().__init__(config={"metadata": {"version": self.VERSION}})
         # these end tokens are hard coded because of the current limitaion of the llm-eval.
         # self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
-        self.generation_kwargs = {"until": ["<|im_end|>"], "max_length": 1024}
         self.generation_kwargs_sampling_number = 5  # the number of sampling for self-consistence
         self.generation_kwargs_sampling = {
             "temperature": 0.99,
             "do_sample": True,
-            "until": ["<|im_end|>", "</s>"],
             "max_length": 1024,
         }

         super().__init__(config={"metadata": {"version": self.VERSION}})
         # these end tokens are hard coded because of the current limitaion of the llm-eval.
         # self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
+        self.generation_kwargs = {"until": ["<im_end>"], "max_length": 1024}
         self.generation_kwargs_sampling_number = 5  # the number of sampling for self-consistence
         self.generation_kwargs_sampling = {
             "temperature": 0.99,
             "do_sample": True,
+            "until": ["<im_end>", "</s>"],
             "max_length": 1024,
         }

src/display/about.py CHANGED Viewed

@@ -10,17 +10,14 @@ The OPEN-MOE-LLM-LEADERBOARD includes generation and multiple choice tasks to me
 Tasks:
 - **Multiple Choice Performance** -- [MMLU](https://arxiv.org/abs/2009.03300)
-- **Mathematics Problem-Solving Performance** -- [GSM8K](https://arxiv.org/abs/2110.14168)
-- **AI Judgment Scores for Responses to Complex User Queries** -- [Arena_Hard](https://lmsys.org/blog/2024-04-19-arena-hard/)
 Columns and Metrics:
 - Method: The MOE LLMs inference framework.
 - E2E(s): Average End to End generation time in seconds.
 - PRE(s): Prefilling Time of input prompt in seconds.
 - T/s: Tokens throughout per second.
-- S-MBU(%): Sparse Model Bandwidth Utilization.
-- S-MFU(%): Sparse Model FLOPs Utilization.
 - Precision: The precison of used model.
 """

 Tasks:
+- **Generation Self-consistancy** -- [SelfCheckGPT](https://github.com/potsawee/selfcheckgpt)
 - **Multiple Choice Performance** -- [MMLU](https://arxiv.org/abs/2009.03300)
 Columns and Metrics:
 - Method: The MOE LLMs inference framework.
 - E2E(s): Average End to End generation time in seconds.
 - PRE(s): Prefilling Time of input prompt in seconds.
 - T/s: Tokens throughout per second.
 - Precision: The precison of used model.
 """

src/display/utils.py CHANGED Viewed

@@ -18,16 +18,12 @@ GPU_Power = 'Power(W)'
 GPU_Mem = 'Mem(G)'
 GPU_Name = "GPU"
 GPU_Util = 'Util(%)'
-MFU = 'S-MFU(%)'
-MBU = 'S-MBU(%)'
 BATCH_SIZE = 'bs'
 PRECISION = "Precision"
 system_metrics_to_name_map = {
     "end_to_end_time": f"{E2Es}",
     "prefilling_time": f"{PREs}",
     "decoding_throughput": f"{TS}",
-    "mfu": f"{MFU}",
-    "mbu": f"{MBU}"
 }
 gpu_metrics_to_name_map = {
@@ -37,7 +33,7 @@ gpu_metrics_to_name_map = {
     GPU_Mem: GPU_Mem,
     "batch_size": BATCH_SIZE,
     "precision": PRECISION,
-    GPU_Name: GPU_Name
 }
 @dataclass
@@ -77,11 +73,8 @@ class Tasks(Enum):
     # halueval_dial = Task("halueval_dialogue", "acc", "HaluDial/Acc")
     # # XXX include me back at some point
-    # selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
     mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)
-    gsm8k = Task("gsm8k_custom", "em", "GSM8K") #GSM8K/EM (5-shot)
-    # gsm8k_cot = Task("gsm8k_cot", "em", "GSM8K COT") #GSM8K COT/EM (5-shot)
-    arena_hard = Task("arena_hard", "score", "Arena Hard") #Arena Hard/Score
 # These classes are for user facing column names,
@@ -106,7 +99,7 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 # # auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
 # Inference framework
-auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnContent(f"{InFrame}", "str", True, dummy=True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
@@ -114,27 +107,25 @@ for task in Tasks:
     auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} {E2Es}", "number", True, hidden=True)])
     auto_eval_column_dict.append([f"{task.name}_batch_size", ColumnContent, ColumnContent(f"{task.value.col_name} {BATCH_SIZE}", "number", True, hidden=True)])
     # auto_eval_column_dict.append([f"{task.name}_precision", ColumnContent, ColumnContent(f"{task.value.col_name} {PRECISION}", "str", True, hidden=True)])
-    # auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True, hidden=True)])
     auto_eval_column_dict.append([f"{task.name}_gpu", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Name}", "str", True, hidden=True)])
-    # auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True, hidden=True)])
     if task.value.benchmark in MULTIPLE_CHOICEs:
         continue
     # auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} {PREs}", "number", False, hidden=True)])
     auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name} {TS}", "number", True, hidden=True)])
-    auto_eval_column_dict.append([f"{task.name}_mbu", ColumnContent, ColumnContent(f"{task.value.col_name} {MBU}", "number", True, hidden=True)])
-    auto_eval_column_dict.append([f"{task.name}_mfu", ColumnContent, ColumnContent(f"{task.value.col_name} {MFU}", "number", True, hidden=True)])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False, dummy=True)])
-# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True, dummy=True)])
-# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 # Dummy column for the search bar (hidden by the custom CSS)
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
@@ -160,10 +151,10 @@ class ModelDetails:
 class ModelType(Enum):
-    # PT = ModelDetails(name="pretrained", symbol="🟢")
-    # FT = ModelDetails(name="fine-tuned on domain-specific datasets", symbol="🔶")
     chat = ModelDetails(name="chat models (RLHF, DPO, IFT, ...)", symbol="💬")
-    # merges = ModelDetails(name="base merges and moerges", symbol="🤝")
     Unknown = ModelDetails(name="", symbol="?")
     def to_str(self, separator=" "):
@@ -171,24 +162,21 @@ class ModelType(Enum):
     @staticmethod
     def from_str(type):
-        # if "fine-tuned" in type or "🔶" in type:
-        #     return ModelType.FT
-        # if "pretrained" in type or "🟢" in type:
-        #     return ModelType.PT
         if any([k in type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]):
             return ModelType.chat
-        # if "merge" in type or "🤝" in type:
-        #     return ModelType.merges
         return ModelType.Unknown
 class InferenceFramework(Enum):
     # "moe-infinity", hf-chat
-    # MoE_Infinity = ModelDetails("moe-infinity")
     HF_Chat = ModelDetails("hf-chat")
-    VLLM = ModelDetails("vllm_moe")
-    TRTLLM = ModelDetails("tensorrt_llm")
-    VLLM_FIX = ModelDetails("vllm_moe_fixbs")
     Unknown = ModelDetails("?")
     def to_str(self):
@@ -196,21 +184,16 @@ class InferenceFramework(Enum):
     @staticmethod
     def from_str(inference_framework: str):
-        # if inference_framework in ["moe-infinity"]:
-        #     return InferenceFramework.MoE_Infinity
-        if inference_framework in ["tensorrt_llm"]:
-            return InferenceFramework.TRTLLM
         if inference_framework in ["hf-chat"]:
             return InferenceFramework.HF_Chat
-        if inference_framework in ["vllm_moe"]:
-            return InferenceFramework.VLLM
-        if inference_framework in ["vllm_moe_fixbs"]:
-            return InferenceFramework.VLLM_FIX
         return InferenceFramework.Unknown
 class GPUType(Enum):
-    A100_sxm = ModelDetails("NVIDIA-A100-SXM4-80GB")
     A100_pcie = ModelDetails("NVIDIA-A100-PCIe-80GB")
     Unknown = ModelDetails("?")
     def to_str(self):
@@ -218,10 +201,12 @@ class GPUType(Enum):
     @staticmethod
     def from_str(gpu_type: str):
-        if gpu_type in ["NVIDIA-A100-PCIe-80GB"]:
             return GPUType.A100_pcie
-        if gpu_type in ["NVIDIA-A100-SXM4-80GB"]:
-            return GPUType.A100_sxm
         return GPUType.Unknown
 class WeightType(Enum):
@@ -231,28 +216,28 @@ class WeightType(Enum):
 class Precision(Enum):
-    # float32 = ModelDetails("float32")
-    # float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     qt_8bit = ModelDetails("8bit")
     qt_4bit = ModelDetails("4bit")
-    # qt_GPTQ = ModelDetails("GPTQ")
     Unknown = ModelDetails("?")
     @staticmethod
     def from_str(precision: str):
-        # if precision in ["torch.float32", "float32"]:
-        #     return Precision.float32
-        # if precision in ["torch.float16", "float16"]:
-        #     return Precision.float16
         if precision in ["torch.bfloat16", "bfloat16"]:
             return Precision.bfloat16
         if precision in ["8bit"]:
             return Precision.qt_8bit
         if precision in ["4bit"]:
             return Precision.qt_4bit
-        # if precision in ["GPTQ", "None"]:
-        #     return Precision.qt_GPTQ
         return Precision.Unknown

 GPU_Mem = 'Mem(G)'
 GPU_Name = "GPU"
 GPU_Util = 'Util(%)'
 BATCH_SIZE = 'bs'
 PRECISION = "Precision"
 system_metrics_to_name_map = {
     "end_to_end_time": f"{E2Es}",
     "prefilling_time": f"{PREs}",
     "decoding_throughput": f"{TS}",
 }
 gpu_metrics_to_name_map = {
     GPU_Mem: GPU_Mem,
     "batch_size": BATCH_SIZE,
     "precision": PRECISION,
+    GPU_Name: GPU_Name,
 }
 @dataclass
     # halueval_dial = Task("halueval_dialogue", "acc", "HaluDial/Acc")
     # # XXX include me back at some point
+    selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
     mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)
 # These classes are for user facing column names,
 # # auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
 # Inference framework
+auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnContent(f"{InFrame}", "str", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
     auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} {E2Es}", "number", True, hidden=True)])
     auto_eval_column_dict.append([f"{task.name}_batch_size", ColumnContent, ColumnContent(f"{task.value.col_name} {BATCH_SIZE}", "number", True, hidden=True)])
     # auto_eval_column_dict.append([f"{task.name}_precision", ColumnContent, ColumnContent(f"{task.value.col_name} {PRECISION}", "str", True, hidden=True)])
+    auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True, hidden=True)])
     auto_eval_column_dict.append([f"{task.name}_gpu", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Name}", "str", True, hidden=True)])
+    auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True, hidden=True)])
     if task.value.benchmark in MULTIPLE_CHOICEs:
         continue
     # auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} {PREs}", "number", False, hidden=True)])
     auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name} {TS}", "number", True, hidden=True)])
 # Model information
+auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True)])
+auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 # Dummy column for the search bar (hidden by the custom CSS)
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
 class ModelType(Enum):
+    PT = ModelDetails(name="pretrained", symbol="🟢")
+    FT = ModelDetails(name="fine-tuned on domain-specific datasets", symbol="🔶")
     chat = ModelDetails(name="chat models (RLHF, DPO, IFT, ...)", symbol="💬")
+    merges = ModelDetails(name="base merges and moerges", symbol="🤝")
     Unknown = ModelDetails(name="", symbol="?")
     def to_str(self, separator=" "):
     @staticmethod
     def from_str(type):
+        if "fine-tuned" in type or "🔶" in type:
+            return ModelType.FT
+        if "pretrained" in type or "🟢" in type:
+            return ModelType.PT
         if any([k in type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]):
             return ModelType.chat
+        if "merge" in type or "🤝" in type:
+            return ModelType.merges
         return ModelType.Unknown
 class InferenceFramework(Enum):
     # "moe-infinity", hf-chat
+    MoE_Infinity = ModelDetails("moe-infinity")
     HF_Chat = ModelDetails("hf-chat")
     Unknown = ModelDetails("?")
     def to_str(self):
     @staticmethod
     def from_str(inference_framework: str):
+        if inference_framework in ["moe-infinity"]:
+            return InferenceFramework.MoE_Infinity
         if inference_framework in ["hf-chat"]:
             return InferenceFramework.HF_Chat
         return InferenceFramework.Unknown
 class GPUType(Enum):
+    H100_pcie = ModelDetails("NVIDIA-H100-PCIe-80GB")
     A100_pcie = ModelDetails("NVIDIA-A100-PCIe-80GB")
+    A5000 = ModelDetails("NVIDIA-RTX-A5000-24GB")
     Unknown = ModelDetails("?")
     def to_str(self):
     @staticmethod
     def from_str(gpu_type: str):
+        """Return the GPUType member whose name matches ``gpu_type``.
+
+        Falls back to ``GPUType.Unknown`` when the string is not recognised.
+        """
+        # Each branch must return the member declared with the same name
+        # string it tests; the A100/H100 pair was previously crossed.
+        if gpu_type in ["NVIDIA-A100-PCIe-80GB"]:
             return GPUType.A100_pcie
+        if gpu_type in ["NVIDIA-H100-PCIe-80GB"]:
+            return GPUType.H100_pcie
+        # "NVIDIA-RTX-A5000-24GB" is the name declared on GPUType.A5000; the
+        # shorter spelling is kept so input accepted before still resolves.
+        if gpu_type in ["NVIDIA-RTX-A5000-24GB", "NVIDIA-A5000-24GB"]:
+            return GPUType.A5000
         return GPUType.Unknown
 class WeightType(Enum):
 class Precision(Enum):
+    float32 = ModelDetails("float32")
+    float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     qt_8bit = ModelDetails("8bit")
     qt_4bit = ModelDetails("4bit")
+    qt_GPTQ = ModelDetails("GPTQ")
     Unknown = ModelDetails("?")
     @staticmethod
     def from_str(precision: str):
+        if precision in ["torch.float32", "float32"]:
+            return Precision.float32
+        if precision in ["torch.float16", "float16"]:
+            return Precision.float16
         if precision in ["torch.bfloat16", "bfloat16"]:
             return Precision.bfloat16
         if precision in ["8bit"]:
             return Precision.qt_8bit
         if precision in ["4bit"]:
             return Precision.qt_4bit
+        if precision in ["GPTQ", "None"]:
+            return Precision.qt_GPTQ
         return Precision.Unknown

src/leaderboard/read_evals.py CHANGED Viewed

@@ -65,11 +65,11 @@ class EvalResult:
         if len(org_and_model) == 1:
             org = None
             model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}_{inference_framework}"
         else:
             org = org_and_model[0]
             model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}_{inference_framework}"
         full_model = "/".join(org_and_model)
         still_on_hub, error, model_config = is_model_on_hub(
@@ -120,15 +120,11 @@ class EvalResult:
                         multiplier = 1.0
                     if "batch_" in metric or "Mem" in metric or "Util" in metric:
                         multiplier = 1
                     # print('RESULTS', data['results'])
                     # print('XXX', benchmark, metric, value, multiplier)
-                    if value == "N/A":
-                        results[benchmark][metric] = "-"
-                    elif value == "auto":
-                        results[benchmark][metric] = "auto"
-                    else:
-                        results[benchmark][metric] = value * multiplier
         res = EvalResult(
             eval_name=result_key,
@@ -140,7 +136,6 @@ class EvalResult:
             revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture,
-            model_type=ModelType.from_str(config.get("model_type", "")),
             inference_framework=inference_framework,
         )
@@ -175,22 +170,22 @@ class EvalResult:
         # breakpoint()
         # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
-            # AutoEvalColumn.model_type.name: self.model_type.value.name,
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            # AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.dummy.name: self.full_model,
-            # AutoEvalColumn.revision.name: self.revision,
-            # # AutoEvalColumn.average.name: average,
-            # AutoEvalColumn.license.name: self.license,
-            # AutoEvalColumn.likes.name: self.likes,
-            # AutoEvalColumn.params.name: self.num_params,
-            # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
             AutoEvalColumn.inference_framework.name: self.inference_framework,
         }
@@ -278,22 +273,15 @@ def get_raw_eval_results(results_path: str, requests_path: str, is_backend: bool
     eval_results = {}
     for model_result_filepath in tqdm(model_result_filepaths, desc="reading model_result_filepaths"):
-        try:
-            # Creation of result
-            eval_result = EvalResult.init_from_json_file(model_result_filepath, is_backend=is_backend)
-            eval_result.update_with_request_file(requests_path)
-            # Store results of same eval together
-            eval_name = eval_result.eval_name
-            if eval_name in eval_results.keys():
-                eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-            else:
-                eval_results[eval_name] = eval_result
-        except (FileNotFoundError, ValueError, KeyError, json.JSONDecodeError) as e:
-            # Log the error and continue with the next file
-            print(f"Error processing file {model_result_filepath}: {e}")
-            continue
     results = []
     for v in eval_results.values():

         if len(org_and_model) == 1:
             org = None
             model = org_and_model[0]
+            result_key = f"{model}_{precision.value.name}"
         else:
             org = org_and_model[0]
             model = org_and_model[1]
+            result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)
         still_on_hub, error, model_config = is_model_on_hub(
                         multiplier = 1.0
                     if "batch_" in metric or "Mem" in metric or "Util" in metric:
                         multiplier = 1
                     # print('RESULTS', data['results'])
                     # print('XXX', benchmark, metric, value, multiplier)
+                    results[benchmark][metric] = value * multiplier
         res = EvalResult(
             eval_name=result_key,
             revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture,
             inference_framework=inference_framework,
         )
         # breakpoint()
         # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
+            AutoEvalColumn.model_type.name: self.model_type.value.name,
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
+            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.dummy.name: self.full_model,
+            AutoEvalColumn.revision.name: self.revision,
+            # AutoEvalColumn.average.name: average,
+            AutoEvalColumn.license.name: self.license,
+            AutoEvalColumn.likes.name: self.likes,
+            AutoEvalColumn.params.name: self.num_params,
+            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
             AutoEvalColumn.inference_framework.name: self.inference_framework,
         }
     eval_results = {}
     for model_result_filepath in tqdm(model_result_filepaths, desc="reading model_result_filepaths"):
+        # Creation of result
+        eval_result = EvalResult.init_from_json_file(model_result_filepath, is_backend=is_backend)
+        eval_result.update_with_request_file(requests_path)
+        # Store results of same eval together
+        eval_name = eval_result.eval_name
+        if eval_name in eval_results.keys():
+            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+        else:
+            eval_results[eval_name] = eval_result
     results = []
     for v in eval_results.values():

src/submission/check_validity.py CHANGED Viewed

@@ -74,7 +74,7 @@ def is_model_on_hub(
 def get_model_size(model_info: ModelInfo, precision: str):
-    size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
     try:
         model_size = round(model_info.safetensors["total"] / 1e9, 3)
     except (AttributeError, TypeError):
@@ -130,8 +130,7 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
                     continue
                 with open(os.path.join(root, file), "r") as f:
                     info = json.load(f)
-                    if not info["status"] == "FINISHED" and not info["status"] == "RUNNING":
-                        file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}_{info['gpu_type']}")
                     # Select organisation
                     if info["model"].count("/") == 0 or "submitted_time" not in info:

 def get_model_size(model_info: ModelInfo, precision: str):
+    size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
     try:
         model_size = round(model_info.safetensors["total"] / 1e9, 3)
     except (AttributeError, TypeError):
                     continue
                 with open(os.path.join(root, file), "r") as f:
                     info = json.load(f)
+                    file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}_{info['gpu_type']}")
                     # Select organisation
                     if info["model"].count("/") == 0 or "submitted_time" not in info:

src/utils.py CHANGED Viewed

@@ -3,54 +3,12 @@ from huggingface_hub import snapshot_download
 import subprocess
 import re
 import os
-import GPUtil
 try:
     from src.display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
 except:
     print("local debug: from display.utils")
     from display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
-MEM_BW_DICT ={
-    "NVIDIA-A100-PCIe-80GB": 1935,
-    "NVIDIA-A100-SXM-80GB": 2039,
-    "NVIDIA-H100-PCIe-80GB": 2039,
-    "NVIDIA-RTX-A5000-24GB": 768
-}
-PEAK_FLOPS_DICT = {
-    "float32":{
-        "NVIDIA-A100-PCIe-80GB": 312e12,
-        "NVIDIA-A100-SXM-80GB": 312e12,
-        "NVIDIA-H100-PCIe-80GB": 756e12,
-        "NVIDIA-RTX-A5000-24GB": 222.2e12
-    },
-    "float16":{
-        "NVIDIA-A100-PCIe-80GB": 624e12,
-        "NVIDIA-A100-SXM-80GB": 624e12,
-        "NVIDIA-H100-PCIe-80GB": 1513e12,
-        "NVIDIA-RTX-A5000-24GB": 444.4e12
-    },
-    "bfloat16":{
-        "NVIDIA-A100-PCIe-80GB": 624e12,
-        "NVIDIA-A100-SXM-80GB": 624e12,
-        "NVIDIA-H100-PCIe-80GB": 1513e12,
-        "NVIDIA-RTX-A5000-24GB": 444.4e12
-    },
-    "8bit":{
-        "NVIDIA-A100-PCIe-80GB": 1248e12,
-        "NVIDIA-A100-SXM-80GB": 1248e12,
-        "NVIDIA-H100-PCIe-80GB": 3026e12,
-        "NVIDIA-RTX-A5000-24GB": 889e12
-    },
-    "4bit": {
-        "NVIDIA-A100-PCIe-80GB": 2496e12,
-        "NVIDIA-A100-SXM-80GB": 2496e12,
-        "NVIDIA-H100-PCIe-80GB": 6052e12,
-        "NVIDIA-RTX-A5000-24GB": 1778e12
-    }
-}
 def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
     for i in range(10):
@@ -94,12 +52,11 @@ def parse_nvidia_smi():
             print("Failed to query GPU indices.")
             return []
         gpu_indices = result.stdout.strip().split('\n')
-    # print(f"gpu_indices: {gpu_indices}")
     gpu_stats = []
     gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
-    # gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')
-    gpu_name_pattern = re.compile(r'NVIDIA\s+(RTX\s+)?([A-Z0-9]+)')
     gpu_name = ""
     for index in gpu_indices:
@@ -111,7 +68,7 @@ def parse_nvidia_smi():
             name_match = gpu_name_pattern.search(line)
             gpu_info = {}
             if name_match:
-                gpu_name = ''.join(filter(None, name_match.groups())).strip()
             if match:
                 temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
                 gpu_info.update({
@@ -123,7 +80,7 @@ def parse_nvidia_smi():
             if len(gpu_info) >= 4:
                 gpu_stats.append(gpu_info)
-    # print(f"gpu_stats: {gpu_stats}")
     gpu_name = f"{len(gpu_stats)}x{gpu_name}"
     gpu_stats_total = {
                         GPU_TEMP: 0,
@@ -174,38 +131,5 @@ def analyze_gpu_stats(stats_list):
     return avg_stats
-def get_gpu_details():
-    gpus = GPUtil.getGPUs()
-    gpu = gpus[0]
-    name = gpu.name.replace(" ", "-")
-    memory_gb = round(gpu.memoryTotal / 1024)
-    memory = f"{memory_gb}GB"
-    for part in name.split('-'):
-        if part.endswith("GB") and part[:-2].isdigit():
-            name = name.replace(f"-{part}", "").replace(part, "")
-    formatted_name = f"{name}-{memory}"
-    return formatted_name
-def get_peak_bw(gpu_name):
-    return MEM_BW_DICT[gpu_name]
-def get_peak_flops(gpu_name, precision):
-    return PEAK_FLOPS_DICT[precision][gpu_name]
-def transfer_precision2bytes(precision):
-    if precision == "float32":
-        return 4
-    elif precision in ["float16", "bfloat16"]:
-        return 2
-    elif precision == "8bit":
-        return 1
-    elif precision == "4bit":
-        return 0.5
-    else:
-        raise ValueError(f"Unsupported precision: {precision}")
 if __name__ == "__main__":
     print(analyze_gpu_stats(parse_nvidia_smi()))

 import subprocess
 import re
 import os
 try:
     from src.display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
 except:
     print("local debug: from display.utils")
     from display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
 def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
     for i in range(10):
             print("Failed to query GPU indices.")
             return []
         gpu_indices = result.stdout.strip().split('\n')
+    print(f"gpu_indices: {gpu_indices}")
     gpu_stats = []
     gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
+    gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+?\d+GB)')
     gpu_name = ""
     for index in gpu_indices:
             name_match = gpu_name_pattern.search(line)
             gpu_info = {}
             if name_match:
+                gpu_name = name_match.group(1).strip()
             if match:
                 temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
                 gpu_info.update({
             if len(gpu_info) >= 4:
                 gpu_stats.append(gpu_info)
+    print(f"gpu_stats: {gpu_stats}")
     gpu_name = f"{len(gpu_stats)}x{gpu_name}"
     gpu_stats_total = {
                         GPU_TEMP: 0,
     return avg_stats
 if __name__ == "__main__":
     print(analyze_gpu_stats(parse_nvidia_smi()))