Spaces:

double-ai
/

FormulaOne-Leaderboard

Running on CPU Upgrade

App Files Files Community

tomerz-aai committed 29 days ago

Commit

c887522

1 Parent(s): a4be848

initial submit

Browse files

Files changed (7) hide show

app.py +111 -127
src/about.py +1 -1
src/datamodel/__init__.py +0 -0
src/datamodel/data.py +21 -0
src/envs.py +8 -14
src/submission/check_validity.py +3 -0
src/submission/submit.py +61 -102

app.py CHANGED Viewed

@@ -12,6 +12,8 @@ from src.about import (
     LLM_BENCHMARKS_TEXT,
     TITLE,
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
     BENCHMARK_COLS,
@@ -24,69 +26,53 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
 def restart_space():  # Ask the Hub API client to restart the Space identified by REPO_ID.
     API.restart_space(repo_id=REPO_ID)  # registered further down as a scheduler job with a 1800-second interval
-### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-def init_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
-        raise ValueError("Leaderboard DataFrame is empty or None.")
-    return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=False,
-    )
 demo = gr.Blocks(css=custom_css)
@@ -95,8 +81,8 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
@@ -106,84 +92,82 @@ with demo:
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
             with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
             with gr.Row():
                 with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
                 with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
             submit_button = gr.Button("Submit Eval")
             submission_result = gr.Markdown()
             submit_button.click(
-                add_new_eval,
                 [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
                 ],
                 submission_result,
             )
@@ -201,4 +185,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()

     LLM_BENCHMARKS_TEXT,
     TITLE,
 )
+from src.datamodel.data import F1Data
 from src.display.css_html_js import custom_css
 from src.display.utils import (
     BENCHMARK_COLS,
     WeightType,
     Precision
 )
+from src.envs import API, REPO_ID, TOKEN, CODE_PROBLEMS_REPO, SUBMISSIONS_REPO, RESULTS_REPO
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_solutions
 def restart_space():  # Ask the Hub API client to restart the Space identified by REPO_ID.
     API.restart_space(repo_id=REPO_ID)  # registered further down as a scheduler job with a 1800-second interval
+lbdb = F1Data(cp_ds_name=CODE_PROBLEMS_REPO, sub_ds_name=SUBMISSIONS_REPO, res_ds_name=RESULTS_REPO)
+# (
+#     finished_eval_queue_df,
+#     running_eval_queue_df,
+#     pending_eval_queue_df,
+# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+# def init_leaderboard(dataframe):
+#     if dataframe is None or dataframe.empty:
+#         raise ValueError("Leaderboard DataFrame is empty or None.")
+#     return Leaderboard(
+#         value=dataframe,
+#         datatype=[c.type for c in fields(AutoEvalColumn)],
+#         select_columns=SelectColumns(
+#             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+#             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+#             label="Select Columns to Display:",
+#         ),
+#         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+#         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+#         filter_columns=[
+#             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+#             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+#             ColumnFilter(
+#                 AutoEvalColumn.params.name,
+#                 type="slider",
+#                 min=0.01,
+#                 max=150,
+#                 label="Select the number of parameters (B)",
+#             ),
+#             ColumnFilter(
+#                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+#             ),
+#         ],
+#         bool_checkboxgroup_label="Hide models",
+#         interactive=False,
+#     )
 demo = gr.Blocks(css=custom_css)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        # with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+        #     leaderboard = init_leaderboard(LEADERBOARD_DF)
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+                # with gr.Column():
+                #     with gr.Accordion(
+                #         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                #         open=False,
+                #     ):
+                #         with gr.Row():
+                #             finished_eval_table = gr.components.Dataframe(
+                #                 value=finished_eval_queue_df,
+                #                 headers=EVAL_COLS,
+                #                 datatype=EVAL_TYPES,
+                #                 row_count=5,
+                #             )
+                #     with gr.Accordion(
+                #         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                #         open=False,
+                #     ):
+                #         with gr.Row():
+                #             running_eval_table = gr.components.Dataframe(
+                #                 value=running_eval_queue_df,
+                #                 headers=EVAL_COLS,
+                #                 datatype=EVAL_TYPES,
+                #                 row_count=5,
+                #             )
+                #     with gr.Accordion(
+                #         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                #         open=False,
+                #     ):
+                #         with gr.Row():
+                #             pending_eval_table = gr.components.Dataframe(
+                #                 value=pending_eval_queue_df,
+                #                 headers=EVAL_COLS,
+                #                 datatype=EVAL_TYPES,
+                #                 row_count=5,
+                #             )
             with gr.Row():
+                gr.Markdown("# ✉️✨ Submit your sulutions here!", elem_classes="markdown-text")
             with gr.Row():
                 with gr.Column():
+                    submitter_textbox = gr.Textbox(label="Submitter")
+                    # revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                    # model_type = gr.Dropdown(
+                    #     choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                    #     label="Model type",
+                    #     multiselect=False,
+                    #     value=None,
+                    #     interactive=True,
+                    # )
                 with gr.Column():
+                    submission_file = gr.File(label="JSONL solutions file", file_types=[".jsonl"])
+                    # precision = gr.Dropdown(
+                    #     choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                    #     label="Precision",
+                    #     multiselect=False,
+                    #     value="float16",
+                    #     interactive=True,
+                    # )
+                    # weight_type = gr.Dropdown(
+                    #     choices=[i.value.name for i in WeightType],
+                    #     label="Weights type",
+                    #     multiselect=False,
+                    #     value="Original",
+                    #     interactive=True,
+                    # )
+                    # base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
             submit_button = gr.Button("Submit Eval")
             submission_result = gr.Markdown()
             submit_button.click(
+                add_new_solutions,
                 [
+                    lbdb,
+                    submitter_textbox,
+                    submission_file,
                 ],
                 submission_result,
             )
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
+demo.queue(default_concurrency_limit=40).launch()

src/about.py CHANGED Viewed

@@ -21,7 +21,7 @@ NUM_FEWSHOT = 0 # Change with your few shot
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """

 # Your leaderboard name
+TITLE = """<h1 align="center" id="space-title">AAI FormulaOne Leaderboard</h1>"""
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """

src/datamodel/__init__.py ADDED Viewed

File without changes

src/datamodel/data.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import functools
+from datasets import load_dataset
+class F1Data:  # Keeps the names of the code-problems, submissions and results datasets, plus the loaded problems.
+    def __init__(self, cp_ds_name: str, sub_ds_name: str, res_ds_name: str):  # Store the three dataset names, then load the problems.
+        self.cp_dataset_name = cp_ds_name
+        self.submissions_dataset_name = sub_ds_name
+        self.results_dataset_name = res_ds_name
+        self.initialize()  # calls load_dataset, so constructing an instance triggers the dataset load
+    @functools.cached_property  # evaluated on first access, then stored on the instance
+    def code_problem_formulas(self) -> set[str]:  # Set of the keys (formula names) of self.code_problems.
+        return set(self.code_problems.keys())  # NOTE(review): the cached set is not refreshed if initialize() is called again
+    def initialize(self):  # Build self.code_problems from the "hard" split of the code-problems dataset.
+        cp_ds = load_dataset(self.cp_dataset_name, split="hard")
+        self.code_problems: dict[str, str] = {r["formula_name"]: r["code_problem"]["problem_description"] for r in cp_ds}  # maps formula_name -> problem_description
+    def add_submission(self, submitter: str, submission_path: str):  # Placeholder: currently does nothing.
+        pass

src/envs.py CHANGED Viewed

@@ -2,24 +2,18 @@ import os
 from huggingface_hub import HfApi
-# Info to change for your repository
-# ----------------------------------
-TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
-# ----------------------------------
-REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
-# Local caches
-EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 API = HfApi(token=TOKEN)

 from huggingface_hub import HfApi
+TOKEN = os.environ.get("HF_TOKEN")
+OWNER = "double-ai"
+REPO_ID = f"{OWNER}/FormulaOne-Leaderboard"
+# Datasets
+CODE_PROBLEMS_REPO = f"{OWNER}/dev-f1-dataset"
+SUBMISSIONS_REPO = f"{OWNER}/dev-f1-leaderboard-submissions"
+RESULTS_REPO = f"{OWNER}/dev-f1-leaderboard-results"
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
 API = HfApi(token=TOKEN)

src/submission/check_validity.py CHANGED Viewed

@@ -4,12 +4,15 @@ import re
 from collections import defaultdict
 from datetime import datetime, timedelta, timezone
 import huggingface_hub
 from huggingface_hub import ModelCard
 from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 def check_model_card(repo_id: str) -> tuple[bool, str]:
     """Checks if the model card and license exist and have been filled"""
     try:

 from collections import defaultdict
 from datetime import datetime, timedelta, timezone
+from datasets import get_dataset_config_names
 import huggingface_hub
 from huggingface_hub import ModelCard
 from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
+from src.envs import SUBMISSIONS_REPO
 def check_model_card(repo_id: str) -> tuple[bool, str]:
     """Checks if the model card and license exist and have been filled"""
     try:

src/submission/submit.py CHANGED Viewed

@@ -1,118 +1,77 @@
 import json
 import os
 from datetime import datetime, timezone
-from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
-from src.submission.check_validity import (
-    already_submitted_models,
-    check_model_card,
-    get_model_size,
-    is_model_on_hub,
-)
-REQUESTED_MODELS = None
-USERS_TO_SUBMISSION_DATES = None
-def add_new_eval(
-    model: str,
-    base_model: str,
-    revision: str,
-    precision: str,
-    weight_type: str,
-    model_type: str,
 ):
-    global REQUESTED_MODELS
-    global USERS_TO_SUBMISSION_DATES
-    if not REQUESTED_MODELS:
-        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
-    user_name = ""
-    model_path = model
-    if "/" in model:
-        user_name = model.split("/")[0]
-        model_path = model.split("/")[1]
-    precision = precision.split(" ")[0]
-    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-    if model_type is None or model_type == "":
-        return styled_error("Please select a model type.")
-    # Does the model actually exist?
-    if revision == "":
-        revision = "main"
-    # Is the model on the hub?
-    if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not base_model_on_hub:
-            return styled_error(f'Base model "{base_model}" {error}')
-    if not weight_type == "Adapter":
-        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not model_on_hub:
-            return styled_error(f'Model "{model}" {error}')
-    # Is the model info correctly filled?
     try:
-        model_info = API.model_info(repo_id=model, revision=revision)
-    except Exception:
-        return styled_error("Could not get your model information. Please fill it up properly.")
-    model_size = get_model_size(model_info=model_info, precision=precision)
-    # Were the model card and license filled?
-    try:
-        license = model_info.cardData["license"]
-    except Exception:
-        return styled_error("Please select a license for your model")
-    modelcard_OK, error_msg = check_model_card(model)
-    if not modelcard_OK:
-        return styled_error(error_msg)
     # Seems good, creating the eval
-    print("Adding new eval")
-    eval_entry = {
-        "model": model,
-        "base_model": base_model,
-        "revision": revision,
-        "precision": precision,
-        "weight_type": weight_type,
-        "status": "PENDING",
-        "submitted_time": current_time,
-        "model_type": model_type,
-        "likes": model_info.likes,
-        "params": model_size,
-        "license": license,
-        "private": False,
-    }
-    # Check for duplicate submission
-    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
-        return styled_warning("This model has been already submitted.")
-    print("Creating eval file")
-    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-    os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
-    with open(out_path, "w") as f:
-        f.write(json.dumps(eval_entry))
-    print("Uploading eval file")
-    API.upload_file(
-        path_or_fileobj=out_path,
-        path_in_repo=out_path.split("eval-queue/")[1],
-        repo_id=QUEUE_REPO,
-        repo_type="dataset",
-        commit_message=f"Add {model} to eval queue",
-    )
-    # Remove the local file
-    os.remove(out_path)
     return styled_message(
         "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."

 import json
 import os
 from datetime import datetime, timezone
+import time
+import pandas as pd
+from src.datamodel.data import F1Data
+from src.display.formatting import styled_error, styled_message, styled_warning
+from src.envs import API, SUBMISSIONS_REPO, TOKEN
+# from src.submission.check_validity import (
+#     already_submitted_models,
+#     check_model_card,
+#     get_model_size,
+#     is_model_on_hub,
+# )
+def add_new_solutions(
+    lbdb: F1Data,
+    submitter: str,
+    submission_path: str,
 ):
+    if not submitter:
+        return styled_error("Please fill submitter name")
+    if not submission_path:
+        return styled_error("Please upload JSONL solutions file")
     try:
+        ds = pd.read_json(submission_path, lines=True)
+    except Exception as e:
+        return styled_error(f"Cannot read uploaded JSONL file: {str(e)}")
+    submitted_formulas = set(ds["formula_name"])
+    if submitted_formulas != lbdb.code_problem_formulas:
+        missing = lbdb.code_problem_formulas - submitted_formulas
+        unknown = submitted_formulas - lbdb.code_problem_formulas
+        return styled_error(f"Mismatched formula names: missing {len(missing)} unknown {len(unknown)}")
+    if len(ds) > len(lbdb.code_problem_formulas):
+        return styled_error("Duplicate formula solutions exist in uploaded file")
+    submission_id = datetime.now().strftime("%Y%m%d%H%M%S")
     # Seems good, creating the eval
+    print(f"Adding new submission {submission_id} from {submitter}")
+    submission_ts = time.time_ns()
+    def add_info(row):
+        row["submitter"] = submitter
+        row["submission_id"] = submission_id
+        row["submission_ts"] = submission_ts
+    ds = ds.map(add_info)
+    ds.push_to_hub(SUBMISSIONS_REPO, submission_id, private=True)
+    # print("Creating eval file")
+    # OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
+    # os.makedirs(OUT_DIR, exist_ok=True)
+    # out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
+    # with open(out_path, "w") as f:
+    #     f.write(json.dumps(eval_entry))
+    # print("Uploading eval file")
+    # API.upload_file(
+    #     path_or_fileobj=out_path,
+    #     path_in_repo=out_path.split("eval-queue/")[1],
+    #     repo_id=QUEUE_REPO,
+    #     repo_type="dataset",
+    #     commit_message=f"Add {model} to eval queue",
+    # )
+    # # Remove the local file
+    # os.remove(out_path)
     return styled_message(
         "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."