More cleanup.
Files changed:
- README.md +3 -3
- src/about.py +1 -6
- src/datamodel/data.py +16 -11
- src/display/__init__.py +0 -0
- src/display/utils.py +11 -58
- src/logger.py +4 -1
- src/submission/check_validity.py +0 -115
- src/submission/submit.py +2 -1
README.md
CHANGED
@@ -41,9 +41,9 @@ If you encounter problem on the space, don't hesitate to restart it to remove th
 # Code logic for more complex edits
 
 You'll find
-
-
-
+- The main table's column names and properties in `src/display/utils.py`
+- The logic to read all results and request files, then convert them into dataframe lines, in `src/populate.py`
+- The logic to allow or filter submissions in `src/submission/submit.py`.
 
 
 # Setting up the environment
src/about.py
CHANGED
@@ -9,15 +9,11 @@ class Task:
     col_name: str
 
 
-# Select your tasks here
-# ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("FormulaOne", "success_rate", "Success Rate (%)")
 
 
-NUM_FEWSHOT = 0
-# ---------------------------------------------------
+NUM_FEWSHOT = 0
 
 TITLE = """
 <h1 id="space-title" style="
@@ -34,7 +30,6 @@ TITLE = """
 </h1>
 """
 
-# What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 Welcome to the official leaderboard for the paper:
 
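Note: the `Tasks` enum above is what the rest of the template iterates to decide which score columns to render; each member wraps a `Task` whose `col_name` becomes a leaderboard header. A minimal sketch of that consumption pattern (the `benchmark` and `metric` field names are assumptions based on the removed "task_key / metric_key" comment, not shown in this diff):

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str  # key of the task in the results json (assumed field name)
    metric: str     # key of the metric in the results json (assumed field name)
    col_name: str   # column header displayed in the leaderboard

class Tasks(Enum):
    task0 = Task("FormulaOne", "success_rate", "Success Rate (%)")

# A populate step would typically walk the enum to build the score columns:
for task in Tasks:
    print(task.name, "->", task.value.col_name)  # task0 -> Success Rate (%)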
src/datamodel/data.py
CHANGED
@@ -12,10 +12,10 @@ logger = get_logger(__name__)
 class F1Data:
     def __init__(
         self,
-        cp_ds_name: str,
-        sub_ds_name: str,
-        res_ds_name: str,
-        split: str = "hard",
+        cp_ds_name: str,  # Name of the code-problems dataset. Fixed.
+        sub_ds_name: str,  # Name of the submissions dataset. Fixed.
+        res_ds_name: str,  # Name of the results repository. Fixed.
+        split: str = "hard",  # Split is either 'hard' or 'easy'.
     ):
         self.cp_dataset_name = cp_ds_name
         self.submissions_dataset_name = sub_ds_name
@@ -27,14 +27,14 @@ class F1Data:
     def _initialize(self):
         logger.info(f"Initialize F1Data TOKEN='{TOKEN}'")
         start_time = time.monotonic()
-        cp_ds = load_dataset(
-        logger.info(
-            "Loaded code-problems dataset from %s in %f sec",
+        cp_ds = load_dataset(
             self.cp_dataset_name,
-
+            split=self.split,
+            token=TOKEN,
         )
-
-
+        logger.info(f"Loaded code-problems dataset from {self.cp_dataset_name} in {time.monotonic() - start_time} sec")
+        self.code_problems = {r["id"]: r["code_problem"] for r in cp_ds}  # id string -> code problem.
+        logger.info(f"Loaded {len(self.code_problems)} code problems")
 
     @functools.cached_property
     def code_problem_ids(self) -> set[str]:
@@ -43,6 +43,11 @@ class F1Data:
 
 if __name__ == "__main__":
     split = "hard"
-    f1_data = F1Data(
+    f1_data = F1Data(
+        cp_ds_name=CODE_PROBLEMS_REPO,
+        sub_ds_name=SUBMISSIONS_REPO,
+        res_ds_name=RESULTS_REPO,
+        split=split,
+    )
 
     print(f"Found {len(f1_data.code_problem_ids)} code problems in {split} split of {f1_data.cp_dataset_name}")
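Note: the hunk shows the `code_problem_ids` signature but not its body; since `_initialize` builds `self.code_problems` as an id-to-problem dict, the cached property presumably just exposes the key set. A self-contained sketch of the `functools.cached_property` behavior being relied on (the `Demo` class is hypothetical):

import functools

class Demo:
    def __init__(self):
        # Stand-in for the dict built from the loaded dataset.
        self.code_problems = {"p1": "...", "p2": "..."}

    @functools.cached_property
    def code_problem_ids(self) -> set[str]:
        print("computing")  # executes only on first access
        return set(self.code_problems)

d = Demo()
print(d.code_problem_ids)  # prints "computing", then the id set
print(d.code_problem_ids)  # served from the instance cache; no recompute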
src/display/__init__.py
ADDED
Empty file (marks src/display as a regular Python package).
src/display/utils.py
CHANGED
@@ -1,19 +1,15 @@
-from dataclasses import dataclass
-from typing import ClassVar
+from dataclasses import dataclass
 from enum import Enum
 
-import pandas as pd
 
-
+def _fields(raw_class):
+    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
 
-
-
+# These classes are for user facing column names, to avoid having to change them
+# all around the code when a modification is needed.
 
 
-# These classes are for user facing column names,
-# to avoid having to change them all around the code
-# when a modif is needed
 @dataclass
 class ColumnContent:
     name: str
@@ -23,41 +19,6 @@ class ColumnContent:
     never_hidden: bool = False
 
 
-## Leaderboard columns
-# auto_eval_column_fields = []
-# # Init
-# auto_eval_column_fields.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-# auto_eval_column_fields.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-# # Scores
-# auto_eval_column_fields.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-# for task in Tasks:
-#     auto_eval_column_fields.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# # Model information
-# auto_eval_column_fields.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-# auto_eval_column_fields.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-# auto_eval_column_fields.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-# auto_eval_column_fields.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-# auto_eval_column_fields.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-# auto_eval_column_fields.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-# auto_eval_column_fields.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-# auto_eval_column_fields.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-# auto_eval_column_fields.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-#
-#
-#
-# def make_classvar_dataclass(name: str, spec: list):
-#     ns = {"__annotations__": {}}
-#     for field_name, field_type, default in spec:
-#         # Mark as ClassVar so dataclass doesn't treat it as an instance field
-#         ns["__annotations__"][field_name] = ClassVar[field_type]
-#         ns[field_name] = default
-#     # No instance fields; just class-level descriptors
-#     return make_dataclass(name, [], frozen=True, namespace=ns)
-#
-# # We use make dataclass to dynamically fill the scores from Tasks
-# AutoEvalColumn = make_classvar_dataclass("AutoEvalColumn", auto_eval_column_fields)
-
-
 @dataclass(frozen=True)
 class AutoEvalColumn:
     system = ColumnContent("System Name", "markdown", True, never_hidden=True)
@@ -68,18 +29,18 @@ class AutoEvalColumn:
     submitted_on = ColumnContent("Submitted On", "datetime", True)
 
 
-
+# For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
     model = ColumnContent("model", "markdown", True)
     revision = ColumnContent("revision", "str", True)
     private = ColumnContent("private", "bool", True)
     precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str",
+    weight_type = ColumnContent("weight_type", "str", True)
     status = ColumnContent("status", "str", True)
 
 
-
+# All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
@@ -90,8 +51,6 @@ class ModelDetails:
 class ModelType(Enum):
     LLM = ModelDetails(name="LLM", symbol="🟢")
     AgenticLLM = ModelDetails(name="AgenticLLM", symbol="🔶")
-    # IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    # RL = ModelDetails(name="RL-tuned", symbol="🟦")
    Other = ModelDetails(name="Other", symbol="?")
 
     def to_str(self, separator=" "):
@@ -103,10 +62,6 @@ class ModelType(Enum):
         return ModelType.AgenticLLM
     if "LLM" in type or "🟢" in type:
         return ModelType.LLM
-    # if "RL-tuned" in type or "🟦" in type:
-    #     return ModelType.RL
-    # if "instruction-tuned" in type or "⭕" in type:
-    #     return ModelType.IFT
     return ModelType.Other
 
 
@@ -130,9 +85,7 @@ class Precision(Enum):
 
 
 # Column selection
-COLS = [c.name for c in
-
-EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
-EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
+COLS = [c.name for c in _fields(AutoEvalColumn) if not c.hidden]
 
-
+EVAL_COLS = [c.name for c in _fields(EvalQueueColumn)]
+EVAL_TYPES = [c.type for c in _fields(EvalQueueColumn)]
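Note: the new `_fields` helper exists because these column classes declare their columns as plain, unannotated class attributes, which `dataclasses.fields` cannot see (it only reports annotated fields). The old `EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]` would therefore have produced an empty list, which is presumably what this change fixes. A minimal sketch of the difference, using a hypothetical `Cols` class:

from dataclasses import dataclass, fields

@dataclass(frozen=True)
class Cols:
    a = "x"  # unannotated class attribute: invisible to dataclasses.fields
    b = "y"

def _fields(raw_class):
    # Everything in the class namespace that is not a dunder.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

print(fields(Cols))   # prints () because no annotated fields were declared
print(_fields(Cols))  # ['x', 'y']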
src/logger.py
CHANGED
@@ -2,7 +2,10 @@ import logging
 import sys
 
 
-def get_logger(
+def get_logger(
+    filename: str,
+    level=logging.INFO,
+) -> logging.Logger:
     new_logger = logging.getLogger(filename)
     fmt = logging.Formatter(fmt="%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s")
     handler = logging.StreamHandler(sys.stderr)
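Note: with the signature spelled out, callers can override the level per logger; src/datamodel/data.py already obtains its logger as `logger = get_logger(__name__)`. A usage sketch within this codebase, assuming the function body applies `level` via `setLevel` (that part is not shown in this hunk):

import logging

from src.logger import get_logger

logger = get_logger(__name__)                              # defaults to INFO
debug_logger = get_logger("verbose", level=logging.DEBUG)  # per-logger override

logger.info("Loaded %d problems", 42)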
src/submission/check_validity.py
DELETED
@@ -1,115 +0,0 @@
-import json
-import os
-import re
-from collections import defaultdict
-from datetime import datetime, timedelta, timezone
-
-import huggingface_hub
-from datasets import get_dataset_config_names
-from huggingface_hub import ModelCard
-from huggingface_hub.hf_api import ModelInfo
-from transformers import AutoConfig
-from transformers.models.auto.tokenization_auto import AutoTokenizer
-
-from src.envs import SUBMISSIONS_REPO
-
-
-def check_model_card(repo_id: str) -> tuple[bool, str]:
-    """Checks if the model card and license exist and have been filled"""
-    try:
-        card = ModelCard.load(repo_id)
-    except huggingface_hub.utils.EntryNotFoundError:
-        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
-
-    # Enforce license metadata
-    if card.data.license is None:
-        if not ("license_name" in card.data and "license_link" in card.data):
-            return False, (
-                "License not found. Please add a license to your model card using the `license` metadata or a"
-                " `license_name`/`license_link` pair."
-            )
-
-    # Enforce card content
-    if len(card.text) < 200:
-        return False, "Please add a description to your model card, it is too short."
-
-    return True, ""
-
-
-def is_model_on_hub(
-    model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
-) -> tuple[bool, str]:
-    """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
-    try:
-        config = AutoConfig.from_pretrained(
-            model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
-        )
-        if test_tokenizer:
-            try:
-                AutoTokenizer.from_pretrained(
-                    model_name,
-                    revision=revision,
-                    trust_remote_code=trust_remote_code,
-                    token=token,
-                )
-            except ValueError as e:
-                return (False, f"uses a tokenizer which is not in a transformers release: {e}", None)
-            except Exception as e:
-                return (
-                    False,
-                    "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
-                    None,
-                )
-        return True, None, config
-
-    except ValueError:
-        return (
-            False,
-            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
-            None,
-        )
-
-    except Exception as e:
-        return False, "was not found on hub!", None
-
-
-def get_model_size(model_info: ModelInfo, precision: str):
-    """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
-    try:
-        model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except (AttributeError, TypeError):
-        return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
-
-    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
-    model_size = size_factor * model_size
-    return model_size
-
-
-def get_model_arch(model_info: ModelInfo):
-    """Gets the model architecture from the configuration"""
-    return model_info.config.get("architectures", "Unknown")
-
-
-def already_submitted_models(requested_models_dir: str) -> set[str]:
-    """Gather a list of already submitted models to avoid duplicates"""
-    depth = 1
-    file_names = []
-    users_to_submission_dates = defaultdict(list)
-
-    for root, _, files in os.walk(requested_models_dir):
-        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
-        if current_depth == depth:
-            for file in files:
-                if not file.endswith(".json"):
-                    continue
-                with open(os.path.join(root, file), "r") as f:
-                    info = json.load(f)
-                file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
-
-                # Select organisation
-                if info["model"].count("/") == 0 or "submitted_time" not in info:
-                    continue
-                organisation, _ = info["model"].split("/")
-                users_to_submission_dates[organisation].append(info["submitted_time"])
-
-    return set(file_names), users_to_submission_dates
src/submission/submit.py
CHANGED
@@ -90,5 +90,6 @@ def add_new_solutions(
     ds.push_to_hub(SUBMISSIONS_REPO, submission_id, private=True)
 
     return styled_message(
-        "Your request has been submitted to the evaluation queue!\
+        "Your request has been submitted to the evaluation queue!\n"
+        + "Results may take up to 24 hours to be processed and shown in the leaderboard."
     )
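Note: the removed line ended in a bare backslash continuation inside the string literal (the rest of the old message is not recoverable from this view); the replacement builds the message from two explicit literals. A quick illustration that the explicit `+` form and Python's implicit adjacent-literal concatenation yield the same string:

# Implicit concatenation: adjacent literals are merged at compile time.
msg_a = ("Your request has been submitted to the evaluation queue!\n"
         "Results may take up to 24 hours to be processed and shown in the leaderboard.")

# The commit's explicit form, with a leading "+" on the continuation line.
msg_b = ("Your request has been submitted to the evaluation queue!\n"
         + "Results may take up to 24 hours to be processed and shown in the leaderboard.")

assert msg_a == msg_b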
|