Alvinn-aai committed
Commit 80fb2c0 · Parent(s): 8cfcd49

switch formula->problem_id
Files changed (2)
  1. src/datamodel/data.py +5 -5
  2. src/submission/submit.py +15 -13
src/datamodel/data.py CHANGED
@@ -19,7 +19,7 @@ class F1Data:
         self._initialize()
 
     def _initialize(self):
-        logger.info("Initialize F1Data TOMEN='%s'", TOKEN)
+        logger.info("Initialize F1Data TOKEN='%s'", TOKEN)
         start_time = time.monotonic()
         cp_ds = load_dataset(self.cp_dataset_name, split=self.split, token=TOKEN)
         logger.info(
@@ -27,11 +27,11 @@ class F1Data:
             self.cp_dataset_name,
             time.monotonic() - start_time,
         )
-        self.code_problems: dict[str, str] = {r["id"]: r["code_problem"]["problem_description"] for r in cp_ds}
-        logger.info("Code problems info: %s", self.code_problems)
+        self.code_problems: dict[str, str] = {r["id"]: r["code_problem"] for r in cp_ds}
+        logger.info(f"Loaded %d code problems {len(self.code_problems)}")
 
     @functools.cached_property
-    def code_problem_formulas(self) -> set[str]:
+    def code_problem_ids(self) -> set[str]:
         return set(self.code_problems.keys())
 
 
@@ -39,4 +39,4 @@ if __name__ == "__main__":
     split = "hard"
     f1_data = F1Data(cp_ds_name=CODE_PROBLEMS_REPO, sub_ds_name=SUBMISSIONS_REPO, res_ds_name=RESULTS_REPO, split=split)
 
-    print(f"Found {len(f1_data.code_problem_formulas)} code problems in {split} split of {f1_data.cp_dataset_name}")
+    print(f"Found {len(f1_data.code_problem_ids)} code problems in {split} split of {f1_data.cp_dataset_name}")
src/submission/submit.py CHANGED
@@ -11,6 +11,7 @@ from src.display.formatting import styled_error, styled_message, styled_warning
 from src.display.utils import ModelType
 from src.envs import API, SUBMISSIONS_REPO, TOKEN
 from src.logger import get_logger
+
 # from src.submission.check_validity import (
 # already_submitted_models,
 # check_model_card,
@@ -20,27 +21,29 @@ from src.logger import get_logger
 
 logger = get_logger(__name__)
 
+
 def validate_submission(lbdb: F1Data, pd_ds: pd.DataFrame) -> str | None:
     logger.info("Validating DS size %d columns %s set %s", len(pd_ds), pd_ds.columns, set(pd_ds.columns))
-    expected_cols = ["formula_name", "solution"]
+    expected_cols = ["problem_id", "solution"]
     if set(pd_ds.columns) != set(expected_cols):
         return f"Expected attributes: {expected_cols}, Got: {pd_ds.columns.tolist()}"
-    if any(type(v) != str for v in pd_ds["formula_name"]):
-        return "Not all formula_name values are of type str"
+    if any(type(v) != str for v in pd_ds["problem_id"]):
+        return "problem_id must be of type str"
     if any(type(v) != str for v in pd_ds["solution"]):
-        return "Not all solution values are of type str"
-    submitted_formulas = set(pd_ds["formula_name"])
-    if submitted_formulas != lbdb.code_problem_formulas:
-        missing = lbdb.code_problem_formulas - submitted_formulas
-        unknown = submitted_formulas - lbdb.code_problem_formulas
-        return f"Mismatched formula names: {len(missing)} missing, {len(unknown)} unknown"
-    if len(pd_ds) > len(lbdb.code_problem_formulas):
-        return "Duplicate formula solutions exist in uploaded file"
+        return "solution must be of type str"
+    submitted_ids = set(pd_ds["problem_id"])
+    if submitted_ids != lbdb.code_problem_ids:
+        missing = lbdb.code_problem_ids - submitted_ids
+        unknown = submitted_ids - lbdb.code_problem_ids
+        return f"Mismatched problem IDs: {len(missing)} missing, {len(unknown)} unknown"
+    if len(pd_ds) > len(lbdb.code_problem_ids):
+        return "Duplicate problem IDs exist in uploaded file"
     return None
 
+
 def add_new_solutions(
     lbdb: F1Data,
-    system_name : str,
+    system_name: str,
     org: str,
     sys_type: str,
     submission_path: str,
@@ -68,7 +71,6 @@ def add_new_solutions(
     if validation_error:
         return styled_error(validation_error)
 
-
     submission_id = f"{system_name}_{org}_{sys_type}_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}"
 
     # Seems good, creating the eval
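For illustration, a hedged sketch of a submission the updated validate_submission would accept: exactly the columns problem_id and solution, string values in both, one row per problem, and an ID set equal to lbdb.code_problem_ids. Only the column names and checks come from the diff above; the import paths and placeholder solution text are assumptions.

import pandas as pd

# Assumed import paths based on the repository layout shown in this commit.
from src.datamodel.data import F1Data
from src.envs import CODE_PROBLEMS_REPO, RESULTS_REPO, SUBMISSIONS_REPO
from src.submission.submit import validate_submission

# An F1Data instance supplies the reference ID set (lbdb.code_problem_ids).
lbdb = F1Data(cp_ds_name=CODE_PROBLEMS_REPO, sub_ds_name=SUBMISSIONS_REPO, res_ds_name=RESULTS_REPO, split="hard")

# Placeholder solutions: a real upload would carry candidate code per problem.
submission = pd.DataFrame(
    {
        "problem_id": sorted(lbdb.code_problem_ids),
        "solution": ["# candidate solution here"] * len(lbdb.code_problem_ids),
    }
)

error = validate_submission(lbdb, submission)
print(error)  # None when every check passes, otherwise an error string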