Alvinn-aai committed
Commit 8cfcd49 · 1 Parent(s): 61885ca

data upload script, support both splits
app.py CHANGED
@@ -158,10 +158,10 @@ with demo:
        org_textbox = gr.Textbox(label=AutoEvalColumn.organization.name)
        # revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
        sys_type_dropdown = gr.Dropdown(
-           choices=[t.to_str(" : ") for t in ModelType],
+           choices=[t.to_str(" ") for t in ModelType],
            label=AutoEvalColumn.system_type.name,
            multiselect=False,
-           value=ModelType.LLM.to_str(" : "),
+           value=ModelType.LLM.to_str(" "),
            interactive=True,
        )

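The app.py change only swaps the separator string passed to ModelType.to_str, so the system-type choices and the default value render without the " : " delimiter. A minimal sketch of what such an enum and helper might look like, purely for illustration (the real ModelType is defined elsewhere in this repo and may differ):

from enum import Enum


class ModelType(Enum):
    # Hypothetical members; the actual enum in the repo may list different types.
    LLM = ("LLM", "🟢")
    AGENT = ("Agent", "🔶")

    def to_str(self, separator: str = " ") -> str:
        name, symbol = self.value
        return f"{symbol}{separator}{name}"


print(ModelType.LLM.to_str(" : "))  # e.g. "🟢 : LLM" (old dropdown label)
print(ModelType.LLM.to_str(" "))    # e.g. "🟢 LLM"   (new dropdown label)
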
scripts/upload_f1_dataset.py ADDED
@@ -0,0 +1,46 @@
+import argparse
+import fnmatch
+import json
+import os
+
+from datasets import Dataset
+
+from src.envs import CODE_PROBLEMS_REPO
+from src.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+def get_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("--input_dir", type=str, help="Dir with .json files", required=True)
+    parser.add_argument("--dataset_name", type=str, default=f"{CODE_PROBLEMS_REPO}")
+    parser.add_argument("--split", type=str, choices=["hard", "warmup"], default="hard")
+    return parser.parse_args()
+
+
+def main(args: argparse.Namespace) -> None:
+    logger.info("Reading problem files from %s", args.input_dir)
+    input_files = fnmatch.filter(os.listdir(args.input_dir), "*.json")
+    if len(input_files) == 0:
+        raise ValueError(f"No .json files in input dir {args.input_dir}")
+    logger.info("Found %d code problems in %s", len(input_files), args.input_dir)
+
+    def ds_generator():
+        for fname in sorted(input_files):
+            formula_name = os.path.splitext(fname)[0]
+            cp_path = os.path.join(args.input_dir, fname)
+            with open(cp_path, "r", encoding="utf-8") as f:
+                code_problem = json.load(f)
+            logger.info("Read code problem for formula %s from %s", formula_name, cp_path)
+            yield dict(id=code_problem["id"], code_problem=code_problem)
+
+    ds = Dataset.from_generator(ds_generator)
+    logger.info("Created dataset")
+
+    ds.push_to_hub(args.dataset_name, split=args.split, private=True)
+    logger.info("Saved dataset to repo %s", args.dataset_name)
+
+
+if __name__ == "__main__":
+    main(get_args())
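
The new upload script reads one JSON code problem per file, builds a Dataset from a generator, and pushes it to the hub under the chosen split (hard or warmup). A quick sanity check after running it is to load both splits back; this sketch assumes the same CODE_PROBLEMS_REPO and TOKEN values exposed by src.envs:

from datasets import load_dataset

from src.envs import CODE_PROBLEMS_REPO, TOKEN

# Load each split back and report its size and columns (assumes both splits
# have already been uploaded; drop "warmup" if only "hard" exists yet).
for split in ("hard", "warmup"):
    ds = load_dataset(CODE_PROBLEMS_REPO, split=split, token=TOKEN)
    print(f"{split}: {len(ds)} code problems, columns={ds.column_names}")
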
src/datamodel/data.py CHANGED
@@ -3,26 +3,40 @@ import time
 
 from datasets import load_dataset
 
-from src.envs import TOKEN
+from src.envs import TOKEN, CODE_PROBLEMS_REPO, RESULTS_REPO, SUBMISSIONS_REPO
 from src.logger import get_logger
 
 logger = get_logger(__name__)
 
+
 class F1Data:
-    def __init__(self, cp_ds_name: str, sub_ds_name: str, res_ds_name: str):
+    def __init__(self, cp_ds_name: str, sub_ds_name: str, res_ds_name: str, split: str = "hard"):
         self.cp_dataset_name = cp_ds_name
         self.submissions_dataset_name = sub_ds_name
         self.results_dataset_name = res_ds_name
-        self.initialize()
-
-    @functools.cached_property
-    def code_problem_formulas(self) -> set[str]:
-        return set(self.code_problems.keys())
+        self.split = split
+        self.code_problems = None
+        self._initialize()
 
-    def initialize(self):
+    def _initialize(self):
         logger.info("Initialize F1Data TOMEN='%s'", TOKEN)
         start_time = time.monotonic()
-        cp_ds = load_dataset(self.cp_dataset_name, split="hard", token=TOKEN)
-        logger.info("Loaded code-problems dataset from %s in %f sec", self.cp_dataset_name, time.monotonic() - start_time)
-        self.code_problems: dict[str, str] = {r["formula_name"]: r["code_problem"]["problem_description"] for r in cp_ds}
+        cp_ds = load_dataset(self.cp_dataset_name, split=self.split, token=TOKEN)
+        logger.info(
+            "Loaded code-problems dataset from %s in %f sec",
+            self.cp_dataset_name,
+            time.monotonic() - start_time,
+        )
+        self.code_problems: dict[str, str] = {r["id"]: r["code_problem"]["problem_description"] for r in cp_ds}
         logger.info("Code problems info: %s", self.code_problems)
+
+    @functools.cached_property
+    def code_problem_formulas(self) -> set[str]:
+        return set(self.code_problems.keys())
+
+
+if __name__ == "__main__":
+    split = "hard"
+    f1_data = F1Data(cp_ds_name=CODE_PROBLEMS_REPO, sub_ds_name=SUBMISSIONS_REPO, res_ds_name=RESULTS_REPO, split=split)
+
+    print(f"Found {len(f1_data.code_problem_formulas)} code problems in {split} split of {f1_data.cp_dataset_name}")
src/submission/submit.py CHANGED
@@ -33,7 +33,7 @@ def validate_submission(lbdb: F1Data, pd_ds: pd.DataFrame) -> str | None:
    if submitted_formulas != lbdb.code_problem_formulas:
        missing = lbdb.code_problem_formulas - submitted_formulas
        unknown = submitted_formulas - lbdb.code_problem_formulas
-       return f"Mismatched formula names: missing {len(missing)} unknown {len(unknown)}"
+       return f"Mismatched formula names: {len(missing)} missing, {len(unknown)} unknown"
    if len(pd_ds) > len(lbdb.code_problem_formulas):
        return "Duplicate formula solutions exist in uploaded file"
    return None
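
The reworded error message reports the two set differences computed just above it. A toy illustration of the counts it produces (the formula ids here are made up):

# Expected ids from the code-problems split vs. ids found in an upload.
code_problem_formulas = {"f1", "f2", "f3"}
submitted_formulas = {"f1", "f4"}

missing = code_problem_formulas - submitted_formulas   # {"f2", "f3"}
unknown = submitted_formulas - code_problem_formulas   # {"f4"}
print(f"Mismatched formula names: {len(missing)} missing, {len(unknown)} unknown")
# -> Mismatched formula names: 2 missing, 1 unknown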