import os
import json
import gzip
from typing import Dict, Iterable

# Keys every sample must carry, in the order they are validated.
_REQUIRED_SAMPLE_KEYS = ("task_id", "res_id", "test", "solution", "entry_point")


def stream_jsonl(filename: str) -> Iterable[Dict]:
    """
    Parse a JSON Lines file lazily, yielding one decoded object per line.

    Files whose name ends with ``.gz`` are transparently decompressed.
    Lines that are empty or contain only whitespace are skipped.

    Args:
        filename: Path to a ``.jsonl`` or ``.jsonl.gz`` file.

    Yields:
        The result of ``json.loads`` for each non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    if filename.endswith(".gz"):
        fp = gzip.open(filename, "rt", encoding="utf-8")
    else:
        fp = open(filename, "r", encoding="utf-8")
    with fp:
        for line in fp:
            # strip() leaves an empty (falsy) string for blank lines.
            if line.strip():
                yield json.loads(line)


def load_solutions(samples: Iterable[Dict]) -> Iterable[Dict]:
    """
    Validate solution samples and tag each with a readable identifier.

    Every sample must contain the keys ``task_id``, ``res_id``, ``test``,
    ``solution`` and ``entry_point``, and ``solution`` must be a string.
    Each sample is modified in place: an ``_identifier`` entry is added,
    built from its ``task_id`` and its 1-based position in ``samples``.

    Args:
        samples: Iterable of sample dictionaries.

    Yields:
        The same sample objects, each with ``_identifier`` set.

    Raises:
        AssertionError: If a required key is missing or ``solution`` is
            not a string.
    """
    for i, sample in enumerate(samples):
        # Raise explicitly instead of using ``assert`` statements so the
        # validation still runs under ``python -O``; the exception type and
        # messages are the same ones an ``assert`` would produce.
        for key in _REQUIRED_SAMPLE_KEYS:
            if key not in sample:
                raise AssertionError(f"No {key} found in sample!")
        if not isinstance(sample["solution"], str):
            raise AssertionError(
                "Solution must be a string! If you have multiple solutions, "
                "please repeat the task_id."
            )
        sample["_identifier"] = sample["task_id"] + f" (line {i+1} )"
        yield sample