# Source: transformers/examples/research_projects/codeparrot/scripts/pretokenizing.py
# (mirrored from the Hugging Face Space "compositional_test", which reported a runtime error)
import multiprocessing
import time

from datasets import load_dataset
from transformers import AutoTokenizer, HfArgumentParser

from arguments import PretokenizationArguments
def tokenize(example, tokenizer_override=None):
    """Tokenize one dataset row and record its compression ratio.

    Args:
        example: A dataset row containing a ``"content"`` string field.
        tokenizer_override: Optional tokenizer to use instead of the
            module-level ``tokenizer``; defaults to the module global so the
            function keeps working unchanged as a ``datasets.map`` callback
            (workers inherit the global via fork/pickle).

    Returns:
        dict with:
            - ``"input_ids"``: token ids of ``example["content"]`` (no truncation)
            - ``"ratio_char_token"``: characters per token, a rough measure of
              how well the tokenizer compresses this sample

    Raises:
        ZeroDivisionError: if the tokenizer returns zero tokens for the content.
    """
    tok = tokenizer_override if tokenizer_override is not None else tokenizer
    input_ids = tok(example["content"], truncation=False)["input_ids"]
    return {
        "input_ids": input_ids,
        "ratio_char_token": len(example["content"]) / len(input_ids),
    }
# ---------------------------------------------------------------------------
# Script body: load a raw code dataset, pre-tokenize it in parallel, and push
# the tokenized version to the Hugging Face Hub.
# ---------------------------------------------------------------------------
parser = HfArgumentParser(PretokenizationArguments)
args = parser.parse_args()
if args.num_workers is None:
    # Default to one map worker per available CPU core.
    args.num_workers = multiprocessing.cpu_count()

# Loaded at module level so `tokenize` can reach it from map worker processes.
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)

t_start = time.time()
ds = load_dataset(args.dataset_name, split="train")
print(f"Dataset loaded in {time.time()-t_start:.2f}s")

t_start = time.time()
# Drop the raw-text/metadata columns; only the token ids and the
# char/token ratio produced by `tokenize` are kept in the pushed dataset.
ds = ds.map(
    tokenize,
    num_proc=args.num_workers,
    remove_columns=[
        "repo_name",
        "path",
        "copies",
        "size",
        "content",
        "license",
        "hash",
        "line_mean",
        "line_max",
        "alpha_frac",
        "autogenerated",
    ],
)
print(f"Dataset tokenized in {time.time()-t_start:.2f}s")

t_start = time.time()
ds.push_to_hub(args.tokenized_data_repo)
print(f"Data pushed to the hub in {time.time()-t_start:.2f}s")