# NOTE(review): removed non-Python residue (web-viewer "File size" header,
# commit hash, and a rendered line-number gutter) that made the file unparseable.
import multiprocessing
import time

from arguments import PretokenizationArguments
from datasets import load_dataset

from transformers import AutoTokenizer, HfArgumentParser


def tokenize(example):
    """Tokenize one dataset example and record its chars-per-token ratio.

    Expects ``example["content"]`` to hold the raw text; uses the
    module-level ``tokenizer`` (loaded below from ``args.tokenizer_dir``).
    Returns a dict with ``input_ids`` and ``ratio_char_token``.
    """
    text = example["content"]
    token_ids = tokenizer(text, truncation=False)["input_ids"]
    return {
        "input_ids": token_ids,
        # Average characters per token — a rough tokenizer-efficiency metric.
        "ratio_char_token": len(text) / len(token_ids),
    }


# Parse CLI flags into the PretokenizationArguments dataclass fields;
# default the worker count to every available CPU core when unset.
parser = HfArgumentParser(PretokenizationArguments)
args = parser.parse_args()
if args.num_workers is None:
    args.num_workers = multiprocessing.cpu_count()
# Tokenizer consumed by tokenize() above — presumably a local dir or a
# hub repo id; AutoTokenizer accepts either. TODO(review): confirm which.
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)

# Load the raw dataset (train split only) and report wall-clock time.
t_start = time.time()
ds = load_dataset(args.dataset_name, split="train")
print(f"Dataset loaded in {time.time()-t_start:.2f}s")

# Tokenize in parallel across num_workers processes; drop the raw text and
# all source metadata columns so only the tokenized fields remain.
t_start = time.time()
ds = ds.map(
    tokenize,
    num_proc=args.num_workers,
    remove_columns=[
        "repo_name",
        "path",
        "copies",
        "size",
        "content",
        "license",
        "hash",
        "line_mean",
        "line_max",
        "alpha_frac",
        "autogenerated",
    ],
)
print(f"Dataset tokenized in {time.time()-t_start:.2f}s")

# Upload the pretokenized dataset to the Hugging Face Hub (network I/O;
# requires prior authentication, e.g. `huggingface-cli login`).
t_start = time.time()
ds.push_to_hub(args.tokenized_data_repo)
print(f"Data pushed to the hub in {time.time()-t_start:.2f}s")