import os
import shutil
from itertools import chain

from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer, pre_tokenizers, processors, decoders
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

from utils import batch_dataset_iterator
from base_datasets import base_datasets
from base_instruct_datasets import base_instruct_datasets


tokenizer_path = '../tokenizer'

if os.path.exists(tokenizer_path):
    shutil.rmtree(tokenizer_path)

os.makedirs(tokenizer_path, exist_ok=True)

#
# special_tokens
#
bos_token = '<|endoftext|>'
eos_token = '<|im_end|>'
pad_token = '<|pad|>'
unk_token = '<|unk|>'

special_tokens = [
    bos_token,
    eos_token,
    pad_token,
    unk_token,
    '<|im_start|>',
    '<|im_sep|>',
    'system',
    'user',
    'assistant',
    '<tools>',
    '</tools>',
    '<tool>',
    '</tool>',
    '<tool_call>',
    '</tool_call>',
    '<tool_response>',
    '</tool_response>',
    '<question>',
    '</question>',
    '<think>',
    '</think>',
    '<answer>',
    '</answer>',
]

# pad the special-token list with reserved placeholders up to 64 entries
for i in range(64 - len(special_tokens)):
    special_tokens.append(f'<|reserved_{i}|>')
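
# sanity check: the padding loop above brings the total to exactly 64
assert len(special_tokens) == 64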

#
# BPE Tokenizer
#
bpe = BPE(unk_token=None, byte_fallback=True) # no unk token: byte-level handling keeps every input encodable
tokenizer = Tokenizer(bpe)

# normalizer (none: raw text flows straight to the byte-level pre-tokenizer)
tokenizer.normalizer = None

# pre-tokenizer
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)

# post-processor
tokenizer.post_processor = processors.ByteLevel(add_prefix_space=True, trim_offsets=False, use_regex=True)

# decoder
tokenizer.decoder = decoders.ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True)

#
# BPE Trainer
#
trainer = BpeTrainer(
    vocab_size=65536, # 64 * 1024
    min_frequency=3,
    special_tokens=special_tokens,
    max_token_length=16,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(), # seed all 256 byte symbols so rare bytes survive min_frequency
)

tokenizer_datasets = base_datasets + base_instruct_datasets

tokenizer.train_from_iterator(
    # flatten the per-dataset iterators into one stream of training samples
    chain.from_iterable(batch_dataset_iterator(n) for n in tokenizer_datasets),
    trainer,
)

tokenizer.save(os.path.join(tokenizer_path, 'tokenizer.json'))
tokenizer.model.save(tokenizer_path) # also writes vocab.json and merges.txt

#
# PreTrainedTokenizerFast
#
# ChatML-style template: <|im_sep|> separates the role header from the content
CHAT_TEMPLATE = (
    "{% for message in messages %}"
        "{{'<|im_start|>' + message['role'] + '<|im_sep|>' + message['content'] + '<|im_end|>'}}"
    "{% endfor %}"

    "{% if add_generation_prompt %}"
        "{{ '<|im_start|>assistant<|im_sep|>' }}"
    "{% endif %}"
)
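
# Rendered example for messages=[{'role': 'user', 'content': 'Hi'}] with
# add_generation_prompt=True:
#   <|im_start|>user<|im_sep|>Hi<|im_end|><|im_start|>assistant<|im_sep|>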

fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    chat_template=CHAT_TEMPLATE,
    bos_token=bos_token,
    eos_token=eos_token,
    pad_token=pad_token,
    unk_token=unk_token,
    clean_up_tokenization_spaces=False,
)

fast_tokenizer.save_pretrained(tokenizer_path)
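
#
# Smoke test (illustrative): reload the saved tokenizer and verify behaviour.
# The sample message and text below are arbitrary.
#
from transformers import AutoTokenizer

reloaded = AutoTokenizer.from_pretrained(tokenizer_path)

# the chat template renders to the ChatML-style string shown above
prompt = reloaded.apply_chat_template(
    [{'role': 'user', 'content': 'Hi'}],
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)

# byte-level BPE should round-trip arbitrary text exactly
text = 'naïve café ☕'
assert reloaded.decode(reloaded(text)['input_ids']) == text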