tokenizer
scripts/train_tokenizer.py CHANGED
@@ -255,7 +255,7 @@ bpe = BPE(unk_token='<unk>', fuse_unk=True, byte_fallback=True)
 tokenizer = Tokenizer(bpe)
 
 tokenizer.normalizer = normalizers.Sequence([
-
+    normalizers.Prepend('▁'),
     normalizers.Replace(' ', '▁'),
 ])
 
@@ -271,7 +271,7 @@ tokenizer.decoder = decoders.Sequence([
     decoders.Replace('▁', ' '),
     decoders.ByteFallback(),
     decoders.Fuse(),
-
+    decoders.Strip(' ', 1, 0),
 ])
 
 trainer = BpeTrainer(
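The two added lines complete the SentencePiece-style metaspace round trip: normalizers.Prepend('▁') marks the start of the text so the first word is pretreated the same way Replace(' ', '▁') treats every later word, and decoders.Strip(' ', 1, 0) removes the single leading space that decoding the prepended marker back into ' ' would otherwise leave behind. A minimal sketch of the effect, assuming a tokenizers version recent enough to ship normalizers.Prepend and decoders.Strip (the example strings are illustrative, not from the training data):

from tokenizers import normalizers, decoders

# Normalizer pipeline from the diff: mark the start of the text, then
# turn every interior space into the same metaspace marker.
norm = normalizers.Sequence([
    normalizers.Prepend('▁'),
    normalizers.Replace(' ', '▁'),
])
print(norm.normalize_str('Hello world'))  # '▁Hello▁world'

# Decoder pipeline from the diff: map the marker back to a space, fuse
# byte-fallback pieces into one string, then drop the one leading space
# that the prepended marker reintroduces.
dec = decoders.Sequence([
    decoders.Replace('▁', ' '),
    decoders.ByteFallback(),
    decoders.Fuse(),
    decoders.Strip(' ', 1, 0),
])
print(dec.decode(['▁Hello', '▁world']))  # 'Hello world'

Without the Prepend step, the first word of a text would encode differently from the same word after a space; without the Strip step, every decoded string would gain a spurious leading space.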