mtasic85 committed on
Commit
804c80b
1 Parent(s): 419b474
Files changed (1) hide show
  1. scripts/train_tokenizer.py +2 -2
scripts/train_tokenizer.py CHANGED
@@ -255,7 +255,7 @@ bpe = BPE(unk_token='<unk>', fuse_unk=True, byte_fallback=True)
255
  tokenizer = Tokenizer(bpe)
256
 
257
  tokenizer.normalizer = normalizers.Sequence([
258
- # normalizers.Prepend('▁'),
259
  normalizers.Replace(' ', '▁'),
260
  ])
261
 
@@ -271,7 +271,7 @@ tokenizer.decoder = decoders.Sequence([
271
  decoders.Replace('▁', ' '),
272
  decoders.ByteFallback(),
273
  decoders.Fuse(),
274
- # decoders.Strip(' ', 1, 0),
275
  ])
276
 
277
  trainer = BpeTrainer(
 
255
  tokenizer = Tokenizer(bpe)
256
 
257
  tokenizer.normalizer = normalizers.Sequence([
258
+ normalizers.Prepend('▁'),
259
  normalizers.Replace(' ', '▁'),
260
  ])
261
 
 
271
  decoders.Replace('▁', ' '),
272
  decoders.ByteFallback(),
273
  decoders.Fuse(),
274
+ decoders.Strip(' ', 1, 0),
275
  ])
276
 
277
  trainer = BpeTrainer(