File size: 586 Bytes
52abbf9 18c3410 52abbf9 a017e9a 52abbf9 18c3410 52abbf9 18c3410 52abbf9 18c3410 52abbf9 18c3410 52abbf9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 |
```python
# Usage example: load the extended zh-TW LLaMA tokenizer and show how a
# short Traditional Chinese sentence is tokenized, encoded and decoded.
from transformers import LlamaTokenizer

# Loading options, kept together so they are easy to scan and tweak.
load_options = {
    'pad_token': '<unk>',      # the '<unk>' token doubles as the padding token
    'add_bos_token': True,     # encoded ids below start with 1 (shown as <s> on decode)
    'add_eos_token': False,    # no end-of-sequence id appears in the encoded ids below
}
tokenizer = LlamaTokenizer.from_pretrained(
    'ocisd4/llama_tokenizer_ext_zhtw',
    **load_options,
)
# vocab size: 36128

sample_text = '今天天氣真好!'

# Sub-word pieces produced for the sample sentence.
pieces = tokenizer.tokenize(sample_text)
print(pieces)
# ['▁', '今', '天', '天', '氣', '真', '好', '!']

# Integer ids for the same sentence.
token_ids = tokenizer.encode(sample_text)
print(token_ids)
# [1, 29871, 31482, 30408, 30408, 32045, 30848, 31076, 30584]

# Round trip: decoding the ids gives back the text, prefixed by the BOS marker.
round_trip = tokenizer.decode(token_ids)
print(round_trip)
# <s>今天天氣真好!
``` |