samleeasus committed on
Commit
52abbf9
·
1 Parent(s): 18c3410

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +14 -12
README.md CHANGED
@@ -1,20 +1,22 @@
 
1
  from transformers import LlamaTokenizer
2
 
3
- tokenizer = LlamaTokenizer.from_pretrained(
4
- 'ocisd4/llama_tokenizer_ext_zhtw',
5
- pad_token='<unk>',
6
- add_bos_token=True,
7
- add_eos_token=False,
8
- use_auth_token=True,
9
- )
10
 
11
- #vocab size: 36128
12
 
13
- print(tokenizer.tokenize('今天天氣真好!'))
14
  #['▁', '今', '天', '天', '氣', '真', '好', '!']
15
 
16
- print(tokenizer.encode('今天天氣真好!'))
17
  #[1, 29871, 31482, 30408, 30408, 32045, 30848, 31076, 30584]
18
 
19
- print(tokenizer.decode(tokenizer.encode('今天天氣真好!')))
20
- # <s>今天天氣真好!
 
 
1
+ ```python
2
  from transformers import LlamaTokenizer
3
 
4
+ tokenizer = LlamaTokenizer.from_pretrained(
5
+ 'ocisd4/llama_tokenizer_ext_zhtw',
6
+ pad_token='<unk>',
7
+ add_bos_token=True,
8
+ add_eos_token=False,
9
+ use_auth_token=True,
10
+ )
11
 
12
+ #vocab size: 36128
13
 
14
+ print(tokenizer.tokenize('今天天氣真好!'))
15
  #['▁', '今', '天', '天', '氣', '真', '好', '!']
16
 
17
+ print(tokenizer.encode('今天天氣真好!'))
18
  #[1, 29871, 31482, 30408, 30408, 32045, 30848, 31076, 30584]
19
 
20
+ print(tokenizer.decode(tokenizer.encode('今天天氣真好!')))
21
+ # <s>今天天氣真好!
22
+ ```