samleeasus committed on
Commit
18c3410
·
1 Parent(s): f5546fa

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +20 -0
README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import LlamaTokenizer
2
+
3
+ tokenizer = LlamaTokenizer.from_pretrained(
4
+ 'ocisd4/llama_tokenizer_ext_zhtw',
5
+ pad_token='<unk>',
6
+ add_bos_token=True,
7
+ add_eos_token=False,
8
+ use_auth_token=True,
9
+ )
10
+
11
+ # vocab size: 36128
12
+
13
+ print(tokenizer.tokenize('今天天氣真好!'))
14
+ #['▁', '今', '天', '天', '氣', '真', '好', '!']
15
+
16
+ print(tokenizer.encode('今天天氣真好!'))
17
+ #[1, 29871, 31482, 30408, 30408, 32045, 30848, 31076, 30584]
18
+
19
+ print(tokenizer.decode(tokenizer.encode('今天天氣真好!')))
20
+ # <s>今天天氣真好!