samleeasus committed on
Commit
87c0ea9
·
verified ·
1 Parent(s): c656aa4

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +24 -0
README.md ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ```python
2
+ from transformers import LlamaTokenizer
3
+
4
+ tokenizer = LlamaTokenizer.from_pretrained(
5
+ 'ocisd4/llama_tokenizer_ext_zhtw',
6
+ pad_token='<unk>',
7
+ add_bos_token=True,
8
+ add_eos_token=False
9
+ )
10
+
11
+ #vocab size: 36128
12
+
13
+ print(tokenizer.tokenize('今天天氣真好!'))
14
+ #['▁', '今', '天', '天', '氣', '真', '好', '!']
15
+
16
+ print(tokenizer.encode('今天天氣真好!'))
17
+ #[1, 29871, 31482, 30408, 30408, 32045, 30848, 31076, 30584]
18
+
19
+ print(tokenizer.decode(tokenizer.encode('今天天氣真好!')))
20
+ # <s>今天天氣真好!
21
+ ```
22
+
23
+
24
+