xiaohua828 committed
Commit 362cdb6 · 1 Parent(s): 10f6c31
Upload tokenizer.py
tokenizer.py: +73 -0
tokenizer.py
ADDED
@@ -0,0 +1,73 @@
import os
from typing import Optional, Union

from transformers import PreTrainedTokenizer


class CustomTokenizer(PreTrainedTokenizer):
    def __init__(self, vocab_file, **kwargs):
        super().__init__(**kwargs)
        self.vocab = self._load_vocab(vocab_file)
        self.ids_to_tokens = {v: k for k, v in self.vocab.items()}

        self.bos_token, self.eos_token, self.pad_token, self.unk_token, self.mask_token = (
            "[CLS]", "[SEP]", "[PAD]", "[UNK]", "[MASK]",
        )

    def _load_vocab(self, vocab_file):
        # One token per line; a token's id is its line index.
        vocab = {}
        with open(vocab_file, "r", encoding="UTF-8") as f:
            for line in f:
                token = line.strip()
                vocab[token] = len(vocab)
        return vocab

    def tokenize(self, text):
        # Whitespace split; words not in the vocabulary become [UNK].
        tokens = []
        for word in text.split():
            if word in self.vocab:
                tokens.append(word)
            else:
                tokens.append("[UNK]")
        return tokens

    def encode(
        self,
        text,
        text_pair=None,
        add_special_tokens: bool = True,
        padding: Union[bool, str] = False,
        truncation: Union[bool, str] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        return_tensors: Optional[str] = None,
        **kwargs,
    ):
        # Note: unlike the standard encode(), this returns token strings, not ids.
        tokens = []
        for word in text.split():
            if word in self.vocab:
                tokens.append(word)
            else:
                tokens.append("[UNK]")
        return tokens

    def convert_token_to_id(self, token):
        if token in self.vocab:
            return self.vocab[token]
        else:
            return self.vocab["[UNK]"]

    def convert_id_to_token(self, idx):
        if idx in self.ids_to_tokens:
            return self.ids_to_tokens[idx]
        else:
            return "[UNK]"

    def save_vocabulary(self, save_directory, filename_prefix=None):
        # Write the vocabulary back to disk, one token per line in id order.
        if not os.path.isdir(save_directory):
            return
        vocab_file = os.path.join(save_directory, "vocab.txt")
        with open(vocab_file, "w", encoding="utf-8") as f:
            for token, index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                f.write(token + "\n")
        return (vocab_file,)


if __name__ == "__main__":
    # import login
    # tokenizer = CustomTokenizer(vocab_file="./vocab.txt")
    # tokenizer.push_to_hub(login.name_or_path)

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("xiaohua828/MNIST_Demo_1")
    token = tokenizer.encode("我要赚钱")  # "I want to make money"
    print(token)
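For a quick sense of what the class above does without pulling anything from the Hub, here is a minimal, self-contained sketch that mirrors the _load_vocab and tokenize logic on an in-memory vocabulary. The token list and test sentence are illustrative assumptions, not the actual vocabulary of this repository.

# Illustrative sketch only (assumed tokens, not part of the commit above).
# It reproduces the vocab format expected by _load_vocab (one token per line,
# id = line index) and the whitespace tokenization rule used by tokenize().
vocab_lines = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "hello", "world"]
vocab = {token: idx for idx, token in enumerate(vocab_lines)}

def whitespace_tokenize(text, vocab):
    # Split on whitespace and map out-of-vocabulary words to [UNK].
    return [word if word in vocab else "[UNK]" for word in text.split()]

tokens = whitespace_tokenize("hello there world", vocab)
print(tokens)                                           # ['hello', '[UNK]', 'world']
print([vocab.get(t, vocab["[UNK]"]) for t in tokens])   # [5, 1, 6]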