xiaohua828 committed
Commit 362cdb6 · 1 Parent(s): 10f6c31

Upload tokenizer.py

Files changed (1): tokenizer.py (+73, -0)
tokenizer.py ADDED
import os
from typing import Optional, Union

from transformers import PreTrainedTokenizer


class CustomTokenizer(PreTrainedTokenizer):
    """A minimal whitespace tokenizer backed by a plain-text vocab file."""

    def __init__(self, vocab_file, **kwargs):
        # The vocabulary must exist before super().__init__() runs, because
        # the base class may query it while registering special tokens.
        self.vocab = self._load_vocab(vocab_file)
        self.ids_to_tokens = {v: k for k, v in self.vocab.items()}
        # Register the special tokens with the base class instead of
        # assigning them as attributes after the fact.
        kwargs.setdefault("bos_token", "[CLS]")
        kwargs.setdefault("eos_token", "[SEP]")
        kwargs.setdefault("pad_token", "[PAD]")
        kwargs.setdefault("unk_token", "[UNK]")
        kwargs.setdefault("mask_token", "[MASK]")
        super().__init__(**kwargs)

    def _load_vocab(self, vocab_file):
        # One token per line; the line number becomes the token id.
        vocab = {}
        with open(vocab_file, "r", encoding="utf-8") as f:
            for line in f:
                vocab[line.strip()] = len(vocab)
        return vocab

    @property
    def vocab_size(self):
        # Required by the PreTrainedTokenizer base class.
        return len(self.vocab)

    def get_vocab(self):
        # Required by the PreTrainedTokenizer base class.
        return dict(self.vocab)

    def tokenize(self, text, **kwargs):
        # Whitespace tokenization; out-of-vocabulary words map to [UNK].
        return [word if word in self.vocab else self.unk_token for word in text.split()]

    def encode(
        self,
        text,
        text_pair=None,
        add_special_tokens: bool = True,
        padding: Union[bool, str] = False,
        truncation: Optional[Union[bool, str]] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        return_tensors: Optional[str] = None,
        **kwargs,
    ):
        # encode() must return ids; the original returned the token strings.
        ids = [self._convert_token_to_id(t) for t in self.tokenize(text)]
        if add_special_tokens:
            ids = (
                [self._convert_token_to_id(self.bos_token)]
                + ids
                + [self._convert_token_to_id(self.eos_token)]
            )
        return ids

    def _convert_token_to_id(self, token):
        # Leading underscore: this is the hook name the base class dispatches to.
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, idx):
        return self.ids_to_tokens.get(idx, self.unk_token)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        if not os.path.isdir(save_directory):
            return
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.txt",
        )
        with open(vocab_file, "w", encoding="utf-8") as f:
            # Write tokens in id order so that line number == token id on reload.
            for token, _ in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                f.write(token + "\n")
        return (vocab_file,)


if __name__ == "__main__":
    # import login
    # tokenizer = CustomTokenizer(vocab_file="./vocab.txt")
    # tokenizer.push_to_hub(login.name_or_path)

    from transformers import AutoTokenizer

    # Loading custom tokenizer code from the Hub requires trust_remote_code=True.
    tokenizer = AutoTokenizer.from_pretrained(
        "xiaohua828/MNIST_Demo_1", trust_remote_code=True
    )
    token = tokenizer.encode("我要赚钱")  # "I want to make money"
    print(token)
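
For reference, a minimal local usage sketch, assuming the CustomTokenizer class above is in scope; the vocab.txt contents and the printed ids are hypothetical and illustrative, not from the commit:

# Write a tiny, hypothetical vocab.txt: one token per line, line number == id.
with open("vocab.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "hello", "world"]) + "\n")

tok = CustomTokenizer(vocab_file="vocab.txt")
ids = tok.encode("hello there world")  # "there" is out of vocabulary -> [UNK]
print(ids)                             # [2, 5, 1, 6, 3] = [CLS] hello [UNK] world [SEP]
print([tok._convert_id_to_token(i) for i in ids])
tok.save_vocabulary(".")               # rewrites vocab.txt in id order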