Qwen
/

yangapku committed on
Commit
9882935
·
1 Parent(s): 8504b36

update tokenization_qwen.py

Browse files
Files changed (2) hide show
  1. assets/logo.jpg +0 -0
  2. tokenization_qwen.py +4 -10
assets/logo.jpg ADDED
tokenization_qwen.py CHANGED
@@ -20,7 +20,7 @@ from transformers import PreTrainedTokenizer, AddedToken
20
 
21
  logger = logging.getLogger(__name__)
22
 
23
- TIKTOKEN_NAME = "qwen.tiktoken"
24
 
25
 
26
  class QWenTokenizer(PreTrainedTokenizer):
@@ -28,17 +28,11 @@ class QWenTokenizer(PreTrainedTokenizer):
28
 
29
  """NOTE: This tokenizer will not handle special tokens to avoid injection attacks"""
30
 
31
- @classmethod
32
- def from_pretrained(
33
- cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs
34
- ):
35
- merges_file = os.path.join(pretrained_model_name_or_path, TIKTOKEN_NAME)
36
- tokenizer = cls(merges_file, *inputs, **kwargs)
37
- return tokenizer
38
 
39
  def __init__(
40
  self,
41
- merges_file,
42
  errors="replace",
43
  max_len=None,
44
  unk_token="<|endoftext|>",
@@ -113,7 +107,7 @@ class QWenTokenizer(PreTrainedTokenizer):
113
  )
114
  }
115
 
116
- mergeable_ranks = load_tiktoken_bpe(merges_file)
117
  special_tokens = {
118
  token: index
119
  for index, token in enumerate(special_tokens, start=len(mergeable_ranks))
 
20
 
21
  logger = logging.getLogger(__name__)
22
 
23
+ VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
24
 
25
 
26
  class QWenTokenizer(PreTrainedTokenizer):
 
28
 
29
  """NOTE: This tokenizer will not handle special tokens to avoid injection attacks"""
30
 
31
+ vocab_files_names = VOCAB_FILES_NAMES
 
 
 
 
 
 
32
 
33
  def __init__(
34
  self,
35
+ vocab_file,
36
  errors="replace",
37
  max_len=None,
38
  unk_token="<|endoftext|>",
 
107
  )
108
  }
109
 
110
+ mergeable_ranks = load_tiktoken_bpe(vocab_file)
111
  special_tokens = {
112
  token: index
113
  for index, token in enumerate(special_tokens, start=len(mergeable_ranks))