Qwen
/

JustinLin610 committed on
Commit
f6498e5
·
1 Parent(s): 62bf1c6

fix decoder, and provide an option to remove attack rejection (#8)

Browse files

- fix decoder, and provide an option to remove attack rejection (d00ebe5828fcd095516d8eb8d314c9864d9d3863)

Files changed (1) hide show
  1. tokenization_qwen.py +13 -5
tokenization_qwen.py CHANGED
@@ -126,6 +126,7 @@ class QWenTokenizer(PreTrainedTokenizer):
126
  self.mergeable_ranks = mergeable_ranks
127
  self.encoder = self.mergeable_ranks
128
  self.decoder = {v: k for k, v in self.encoder.items()}
 
129
  self.tokenizer = enc # type: tiktoken.Encoding
130
  self.eod_id = self.tokenizer.eot_token
131
  self.im_start_id = special_tokens[IMSTART]
@@ -182,16 +183,20 @@ class QWenTokenizer(PreTrainedTokenizer):
182
  text (`str`):
183
  The sequence to be encoded.
184
  kwargs (additional keyword arguments, *optional*):
185
- Will be passed to the underlying model specific encode method. See details in
186
- [`~PreTrainedTokenizerBase.__call__`]
 
 
187
 
188
  Returns:
189
  `List[str]`: The list of tokens.
190
  """
191
  tokens = []
192
  text = unicodedata.normalize("NFC", text)
193
- for t in self.tokenizer.encode_ordinary(text):
 
194
  tokens.append(self.decoder[t])
 
195
  return tokens
196
 
197
  def convert_tokens_to_string(self, tokens: List[str]) -> str:
@@ -216,7 +221,10 @@ class QWenTokenizer(PreTrainedTokenizer):
216
 
217
  def _convert_token_to_id(self, token: str) -> int:
218
  """Converts a token to an id using the vocab."""
219
- return self.encoder.get(token.encode('UTF-8'), self.tokenizer.encode(self.unk_token, allowed_special='all')[0])
 
 
 
220
 
221
  @property
222
  def all_special_tokens(self) -> List[str]:
@@ -255,4 +263,4 @@ class QWenTokenizer(PreTrainedTokenizer):
255
  token_ids = [token_ids]
256
  if skip_special_tokens:
257
  token_ids = [i for i in token_ids if i not in self.all_special_ids]
258
- return self.tokenizer.decode(token_ids)
 
126
  self.mergeable_ranks = mergeable_ranks
127
  self.encoder = self.mergeable_ranks
128
  self.decoder = {v: k for k, v in self.encoder.items()}
129
+ self.decoder.update({v: k for k, v in self.special_tokens.items()})
130
  self.tokenizer = enc # type: tiktoken.Encoding
131
  self.eod_id = self.tokenizer.eot_token
132
  self.im_start_id = special_tokens[IMSTART]
 
183
  text (`str`):
184
  The sequence to be encoded.
185
  kwargs (additional keyword arguments, *optional*):
186
+ Will be passed to the underlying model specific encode method.
187
+ Tiktoken lets users enable the tokenization of special tokens with the following args:
188
+ `allowed_special`: set to 'all' or a `set` of special tokens.
189
+ `disallowed_special`: set to 'all' or a `Collection` of special tokens. NOT RECOMMENDED, AS IT MAY CONFLICT WITH `allowed_special`.
190
 
191
  Returns:
192
  `List[str]`: The list of tokens.
193
  """
194
  tokens = []
195
  text = unicodedata.normalize("NFC", text)
196
+
197
+ for t in self.tokenizer.encode(text, **kwargs):
198
  tokens.append(self.decoder[t])
199
+
200
  return tokens
201
 
202
  def convert_tokens_to_string(self, tokens: List[str]) -> str:
 
221
 
222
  def _convert_token_to_id(self, token: str) -> int:
223
  """Converts a token to an id using the vocab."""
224
+ return self.encoder.get(
225
+ token.encode("UTF-8"),
226
+ self.tokenizer.encode(self.unk_token, allowed_special="all")[0],
227
+ )
228
 
229
  @property
230
  def all_special_tokens(self) -> List[str]:
 
263
  token_ids = [token_ids]
264
  if skip_special_tokens:
265
  token_ids = [i for i in token_ids if i not in self.all_special_ids]
266
+ return self.tokenizer.decode(token_ids)