KoichiYasuoka committed on
Commit
089d741
·
verified ·
1 Parent(s): c17eed9

Update hf_rwkv_tokenizer.py

Browse files

Fix `convert_tokens_to_ids` and `decode` for byte-fallback characters.

Files changed (1) hide show
  1. hf_rwkv_tokenizer.py +8 -1
hf_rwkv_tokenizer.py CHANGED
@@ -182,8 +182,15 @@ class Rwkv6Tokenizer(PreTrainedTokenizer):
182
  # return self.wordpiece_tokenizer.tokenize(text.encode("utf-8"))
183
  return self.trie_tokenizer.encode(text)[0]
184
 
 
 
 
185
  def _convert_token_to_id(self, token):
186
- return token
 
 
 
 
187
 
188
  def _convert_id_to_token(self, index):
189
  """Converts an index (integer) in a token (byte) using the vocab."""
 
182
  # return self.wordpiece_tokenizer.tokenize(text.encode("utf-8"))
183
  return self.trie_tokenizer.encode(text)[0]
184
 
185
+ def _decode(self, token_ids, **kwargs):
186
+ return self.trie_tokenizer.decodeBytes(token_ids).decode("utf-8")
187
+
188
  def _convert_token_to_id(self, token):
189
+ if isinstance(token, (int)):
190
+ return token
191
+ elif isinstance(token, (bytes)):
192
+ return self.encoder.get(token)
193
+ return self.encoder.get(token.encode("utf-8"))
194
 
195
  def _convert_id_to_token(self, index):
196
  """Converts an index (integer) in a token (byte) using the vocab."""