KoichiYasuoka
committed on
Update hf_rwkv_tokenizer.py
Browse files
Bug fix of `convert_tokens_to_ids` and `decode` for byte-fallback characters.
- hf_rwkv_tokenizer.py +8 -1
hf_rwkv_tokenizer.py
CHANGED
@@ -182,8 +182,15 @@ class Rwkv6Tokenizer(PreTrainedTokenizer):
|
|
182 |
# return self.wordpiece_tokenizer.tokenize(text.encode("utf-8"))
|
183 |
return self.trie_tokenizer.encode(text)[0]
|
184 |
|
|
|
|
|
|
|
185 |
def _convert_token_to_id(self, token):
|
186 |
-
|
|
|
|
|
|
|
|
|
187 |
|
188 |
def _convert_id_to_token(self, index):
|
189 |
"""Converts an index (integer) in a token (byte) using the vocab."""
|
|
|
182 |
# return self.wordpiece_tokenizer.tokenize(text.encode("utf-8"))
|
183 |
return self.trie_tokenizer.encode(text)[0]
|
184 |
|
185 |
+
def _decode(self, token_ids, **kwargs):
|
186 |
+
return self.trie_tokenizer.decodeBytes(token_ids).decode("utf-8")
|
187 |
+
|
188 |
def _convert_token_to_id(self, token):
|
189 |
+
if isinstance(token, (int)):
|
190 |
+
return token
|
191 |
+
elif isinstance(token, (bytes)):
|
192 |
+
return self.encoder.get(token)
|
193 |
+
return self.encoder.get(token.encode("utf-8"))
|
194 |
|
195 |
def _convert_id_to_token(self, index):
|
196 |
"""Converts an index (integer) in a token (byte) using the vocab."""
|