zR committed
Commit • b97dd95 • 1 Parent(s): b36cb68

fix with convert_tokens_to_string

tokenization_chatglm.py (+11 -4)
tokenization_chatglm.py CHANGED

@@ -63,22 +63,22 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
         vocab.update(self.added_tokens_encoder)
         return vocab
 
-    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
+    def convert_tokens_to_string(self, tokens: List[Union[bytes, str, int]]) -> str:
         """
         Converts a sequence of tokens in a single string.
         """
         text = ""
         temp = b""
         for t in tokens:
+            if isinstance(t, int):
+                t = chr(t)
             if isinstance(t, str):
                 if temp:
                     text += temp.decode("utf-8", errors="replace")
-                    temp = b""
-                text += t
             elif isinstance(t, bytes):
                 temp += t
             else:
-                raise TypeError("token should only be of type types or str")
+                raise TypeError("token should only be of type int, bytes or str")
         if temp:
             text += temp.decode("utf-8", errors="replace")
         return text
@@ -90,6 +90,13 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
             tokens.append(self.decoder[t])
         return tokens
 
+    def _tokenize(self, text, **kwargs):
+        tokens = []
+        ids = self.tokenizer.encode(text)
+        for t in ids:
+            tokens.append(self.decoder[t])
+        return tokens
+
     def _convert_token_to_id(self, token):
         """ Converts a token (str) in an id using the vocab. """
         return self.mergeable_ranks[token]
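
For context, the byte buffer in convert_tokens_to_string exists because the vocabulary stores pieces as raw bytes, so a single multibyte UTF-8 character can be split across several tokens and only decodes cleanly once the buffered bytes are joined. Below is a minimal standalone sketch of that merge step as patched in this commit; the helper name merge_tokens is illustrative only and not part of the tokenizer.

from typing import List, Union

def merge_tokens(tokens: List[Union[bytes, str, int]]) -> str:
    # Mirrors the patched logic: int tokens are mapped through chr(), bytes are
    # buffered so multibyte UTF-8 sequences split across BPE pieces stay intact,
    # and the buffer is decoded once a str token is seen or the input ends.
    text = ""
    temp = b""
    for t in tokens:
        if isinstance(t, int):
            t = chr(t)
        if isinstance(t, str):
            if temp:
                text += temp.decode("utf-8", errors="replace")
        elif isinstance(t, bytes):
            temp += t
        else:
            raise TypeError("token should only be of type int, bytes or str")
    if temp:
        text += temp.decode("utf-8", errors="replace")
    return text

# "你好" is three UTF-8 bytes per character; a BPE split can fall between those
# bytes, so the pieces only become readable after the buffered decode at the end.
pieces = [b"\xe4\xbd", b"\xa0\xe5\xa5", b"\xbd", b", world!"]
print(merge_tokens(pieces))  # -> 你好, world!

The new _tokenize follows the same convention: it encodes the text with the underlying BPE model (self.tokenizer.encode) and maps each id back to its byte-level piece through self.decoder, so the pieces handed to convert_tokens_to_string are exactly the kind of bytes shown above.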