zR committed on
Commit
b97dd95
1 Parent(s): b36cb68

fix with convert_tokens_to_string

Browse files
Files changed (1) hide show
  1. tokenization_chatglm.py +11 -4
tokenization_chatglm.py CHANGED
@@ -63,22 +63,22 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
63
  vocab.update(self.added_tokens_encoder)
64
  return vocab
65
 
66
- def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
67
  """
68
  Converts a sequence of tokens in a single string.
69
  """
70
  text = ""
71
  temp = b""
72
  for t in tokens:
 
 
73
  if isinstance(t, str):
74
  if temp:
75
  text += temp.decode("utf-8", errors="replace")
76
- temp = b""
77
- text += t
78
  elif isinstance(t, bytes):
79
  temp += t
80
  else:
81
- raise TypeError("token should only be of type types or str")
82
  if temp:
83
  text += temp.decode("utf-8", errors="replace")
84
  return text
@@ -90,6 +90,13 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
90
  tokens.append(self.decoder[t])
91
  return tokens
92
 
 
 
 
 
 
 
 
93
  def _convert_token_to_id(self, token):
94
  """ Converts a token (str) in an id using the vocab. """
95
  return self.mergeable_ranks[token]
 
63
  vocab.update(self.added_tokens_encoder)
64
  return vocab
65
 
66
+ def convert_tokens_to_string(self, tokens: List[Union[bytes, str, int]]) -> str:
67
  """
68
  Converts a sequence of tokens in a single string.
69
  """
70
  text = ""
71
  temp = b""
72
  for t in tokens:
73
+ if isinstance(t, int):
74
+ t = chr(t)
75
  if isinstance(t, str):
76
  if temp:
77
  text += temp.decode("utf-8", errors="replace")
 
 
78
  elif isinstance(t, bytes):
79
  temp += t
80
  else:
81
+ raise TypeError("token should only be of type int, bytes or str")
82
  if temp:
83
  text += temp.decode("utf-8", errors="replace")
84
  return text
 
90
  tokens.append(self.decoder[t])
91
  return tokens
92
 
93
+ def _tokenize(self, text, **kwargs):
94
+ tokens = []
95
+ ids = self.tokenizer.encode(text)
96
+ for t in ids:
97
+ tokens.append(self.decoder[t])
98
+ return tokens
99
+
100
  def _convert_token_to_id(self, token):
101
  """ Converts a token (str) in an id using the vocab. """
102
  return self.mergeable_ranks[token]