Upload tokenization_chatglm.py

#11
Files changed (1)
  1. tokenization_chatglm.py +7 -14
tokenization_chatglm.py CHANGED
@@ -66,7 +66,6 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
66
  model_input_names = ["input_ids", "attention_mask", "position_ids"]
67
 
68
  def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs):
69
- super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
70
  self.name = "GLMTokenizer"
71
 
72
  self.vocab_file = vocab_file
@@ -76,6 +75,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
76
  "<eos>": self.tokenizer.eos_id,
77
  "<pad>": self.tokenizer.pad_id
78
  }
 
79
 
80
  def get_command(self, token):
81
  if token in self.special_tokens:
@@ -225,7 +225,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
225
  (optional) Set to False to avoid returning attention mask (default: set to model specifics)
226
  """
227
  # Load from model defaults
228
- # assert self.padding_side == "left"
229
 
230
  required_input = encoded_inputs[self.model_input_names[0]]
231
  seq_length = len(required_input)
@@ -248,17 +248,10 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
248
  if needs_to_be_padded:
249
  difference = max_length - len(required_input)
250
 
251
- if self.padding_side == "left":
252
- if "attention_mask" in encoded_inputs:
253
- encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
254
- if "position_ids" in encoded_inputs:
255
- encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
256
- encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
257
- else:
258
- if "attention_mask" in encoded_inputs:
259
- encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
260
- if "position_ids" in encoded_inputs:
261
- encoded_inputs["position_ids"] = encoded_inputs["position_ids"] + [0] * difference
262
- encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
263
 
264
  return encoded_inputs
 
66
  model_input_names = ["input_ids", "attention_mask", "position_ids"]
67
 
68
  def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs):
 
69
  self.name = "GLMTokenizer"
70
 
71
  self.vocab_file = vocab_file
 
75
  "<eos>": self.tokenizer.eos_id,
76
  "<pad>": self.tokenizer.pad_id
77
  }
78
+ super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
79
 
80
  def get_command(self, token):
81
  if token in self.special_tokens:
 
225
  (optional) Set to False to avoid returning attention mask (default: set to model specifics)
226
  """
227
  # Load from model defaults
228
+ assert self.padding_side == "left"
229
 
230
  required_input = encoded_inputs[self.model_input_names[0]]
231
  seq_length = len(required_input)
 
248
  if needs_to_be_padded:
249
  difference = max_length - len(required_input)
250
 
251
+ if "attention_mask" in encoded_inputs:
252
+ encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
253
+ if "position_ids" in encoded_inputs:
254
+ encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
255
+ encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
 
 
 
 
 
 
 
256
 
257
  return encoded_inputs