Upload tokenization_chatglm.py
#11
by
Blueberry001
- opened
- tokenization_chatglm.py +7 -14
tokenization_chatglm.py
CHANGED
@@ -66,7 +66,6 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
|
|
66 |
model_input_names = ["input_ids", "attention_mask", "position_ids"]
|
67 |
|
68 |
def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs):
|
69 |
-
super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
|
70 |
self.name = "GLMTokenizer"
|
71 |
|
72 |
self.vocab_file = vocab_file
|
@@ -76,6 +75,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
|
|
76 |
"<eos>": self.tokenizer.eos_id,
|
77 |
"<pad>": self.tokenizer.pad_id
|
78 |
}
|
|
|
79 |
|
80 |
def get_command(self, token):
|
81 |
if token in self.special_tokens:
|
@@ -225,7 +225,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
|
|
225 |
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
|
226 |
"""
|
227 |
# Load from model defaults
|
228 |
-
|
229 |
|
230 |
required_input = encoded_inputs[self.model_input_names[0]]
|
231 |
seq_length = len(required_input)
|
@@ -248,17 +248,10 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
|
|
248 |
if needs_to_be_padded:
|
249 |
difference = max_length - len(required_input)
|
250 |
|
251 |
-
if
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
|
257 |
-
else:
|
258 |
-
if "attention_mask" in encoded_inputs:
|
259 |
-
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
|
260 |
-
if "position_ids" in encoded_inputs:
|
261 |
-
encoded_inputs["position_ids"] = encoded_inputs["position_ids"] + [0] * difference
|
262 |
-
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
|
263 |
|
264 |
return encoded_inputs
|
|
|
66 |
model_input_names = ["input_ids", "attention_mask", "position_ids"]
|
67 |
|
68 |
def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs):
|
|
|
69 |
self.name = "GLMTokenizer"
|
70 |
|
71 |
self.vocab_file = vocab_file
|
|
|
75 |
"<eos>": self.tokenizer.eos_id,
|
76 |
"<pad>": self.tokenizer.pad_id
|
77 |
}
|
78 |
+
super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
|
79 |
|
80 |
def get_command(self, token):
|
81 |
if token in self.special_tokens:
|
|
|
225 |
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
|
226 |
"""
|
227 |
# Load from model defaults
|
228 |
+
assert self.padding_side == "left"
|
229 |
|
230 |
required_input = encoded_inputs[self.model_input_names[0]]
|
231 |
seq_length = len(required_input)
|
|
|
248 |
if needs_to_be_padded:
|
249 |
difference = max_length - len(required_input)
|
250 |
|
251 |
+
if "attention_mask" in encoded_inputs:
|
252 |
+
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
|
253 |
+
if "position_ids" in encoded_inputs:
|
254 |
+
encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
|
255 |
+
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
256 |
|
257 |
return encoded_inputs
|