fixes data collation issue with padding
data.py CHANGED
@@ -167,13 +167,13 @@ class SmolLM3Dataset:
 
         def tokenize_function(examples):
            """Tokenize the examples"""
-            # Tokenize the texts
+            # Tokenize the texts with fixed length
            tokenized = self.tokenizer(
                examples["text"],
                truncation=True,
-                padding=
+                padding=True, # Enable padding during tokenization
                max_length=self.max_seq_length,
-                return_overflowing_tokens=
+                return_overflowing_tokens=False, # Don't return overflowing tokens
                return_length=True,
            )
 
@@ -263,6 +263,7 @@ class SmolLM3Dataset:
            mlm=False, # We're doing causal LM, not masked LM
            pad_to_multiple_of=8, # Pad to multiple of 8 for efficiency
            return_tensors="pt", # Ensure we return PyTorch tensors
+            padding=True, # Enable padding
        )
 
 def create_sample_dataset(output_path: str = "my_dataset"):
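
For context, a minimal, self-contained sketch of how the padded tokenize_function and the causal-LM data collator are assumed to fit together after this change. The gpt2 checkpoint, the "text" field, and max_seq_length = 512 are stand-ins for the Space's actual values, and the collator is shown with its standard constructor arguments only; this is not the Space's code.

from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder for the Space's SmolLM3 tokenizer
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # causal-LM tokenizers often ship without a pad token

max_seq_length = 512  # assumed stand-in for self.max_seq_length

def tokenize_function(examples):
    """Tokenize the examples, padding them so batches collate cleanly."""
    return tokenizer(
        examples["text"],
        truncation=True,
        padding=True,                     # pad to the longest sequence in the batch
        max_length=max_seq_length,
        return_overflowing_tokens=False,  # keep exactly one sequence per example
        return_length=True,
    )

collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,             # causal LM, not masked LM
    pad_to_multiple_of=8,  # pad batch length up to a multiple of 8 for efficiency
    return_tensors="pt",
)

# Toy usage: tokenize two strings, then hand per-example features to the collator.
enc = tokenize_function({"text": ["hello world", "a somewhat longer example sentence"]})
features = [
    {"input_ids": ids, "attention_mask": mask}
    for ids, mask in zip(enc["input_ids"], enc["attention_mask"])
]
batch = collator(features)
print(batch["input_ids"].shape, batch["labels"].shape)  # equal-length, padded tensors

With padding enabled at tokenization time and pad_to_multiple_of=8 in the collator, every batch stacks into tensors of one uniform length, which is what the collation fix above is aiming for.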