Tonic committed on
Commit
c740b39
·
verified ·
1 Parent(s): 5c7e6ea

fixes data collation issue with padding

Browse files
Files changed (1) hide show
  1. data.py +4 -3
data.py CHANGED
@@ -167,13 +167,13 @@ class SmolLM3Dataset:
167
 
168
  def tokenize_function(examples):
169
  """Tokenize the examples"""
170
- # Tokenize the texts
171
  tokenized = self.tokenizer(
172
  examples["text"],
173
  truncation=True,
174
- padding=False,
175
  max_length=self.max_seq_length,
176
- return_overflowing_tokens=True,
177
  return_length=True,
178
  )
179
 
@@ -263,6 +263,7 @@ class SmolLM3Dataset:
263
  mlm=False, # We're doing causal LM, not masked LM
264
  pad_to_multiple_of=8, # Pad to multiple of 8 for efficiency
265
  return_tensors="pt", # Ensure we return PyTorch tensors
 
266
  )
267
 
268
  def create_sample_dataset(output_path: str = "my_dataset"):
 
167
 
168
  def tokenize_function(examples):
169
  """Tokenize the examples"""
170
+ # Tokenize the texts with fixed length
171
  tokenized = self.tokenizer(
172
  examples["text"],
173
  truncation=True,
174
+ padding=True, # Enable padding during tokenization
175
  max_length=self.max_seq_length,
176
+ return_overflowing_tokens=False, # Don't return overflowing tokens
177
  return_length=True,
178
  )
179
 
 
263
  mlm=False, # We're doing causal LM, not masked LM
264
  pad_to_multiple_of=8, # Pad to multiple of 8 for efficiency
265
  return_tensors="pt", # Ensure we return PyTorch tensors
266
+ padding=True, # Enable padding
267
  )
268
 
269
  def create_sample_dataset(output_path: str = "my_dataset"):