Tonic committed on
Commit
5c7e6ea
·
verified ·
1 Parent(s): d60ab6c

fixes data collation issue

Browse files
Files changed (1) hide show
  1. data.py +2 -0
data.py CHANGED
@@ -261,6 +261,8 @@ class SmolLM3Dataset:
261
  return DataCollatorForLanguageModeling(
262
  tokenizer=self.tokenizer,
263
  mlm=False, # We're doing causal LM, not masked LM
 
 
264
  )
265
 
266
  def create_sample_dataset(output_path: str = "my_dataset"):
 
261
  return DataCollatorForLanguageModeling(
262
  tokenizer=self.tokenizer,
263
  mlm=False, # We're doing causal LM, not masked LM
264
+ pad_to_multiple_of=8, # Pad to multiple of 8 for efficiency
265
+ return_tensors="pt", # Ensure we return PyTorch tensors
266
  )
267
 
268
  def create_sample_dataset(output_path: str = "my_dataset"):