Spaces:
Running
Running
Fixes data collation issue (pads batches to a multiple of 8 and returns PyTorch tensors)
Browse files
data.py
CHANGED
@@ -261,6 +261,8 @@ class SmolLM3Dataset:
|
|
261 |
return DataCollatorForLanguageModeling(
|
262 |
tokenizer=self.tokenizer,
|
263 |
mlm=False, # We're doing causal LM, not masked LM
|
|
|
|
|
264 |
)
|
265 |
|
266 |
def create_sample_dataset(output_path: str = "my_dataset"):
|
|
|
261 |
return DataCollatorForLanguageModeling(
|
262 |
tokenizer=self.tokenizer,
|
263 |
mlm=False, # We're doing causal LM, not masked LM
|
264 |
+
pad_to_multiple_of=8, # Pad to multiple of 8 for efficiency
|
265 |
+
return_tensors="pt", # Ensure we return PyTorch tensors
|
266 |
)
|
267 |
|
268 |
def create_sample_dataset(output_path: str = "my_dataset"):
|