Tonic committed on
Commit
231fcd0
·
verified ·
1 Parent(s): 0e297cd

adds dataset splits

Browse files
Files changed (1) hide show
  1. data.py +11 -0
data.py CHANGED
@@ -74,6 +74,17 @@ class SmolLM3Dataset:
74
  try:
75
  dataset = load_dataset(self.data_path)
76
  logger.info(f"Loaded Hugging Face dataset: {self.data_path}")
 
 
 
 
 
 
 
 
 
 
 
77
  return dataset
78
  except Exception as e:
79
  logger.error(f"Failed to load dataset: {e}")
 
74
  try:
75
  dataset = load_dataset(self.data_path)
76
  logger.info(f"Loaded Hugging Face dataset: {self.data_path}")
77
+ # If 'train' exists but 'validation' or 'test' is missing, re-split train into train/validation/test (replaces any existing splits)
78
+ if ("train" in dataset) and ("validation" not in dataset or "test" not in dataset):
79
+ logger.info("Automatically splitting train into train/validation/test (98/1/1)")
80
+ split_dataset = dataset["train"].train_test_split(test_size=0.02, seed=42)
81
+ # Now split test into validation and test (1% each)
82
+ val_test_split = split_dataset["test"].train_test_split(test_size=0.5, seed=42)
83
+ dataset = {
84
+ "train": split_dataset["train"],
85
+ "validation": val_test_split["train"],
86
+ "test": val_test_split["test"]
87
+ }
88
  return dataset
89
  except Exception as e:
90
  logger.error(f"Failed to load dataset: {e}")