Spaces:
Running
Running
Adds dataset splits
Browse files
data.py
CHANGED
@@ -74,6 +74,17 @@ class SmolLM3Dataset:
|
|
74 |
try:
|
75 |
dataset = load_dataset(self.data_path)
|
76 |
logger.info(f"Loaded Hugging Face dataset: {self.data_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
return dataset
|
78 |
except Exception as e:
|
79 |
logger.error(f"Failed to load dataset: {e}")
|
|
|
74 |
try:
|
75 |
dataset = load_dataset(self.data_path)
|
76 |
logger.info(f"Loaded Hugging Face dataset: {self.data_path}")
|
77 |
+
# If 'train' exists but 'validation' or 'test' is missing, derive both from 'train' (this replaces any existing 'validation'/'test' split)
|
78 |
+
if ("train" in dataset) and ("validation" not in dataset or "test" not in dataset):
|
79 |
+
logger.info("Automatically splitting train into train/validation/test (98/1/1)")
|
80 |
+
split_dataset = dataset["train"].train_test_split(test_size=0.02, seed=42)
|
81 |
+
# Now split test into validation and test (1% each)
|
82 |
+
val_test_split = split_dataset["test"].train_test_split(test_size=0.5, seed=42)
|
83 |
+
dataset = {
|
84 |
+
"train": split_dataset["train"],
|
85 |
+
"validation": val_test_split["train"],
|
86 |
+
"test": val_test_split["test"]
|
87 |
+
}
|
88 |
return dataset
|
89 |
except Exception as e:
|
90 |
logger.error(f"Failed to load dataset: {e}")
|