Load explicitly configured splits when loading datasets from the Hub (#1652)
Browse files
src/axolotl/utils/data/sft.py
CHANGED
@@ -308,12 +308,16 @@ def load_tokenized_prepared_datasets(
|
|
308 |
"unhandled dataset load: local path exists, but is neither a directory or a file"
|
309 |
)
|
310 |
elif ds_from_hub:
|
|
|
|
|
|
|
311 |
ds = load_dataset(
|
312 |
config_dataset.path,
|
313 |
name=config_dataset.name,
|
314 |
streaming=False,
|
315 |
data_files=config_dataset.data_files,
|
316 |
token=use_auth_token,
|
|
|
317 |
)
|
318 |
elif ds_from_cloud and remote_file_system:
|
319 |
if remote_file_system.isdir(config_dataset.path):
|
|
|
308 |
"unhandled dataset load: local path exists, but is neither a directory or a file"
|
309 |
)
|
310 |
elif ds_from_hub:
|
311 |
+
load_ds_kwargs = {}
|
312 |
+
if config_dataset.split:
|
313 |
+
load_ds_kwargs = {"split": config_dataset.split}
|
314 |
ds = load_dataset(
|
315 |
config_dataset.path,
|
316 |
name=config_dataset.name,
|
317 |
streaming=False,
|
318 |
data_files=config_dataset.data_files,
|
319 |
token=use_auth_token,
|
320 |
+
**load_ds_kwargs,
|
321 |
)
|
322 |
elif ds_from_cloud and remote_file_system:
|
323 |
if remote_file_system.isdir(config_dataset.path):
|