winglian committed on
Commit
a944f7b
·
unverified ·
1 Parent(s): 9d4225a

load explicit splits on datasets (#1652)

Browse files
Files changed (1) hide show
  1. src/axolotl/utils/data/sft.py +4 -0
src/axolotl/utils/data/sft.py CHANGED
@@ -308,12 +308,16 @@ def load_tokenized_prepared_datasets(
308
  "unhandled dataset load: local path exists, but is neither a directory or a file"
309
  )
310
  elif ds_from_hub:
 
 
 
311
  ds = load_dataset(
312
  config_dataset.path,
313
  name=config_dataset.name,
314
  streaming=False,
315
  data_files=config_dataset.data_files,
316
  token=use_auth_token,
 
317
  )
318
  elif ds_from_cloud and remote_file_system:
319
  if remote_file_system.isdir(config_dataset.path):
 
308
  "unhandled dataset load: local path exists, but is neither a directory or a file"
309
  )
310
  elif ds_from_hub:
311
+ load_ds_kwargs = {}
312
+ if config_dataset.split:
313
+ load_ds_kwargs = {"split": config_dataset.split}
314
  ds = load_dataset(
315
  config_dataset.path,
316
  name=config_dataset.name,
317
  streaming=False,
318
  data_files=config_dataset.data_files,
319
  token=use_auth_token,
320
+ **load_ds_kwargs,
321
  )
322
  elif ds_from_cloud and remote_file_system:
323
  if remote_file_system.isdir(config_dataset.path):