optimize the iteration when tokenizing large datasets (#332)
src/axolotl/utils/data.py  +10 -1
@@ -1,5 +1,6 @@
 """Module containing data utilities"""
 import functools
+import itertools
 import logging
 from hashlib import md5
 from pathlib import Path
@@ -264,8 +265,16 @@ def load_tokenized_prepared_datasets(
     LOG.info("tokenizing, merging, and shuffling master dataset")

     samples: List[int] = []
+    chunk_size = 1000
     for d in datasets:
-
+        d_iter = iter(d)
+        while True:
+            chunk = list(itertools.islice(d_iter, chunk_size))
+            if not chunk:
+                break
+            samples.extend(chunk)
+
+    LOG.info("shuffle")
     dataset = Dataset.from_list(samples).shuffle(seed=seed)
     if cfg.local_rank == 0:
         LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
|