"""Manual smoke test for megatron.data.indexed_dataset.

Loads an indexed dataset, optionally detokenizes and prints its documents,
and exercises the get() API with offset/length arguments.
"""

import argparse
import os
import sys

import torch

# Add the repository root to sys.path so the megatron package can be imported
# when this script is run without installing it.
script_dir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.join(script_dir, "../../../"))

from megatron.data import indexed_dataset
from megatron.tokenizer import build_tokenizer


def test_indexed_dataset(args):
    """Detokenize and print the first `args.count` documents of the dataset."""
    ds = indexed_dataset.make_dataset(args.data, args.dataset_impl)
    tokenizer = build_tokenizer(args)
    # Basic stats: number of doc_idx entries, number of samples, and the
    # sample offset marking the end of the last document.
    print(len(ds.doc_idx))
    print(len(ds))
    print(ds.doc_idx[-1])
    if ds.supports_prefetch:
        # Warm the cache for implementations that support prefetching.
        ds.prefetch(range(len(ds)))
    if args.count > len(ds.doc_idx) - 1:
        args.count = len(ds.doc_idx) - 1

    for i in range(args.count):
        # doc_idx[i] .. doc_idx[i + 1] is the range of samples (sentences)
        # belonging to document i.
        start = ds.doc_idx[i]
        end = ds.doc_idx[i + 1]
        ids = ds[start:end]
        print(f"Document {i}:")
        print("--------------")
        for s in ids:
            assert len(s) > 0
            token_ids = s.data.tolist()
            text = tokenizer.detokenize(token_ids)
            print(text)
            print("---")


def test_indexed_dataset_get(args):
    """Exercise get() on sample 0: full read, then suffix, prefix, and middle slice."""
    ds = indexed_dataset.make_dataset(args.data, args.dataset_impl)
    tokenizer = build_tokenizer(args)
    size = ds.sizes[0]
    print(f"size: {size}")
    full = ds.get(0)
    print(full)

    print("---")
    # Last 10 tokens of sample 0.
    end = ds.get(0, offset=size - 10)
    print(end)

    # First 10 tokens of sample 0.
    start = ds.get(0, length=10)
    print(start)

    # 8 tokens starting at offset 2.
    part = ds.get(0, offset=2, length=8)
    print(part)


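# Optional sanity check: a minimal sketch assuming get() returns
# NumPy-compatible arrays (true for the mmap implementation), so that
# offset/length reads should match slices of a full read of the same sample.
def test_indexed_dataset_get_consistency(args):
    import numpy as np

    ds = indexed_dataset.make_dataset(args.data, args.dataset_impl)
    full = ds.get(0)
    # Prefix, suffix, and middle slice should agree with the full read.
    assert np.array_equal(full[:10], ds.get(0, length=10))
    assert np.array_equal(full[-10:], ds.get(0, offset=len(full) - 10))
    assert np.array_equal(full[2:10], ds.get(0, offset=2, length=8))
    print("get() offset/length reads match slices of the full sample")

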
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str, help='prefix to data files')
    parser.add_argument('--dataset-impl', type=str, default='infer',
                        choices=['lazy', 'cached', 'mmap', 'infer'])
    parser.add_argument('--count', type=int, default=10,
                        help='Number of samples/documents to print')

    group = parser.add_argument_group(title='tokenizer')
    group.add_argument('--tokenizer-type', type=str, required=True,
                       choices=['BertWordPieceLowerCase',
                                'GPT2BPETokenizer'],
                       help='What type of tokenizer to use.')
    group.add_argument('--vocab-file', type=str, default=None,
                       help='Path to the vocab file')
    group.add_argument('--merge-file', type=str, default=None,
                       help='Path to the BPE merge file (if necessary).')

    parser.add_argument('--epochs', type=int, default=5,
                        help='Number of epochs to plan for')
    parser.add_argument('--max-num-samples', type=int, default=None,
                        help='Maximum number of samples to plan for')
    parser.add_argument('--masked-lm-prob', type=float, default=0.15,
                        help='probability of masking tokens')
    parser.add_argument('--seq-length', type=int, default=512,
                        help='maximum sequence length')
    parser.add_argument('--short-seq-prob', type=float, default=0.1,
                        help='probability of creating a short sequence')
    parser.add_argument('--seed', type=int, default=1234,
                        help='random seed')
    args = parser.parse_args()

    # Attributes that build_tokenizer expects but that this standalone script
    # does not parse itself.
    args.rank = 0
    args.make_vocab_size_divisible_by = 128
    args.tensor_model_parallel_size = 1

    if args.dataset_impl == "infer":
        args.dataset_impl = indexed_dataset.infer_dataset_impl(args.data)

    # test_indexed_dataset(args)  # uncomment to print detokenized documents
    test_indexed_dataset_get(args)


if __name__ == "__main__":
    main()
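
# Example invocation (paths below are hypothetical placeholders):
#
#   python <path to this script> \
#       --data /path/to/dataset_prefix \
#       --dataset-impl mmap \
#       --count 5 \
#       --tokenizer-type GPT2BPETokenizer \
#       --vocab-file /path/to/gpt2-vocab.json \
#       --merge-file /path/to/gpt2-merges.txt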