Spaces:
Sleeping
Sleeping
File size: 676 Bytes
17ff0d8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 |
from datasets import DatasetDict, load_from_disk
# Derive an evaluation copy of the dataset stored at `tokenized_data_path`:
# the "train" split is carried over unchanged, while "validation" is replaced
# by the "test" portion of a shuffled, seeded split of the original
# "validation" split. The result is written to `output_dir`.

# --- Configuration -----------------------------------------------------------
seed = 42
# Handed to `train_test_split` as `test_size`; the "test" side of that split
# is what ends up as the new "validation" split.
validation_split_ratio = 0.1414827391058291
tokenized_data_path = "/home/lily/jt856/documents/simplex-diffusion/processed_data/openwebtext_256_split"
output_dir = "/home/lily/jt856/documents/simplex-diffusion/processed_data/openwebtext_256_split_gpt_eval"

# --- Load --------------------------------------------------------------------
tokenized_datasets = load_from_disk(tokenized_data_path)
original_validation = tokenized_datasets["validation"]

# --- Subsample the validation split ------------------------------------------
# A fixed seed keeps the selected subset reproducible across runs.
train_testvalid = original_validation.train_test_split(
    test_size=validation_split_ratio,
    shuffle=True,
    seed=seed,
)

# --- Assemble and save -------------------------------------------------------
# Only the "test" part of the split is kept; the "train" part is discarded.
eval_splits = {
    "train": tokenized_datasets["train"],
    "validation": train_testvalid["test"],
}
tokenized_datasets = DatasetDict(eval_splits)
tokenized_datasets.save_to_disk(output_dir)
|