"""Tokenize the dataset and saves the output."""
import logging
import os
import sys

import datasets
from accelerate import Accelerator
from arguments import DataTrainingArguments, ModelArguments, TrainingArguments
from data.data_utils import tokenize_data
from datasets import DatasetDict, load_dataset
from transformers import AutoTokenizer, HfArgumentParser, set_seed
from transformers.utils.versions import require_version

require_version("datasets>=1.8.0")

logger = logging.getLogger(__name__)

def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments)
    )
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1])
        )
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
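
    # Accelerator() sets up the Accelerate state (device placement, distributed
    # processes); the instance is passed to tokenize_data below.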
    accelerator = Accelerator()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)

    # Log a short summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Log the training/evaluation parameters (on the main process only):
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed so shuffling and the train/validation split below are reproducible.
    set_seed(training_args.seed)

    assert data_args.dataset_name is not None
    # Downloading and loading a dataset from the hub.
    raw_datasets = datasets.DatasetDict()
    raw_datasets["train"] = load_dataset(
        data_args.dataset_name,
        data_args.dataset_config_name,
        cache_dir=model_args.cache_dir,
        use_auth_token=True if model_args.use_auth_token else None,
        split="train",
        verification_mode=data_args.verification_mode,
    )
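
    # Only the "train" split is loaded here; a validation split is carved out of it
    # after tokenization (see the train_test_split call below).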
    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
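
    # Load the tokenizer from the given checkpoint; training a new tokenizer from
    # scratch is deliberately unsupported here (see the error message below).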
    if model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, **tokenizer_kwargs
        )
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )
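
    # Tokenize the raw datasets with the project-local helper imported from
    # data/data_utils; the accelerator is passed through, presumably so the helper
    # can coordinate work across processes.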
    tokenized_datasets = tokenize_data(data_args, tokenizer, raw_datasets, accelerator)
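
    # Carve a validation set out of the tokenized training data, sized by
    # validation_split_ratio and shuffled with the global seed.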
    train_testvalid = tokenized_datasets["train"].train_test_split(
        test_size=data_args.validation_split_ratio,
        shuffle=True,
        seed=training_args.seed,
    )
    tokenized_datasets = DatasetDict(
        {"train": train_testvalid["train"], "validation": train_testvalid["test"]}
    )
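
    # Persist both splits to output_dir; main_process_first() has the main process
    # run the save first while the other processes wait.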
    with training_args.main_process_first():
        tokenized_datasets.save_to_disk(training_args.output_dir)
    logger.info(f"The processed data are written to {training_args.output_dir}")
if __name__ == "__main__":
main()