"""Tokenize the dataset and saves the output.""" import logging import os import sys import datasets from accelerate import Accelerator from arguments import DataTrainingArguments, ModelArguments, TrainingArguments from data.data_utils import tokenize_data from datasets import DatasetDict, load_dataset from transformers import AutoTokenizer, HfArgumentParser, set_seed from transformers.utils.versions import require_version require_version("datasets>=1.8.0") logger = logging.getLogger(__name__) def main(): parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments) ) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1]) ) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() accelerator = Accelerator() # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): logger.info(f"Training/evaluation parameters {training_args}") # Set seed before initializing model. set_seed(training_args.seed) assert data_args.dataset_name is not None # Downloading and loading a dataset from the hub. raw_datasets = datasets.DatasetDict() raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, split="train", verification_mode=data_args.verification_mode, ) tokenizer_kwargs = { "cache_dir": model_args.cache_dir, "use_fast": model_args.use_fast_tokenizer, "revision": model_args.model_revision, "use_auth_token": True if model_args.use_auth_token else None, } if model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, **tokenizer_kwargs ) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) tokenized_datasets = tokenize_data(data_args, tokenizer, raw_datasets, accelerator) train_testvalid = tokenized_datasets["train"].train_test_split( test_size=data_args.validation_split_ratio, shuffle=True, seed=training_args.seed, ) tokenized_datasets = DatasetDict( {"train": train_testvalid["train"], "validation": train_testvalid["test"]} ) with training_args.main_process_first(): tokenized_datasets.save_to_disk(training_args.output_dir) logger.info(f"The processed data are written in {training_args.output_dir}") if __name__ == "__main__": main()