"""Tokenize the dataset and saves the output.""" | |
import logging
import os
import sys

import datasets
from accelerate import Accelerator
from arguments import DataTrainingArguments, ModelArguments, TrainingArguments
from data.data_utils import tokenize_data
from datasets import DatasetDict, load_dataset
from transformers import AutoTokenizer, HfArgumentParser, set_seed
from transformers.utils.versions import require_version

require_version("datasets>=1.8.0")

logger = logging.getLogger(__name__)


def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments)
    )
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1])
        )
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
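
    # Initialize Accelerate; presumably the project-local tokenize_data helper
    # uses it to coordinate tokenization across processes (an assumption, since
    # its implementation lives in data/data_utils.py).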
    accelerator = Accelerator()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
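    # TrainingArguments picks a per-process log level, so replica processes
    # stay quieter than the main process by default.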
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)

    # Log a short summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Log the full training/evaluation parameters (INFO level, so main process only by default):
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)
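
    # This script only supports datasets hosted on the Hugging Face Hub, hence
    # the hard requirement on a dataset name below.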
    assert data_args.dataset_name is not None, "a dataset name is required"
    # Downloading and loading a dataset from the hub.
    raw_datasets = datasets.DatasetDict()
    raw_datasets["train"] = load_dataset(
        data_args.dataset_name,
        data_args.dataset_config_name,
        cache_dir=model_args.cache_dir,
        use_auth_token=True if model_args.use_auth_token else None,
        split="train",
        verification_mode=data_args.verification_mode,
    )
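
    # Shared kwargs for loading the tokenizer from the Hub.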
    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, **tokenizer_kwargs
        )
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )
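
    # Tokenize the raw text with the project-local helper, which is assumed to
    # handle batching and caching via the Accelerator passed in.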
    tokenized_datasets = tokenize_data(data_args, tokenizer, raw_datasets, accelerator)
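
    # Carve a validation split out of the tokenized training set.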
    train_testvalid = tokenized_datasets["train"].train_test_split(
        test_size=data_args.validation_split_ratio,
        shuffle=True,
        seed=training_args.seed,
    )
    tokenized_datasets = DatasetDict(
        {"train": train_testvalid["train"], "validation": train_testvalid["test"]}
    )
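
    # Run the save on the main process first; the other ranks enter the block
    # only once it finishes, so they never race it on the output directory.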
    with training_args.main_process_first():
        tokenized_datasets.save_to_disk(training_args.output_dir)
    logger.info(f"The processed data is written to {training_args.output_dir}")


if __name__ == "__main__":
    main()
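
# Example invocation (the script and config file names here are illustrative;
# the available flags come from the ModelArguments, DataTrainingArguments, and
# TrainingArguments dataclasses defined in arguments.py):
#
#   python tokenize_and_save.py tokenize_config.json
#
# or with explicit flags, e.g.:
#
#   python tokenize_and_save.py \
#       --model_name_or_path gpt2 \
#       --dataset_name wikitext \
#       --dataset_config_name wikitext-103-raw-v1 \
#       --output_dir ./tokenized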