# sdlm/data/process_data.py
"""Tokenize the dataset and saves the output."""
import logging
import os
import sys
import datasets
from accelerate import Accelerator
from arguments import DataTrainingArguments, ModelArguments, TrainingArguments
from data.data_utils import tokenize_data
from datasets import DatasetDict, load_dataset
from transformers import AutoTokenizer, HfArgumentParser, set_seed
from transformers.utils.versions import require_version
require_version("datasets>=1.8.0")

logger = logging.getLogger(__name__)

def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments)
    )
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1])
        )
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
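    # Accelerator handle; passed to the tokenize_data helper below.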
    accelerator = Accelerator()

    # Set up logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)

    # Log a short summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bit training: {training_args.fp16}"
    )
    # Log the full training/evaluation configuration (info level, so main process only by default):
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set the seed (also used later for the train/validation split).
    set_seed(training_args.seed)
    assert data_args.dataset_name is not None, "A dataset name is required (--dataset_name)."
    # Download and load the dataset from the Hugging Face Hub.
    raw_datasets = datasets.DatasetDict()
    raw_datasets["train"] = load_dataset(
        data_args.dataset_name,
        data_args.dataset_config_name,
        cache_dir=model_args.cache_dir,
        use_auth_token=True if model_args.use_auth_token else None,
        split="train",
        verification_mode=data_args.verification_mode,
    )
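    # Common keyword arguments for AutoTokenizer.from_pretrained.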
    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, **tokenizer_kwargs
        )
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )
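    # Tokenize the raw training data with the repo's tokenize_data helper.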
    tokenized_datasets = tokenize_data(data_args, tokenizer, raw_datasets, accelerator)
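    # Carve a validation set out of the tokenized training split.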
    train_testvalid = tokenized_datasets["train"].train_test_split(
        test_size=data_args.validation_split_ratio,
        shuffle=True,
        seed=training_args.seed,
    )
    tokenized_datasets = DatasetDict(
        {"train": train_testvalid["train"], "validation": train_testvalid["test"]}
    )
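    # Save the processed splits to the output directory (the main process writes first).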
    with training_args.main_process_first():
        tokenized_datasets.save_to_disk(training_args.output_dir)
    logger.info(f"The processed data were written to {training_args.output_dir}")

if __name__ == "__main__":
    main()