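"""Fine-tune a RoBERTa backbone for extractive question answering.

Loads the stanfordnlp/coqa dataset, preprocesses it into tokenized inputs with
answer-span labels, and trains a custom RobertaForQuestionAnswering model with
the Hugging Face Trainer, logging runs to Weights & Biases.
"""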
import argparse
import logging
import os
import warnings

import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    DefaultDataCollator,
    TrainingArguments,
    Trainer,
)

from model import RobertaForQuestionAnswering
from utils import RobertaConfig, ExtractiveQAPreProcesing

# Route the Trainer's Weights & Biases logging to this project.
os.environ["WANDB_PROJECT"] = "RoBERTa_QA_Finetune"

warnings.filterwarnings("ignore")
def parse_arguments():
parser = argparse.ArgumentParser(description="Wav2Vec2 Finetuning Arguments on Librispeech")
### Experiment Logging ###
parser.add_argument(
"--experiment_name",
required=True,
type=str
)
parser.add_argument(
"--working_directory",
required=True,
type=str
)
parser.add_argument(
"--path_to_cache_dir",
help="Path to huggingface cache if different from default",
default=None,
type=str
)
parser.add_argument(
"--num_train_epochs",
help="Number of epochs you want to train for",
default=3,
type=int
)
parser.add_argument(
"--save_steps",
help="After how many steps do you want to log a checkpoint",
default=500,
type=int
)
parser.add_argument(
"--eval_steps",
help="After how many steps do you want to evaluate on eval data",
default=500,
type=int
)
parser.add_argument(
"--logging_steps",
help="After how many steps do you want to log to Weights and Biases (if installed)",
default=500,
type=int
)
parser.add_argument(
"--warmup_steps",
help="Number of learning rate warmup steps",
default=100,
type=int
)
### Training Arguments ###
parser.add_argument(
"--per_device_batch_size",
help="Batch size for every gradient accumulation steps",
default=2,
type=int
)
parser.add_argument(
"--gradient_accumulation_steps",
help="Number of gradient accumulation steps you want",
default=2,
type=int
)
parser.add_argument(
"--learning_rate",
help="Max learning rate that we warmup to",
default=2e-5,
type=float
)
parser.add_argument(
"--weight_decay",
help="Weight decay applied to model parameters during training",
default=0.01,
type=float
)
parser.add_argument(
"--save_total_limit",
help="Max number of checkpoints to save",
default=4,
type=int
)
### Backbone Arguments ###
parser.add_argument(
"--huggingface_model_name",
help="Name for pretrained RoBERTa backbone and Tokenizer",
default="deepset/roberta-base-squad2",
type=str
)
parser.add_argument(
"--path_to_pretrained_backbone",
help="Path to model weights stored from our pretraining to initialize the backbone",
default=None,
type=str
)
parser.add_argument(
"--pretrained_backbone",
help="Do you want want a `pretrained` backbone that we made (need to provide path_to_pretrained_backbone), \
`pretrained_huggingface` backbone (then need huggingface_model_name), or `random` initialized backbone",
choices=("pretrained", "pretrained_huggingface", "random"),
type=str
)
parser.add_argument('--resume_from_checkpoint', type=str, default=None)
parser.add_argument('--model_name_or_path', type=str, default="roberta-base")
args = parser.parse_args()
return args
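# Example invocation (illustrative only; paths and names are placeholders):
#   python finetune_roberta_qa.py \
#       --experiment_name finetune_qa_hf_roberta_backbone \
#       --working_directory ./checkpoints \
#       --pretrained_backbone pretrained_huggingface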
### Load Arguments ###
args = parse_arguments()
def load_tokenizer(model_name):
try:
return RobertaTokenizerFast.from_pretrained(model_name)
except Exception as e:
logging.error(f"Failed to load tokenizer: {e}")
raise
def load_model(config):
try:
return RobertaForQuestionAnswering(config)
except Exception as e:
logging.error(f"Failed to load model: {e}")
raise
logging.basicConfig(level=logging.INFO)
logging.info("----------Loading dataset and tokenizer----------")
### Load Tokenizer ###
tokenizer = load_tokenizer(args.huggingface_model_name)
### Load and Preprocess Dataset ###
dataset = load_dataset("stanfordnlp/coqa")
processor = ExtractiveQAPreProcesing()
tokenized_dataset = dataset.map(processor, batched=True, remove_columns=dataset["train"].column_names)
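# ExtractiveQAPreProcesing (from utils) is assumed to emit input_ids, attention_mask,
# and the start/end token positions of each answer span: the fields the Trainer
# forwards to RobertaForQuestionAnswering during training.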
### Load Model ###
config = RobertaConfig(pretrained_backbone=args.pretrained_backbone,
                       path_to_pretrained_weights=args.path_to_pretrained_backbone)
model = load_model(config)
if args.resume_from_checkpoint is not None:
    # Trainer checkpoints store model weights in pytorch_model.bin
    # (model.safetensors in newer transformers versions);
    # training_args.bin only holds the serialized TrainingArguments.
    model.load_state_dict(torch.load(f"{args.resume_from_checkpoint}/pytorch_model.bin", map_location="cpu"))
### Default collator suffices: sequences were already padded to a fixed length during preprocessing ###
data_collator = DefaultDataCollator()
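# If preprocessing did not pad to a fixed length, transformers' DataCollatorWithPadding(tokenizer)
# could be swapped in here to pad each batch dynamically instead.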
### Define Training Arguments ###
training_args = TrainingArguments(
output_dir=os.path.join(args.working_directory, args.experiment_name),
per_device_train_batch_size=args.per_device_batch_size,
gradient_accumulation_steps=args.gradient_accumulation_steps,
    evaluation_strategy="steps",
num_train_epochs=args.num_train_epochs,
bf16=True,
save_steps=args.save_steps,
eval_steps=args.eval_steps,
logging_steps=args.logging_steps,
learning_rate=args.learning_rate,
weight_decay=args.weight_decay,
warmup_steps=args.warmup_steps,
save_total_limit=args.save_total_limit,
run_name=args.experiment_name,
)
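# Effective batch size = per_device_batch_size * gradient_accumulation_steps * num_devices
# (2 * 2 = 4 per device with the defaults above).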
trainer = Trainer(
model=model,
args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
tokenizer=tokenizer,
data_collator=data_collator,
)
### TRAIN MODEL !!! ###
trainer.train(resume_from_checkpoint=args.resume_from_checkpoint)
### Save Final Model ###
trainer.save_model(os.path.join(args.working_directory, args.experiment_name, "final_model"))
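# Reloading the saved weights later (a sketch, assuming the same custom
# RobertaForQuestionAnswering class and config as above, and a transformers
# version that writes pytorch_model.bin rather than model.safetensors):
#   model = RobertaForQuestionAnswering(config)
#   state_dict = torch.load(os.path.join(args.working_directory, args.experiment_name,
#                                        "final_model", "pytorch_model.bin"), map_location="cpu")
#   model.load_state_dict(state_dict)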