import argparse
import logging
import os
import glob
import random
import copy
import numpy as np
import pytorch_lightning as pl
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from torch.utils.data import DataLoader, TensorDataset
from transformers import (
AdamW,
AutoConfig,
AutoModel,
AutoModelForPreTraining,
AutoModelForQuestionAnswering,
AutoModelForSequenceClassification,
AutoModelForTokenClassification,
AutoModelWithLMHead,
AutoModelForMultipleChoice,
AutoTokenizer,
get_linear_schedule_with_warmup,
)
from ..data import load_dataset
from ..data.examples import *
logger = logging.getLogger(__name__)
MODEL_MODES = {
'base': AutoModel,
'sequence-classification': AutoModelForSequenceClassification,
'question-answering': AutoModelForQuestionAnswering,
'pretraining': AutoModelForPreTraining,
'token-classification': AutoModelForTokenClassification,
'language-modeling': AutoModelWithLMHead,
'multiple-choice': AutoModelForMultipleChoice,
}
def get_model_class(model_type, mode):
    # model_type is currently unused; the task mode alone selects the auto class
    return MODEL_MODES[mode]
def set_seed(hparams):
    """Seed the python, numpy and torch RNGs for reproducibility."""
    random.seed(hparams['seed'])
np.random.seed(hparams['seed'])
torch.manual_seed(hparams['seed'])
if hparams['n_gpu'] > 0:
torch.cuda.manual_seed_all(hparams['seed'])
class BaseModule(pl.LightningModule):
"""
The base module has 4 components: config, tokenizer, transformer model,
and dataset
Loading of a dataset:
1. Load instances of a dataset in the form of `Examples`
2. Convert all examples into features - may require tokenizer
3. Create a tensor dataset and loader given all the converted features
"""
def __init__(self, hparams):
super().__init__()
hparams['mode'] = self.mode
hparams['output_mode'] = self.output_mode
hparams['example_type'] = self.example_type
hparams['dev_lang'] = hparams['train_lang']
self.hparams = hparams # must come after super
self.dataset = load_dataset(hparams['dataset'], hparams['data_dir'])
if self.output_mode == 'classification':
self.labels = self.dataset.get_labels(hparams['train_lang'])
# setup config object
config_name = hparams['config_name'] or hparams['model_name_or_path']
args = {}
if self.output_mode == 'classification':
hparams['num_labels'] = len(self.dataset.get_labels(hparams['train_lang']))
args = {'num_labels': hparams['num_labels']}
self.config = AutoConfig.from_pretrained(
config_name,
**args,
cache_dir=hparams['cache_dir']
)
# setup tokenizer object
tok_name = hparams['tokenizer_name'] or hparams['model_name_or_path']
self.tokenizer = AutoTokenizer.from_pretrained(
tok_name,
config=self.config,
cache_dir=hparams['cache_dir'],
)
# setup transformer model
model_class = get_model_class(self.config.model_type, hparams['mode'])
self.model = model_class.from_pretrained(
hparams['model_name_or_path'],
config=self.config,
cache_dir=hparams['cache_dir'],
)
def forward(self, **inputs):
return self.model(**inputs)
def prepare_data(self):
"""Cache feature files on disk for every mode at the onset"""
modes = self.dataset.modes()
for mode in modes:
cached_features_file = self._feature_file(mode)
if not os.path.exists(cached_features_file)\
or self.hparams['overwrite_cache']:
self.load_features(mode)
def load_features(self, mode):
"""Load examples and convert them into features"""
if mode in ('train', 'dev', 'test'):
lang = self.hparams['{}_lang'.format(mode)]
else:
lang = self.hparams['test_lang']
examples = self.dataset.get_examples(lang, mode)
cached_features_file = self._feature_file(mode)
if os.path.exists(cached_features_file)\
and not self.hparams['overwrite_cache']:
features = torch.load(cached_features_file)
else:
features = self.convert_examples_to_features(examples)
torch.save(features, cached_features_file)
return features
def convert_examples_to_features(self, examples):
if self.hparams['example_type'] == 'multiple-choice':
features = convert_multiple_choice_examples_to_features(
examples,
self.tokenizer,
max_length=self.hparams['max_seq_length'],
label_list=self.labels
)
elif self.hparams['example_type'] == 'text':
features = convert_text_examples_to_features(
examples,
self.tokenizer,
max_length=self.hparams['max_seq_length'],
label_list=self.labels,
output_mode=self.output_mode,
)
elif self.hparams['example_type'] == 'tokens':
features = convert_tokens_examples_to_features(
examples,
self.labels,
self.hparams['max_seq_length'],
self.tokenizer,
cls_token_at_end=bool(self.config.model_type in ["xlnet"]),
cls_token=self.tokenizer.cls_token,
cls_token_segment_id=2 if self.config.model_type in ["xlnet"] else 0,
sep_token=self.tokenizer.sep_token,
sep_token_extra=bool(self.config.model_type in ["roberta"]),
pad_on_left=bool(self.config.model_type in ["xlnet"]),
pad_token=self.tokenizer.pad_token_id,
pad_token_segment_id=self.tokenizer.pad_token_type_id,
pad_token_label_id=self.pad_token_label_id,
)
        else:
            raise ValueError(
                'Unsupported example_type: {}'.format(self.hparams['example_type'])
            )
        return features
def make_loader(self, features, batch_size):
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
        # token_type_ids may be None for models without segment embeddings; fall back to 0
        all_token_type_ids = torch.tensor([f.token_type_ids or 0 for f in features], dtype=torch.long)
# all_candidates = torch.tensor([f.candidates for f in features], dtype=torch.long)
if self.hparams['output_mode'] == 'classification':
all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
        elif self.hparams['output_mode'] == 'regression':
            all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
        else:
            raise ValueError(
                'Unsupported output_mode: {}'.format(self.hparams['output_mode'])
            )
return DataLoader(
TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels),
# TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels, all_candidates),
batch_size=batch_size,
)
def train_dataloader(self):
train_batch_size = self.hparams['train_batch_size']
train_features = self.load_features('train')
dataloader = self.make_loader(train_features, train_batch_size)
t_total = (
(len(dataloader.dataset) // (train_batch_size * max(1, self.hparams['n_gpu'])))
// self.hparams['gradient_accumulation_steps']
* float(self.hparams['num_train_epochs'])
)
scheduler = get_linear_schedule_with_warmup(
self.opt, num_warmup_steps=self.hparams['warmup_steps'], num_training_steps=t_total
)
self.lr_scheduler = scheduler
return dataloader
def val_dataloader(self):
dev_features = self.load_features('dev')
dataloader = self.make_loader(dev_features, self.hparams['eval_batch_size'])
return dataloader
def test_dataloader(self):
test_features = self.load_features('test')
dataloader = self.make_loader(test_features, self.hparams['eval_batch_size'])
return dataloader
def training_step(self, batch, batch_idx):
inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3]}
if self.config.model_type != 'distilbert':
inputs['token_type_ids'] = (
batch[2] if self.config.model_type in ['bert', 'xlnet', 'albert'] else None
) # XLM and RoBERTa don't use token_type_ids
outputs = self(**inputs)
loss = outputs[0]
tensorboard_logs = {'loss': loss, 'rate': self.lr_scheduler.get_last_lr()[-1]}
return {'loss': loss, 'log': tensorboard_logs}
def validation_step(self, batch, batch_nb):
inputs = {'input_ids': batch[0],
'attention_mask': batch[1],
'labels': batch[3]}
# XLM and RoBERTa don't use token_type_ids
inputs['token_type_ids'] = None
if self.config.model_type in ['bert', 'xlnet', 'albert']:
inputs['token_type_ids'] = batch[2]
outputs = self(**inputs)
tmp_eval_loss, logits = outputs[:2]
preds = logits.detach().cpu().numpy()
out_label_ids = inputs['labels'].detach().cpu().numpy()
return {'val_loss': tmp_eval_loss.detach().cpu(),
'pred': preds,
'target': out_label_ids}
def test_step(self, batch, batch_nb):
return self.validation_step(batch, batch_nb)
def _feature_file(self, mode):
if mode in ('train', 'dev', 'test'):
lang = self.hparams['{}_lang'.format(mode)]
else:
lang = self.hparams['test_lang']
return os.path.join(
self.hparams['data_dir'],
'cached_{}_{}_{}_{}'.format(
lang,
mode,
list(filter(None, self.hparams['model_name_or_path'].split('/'))).pop(),
str(self.hparams['max_seq_length']),
),
)
def is_logger(self):
return self.trainer.global_rank <= 0
def configure_optimizers(self):
"""Prepare optimizer and schedule (linear warmup and decay)"""
model = self.model
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{
'params': [p for n, p in model.named_parameters()
if not any(nd in n for nd in no_decay)],
'weight_decay': self.hparams['weight_decay'],
},
{
'params': [p for n, p in model.named_parameters()
if any(nd in n for nd in no_decay)],
'weight_decay': 0.0,
},
]
optimizer = AdamW(optimizer_grouped_parameters,
lr=self.hparams['learning_rate'],
eps=self.hparams['adam_epsilon'])
self.opt = optimizer
return [optimizer]
def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx,
second_order_closure=None):
if self.trainer.use_tpu:
import torch_xla.core.xla_model as xm
xm.optimizer_step(optimizer)
else:
optimizer.step()
optimizer.zero_grad()
self.lr_scheduler.step()
def get_tqdm_dict(self):
avg_loss = getattr(self.trainer, 'avg_loss', 0.0)
tqdm_dict = {'loss': '{:.3f}'.format(avg_loss), 'lr': self.lr_scheduler.get_last_lr()[-1]}
return tqdm_dict
    def run_module(self):
        """Train (if requested), reload the latest checkpoint, and optionally run prediction."""
        trainer = create_trainer(self, self.hparams)
hparams_copy = copy.deepcopy(self.hparams)
if self.hparams['do_train']:
checkpoints = list(sorted(glob.glob(os.path.join(self.hparams['output_dir'], 'checkpointepoch=*.ckpt'), recursive=True)))
if len(checkpoints) == 0:
trainer.fit(self)
checkpoints = list(sorted(glob.glob(os.path.join(self.hparams['output_dir'], 'checkpointepoch=*.ckpt'), recursive=True)))
self.trained_model = self.load_from_checkpoint(checkpoints[-1])
self.trained_model.hparams = hparams_copy
        # Optionally, evaluate the trained model on the test set and write results to output_dir
if self.hparams['do_predict']:
trainer.test(self.trained_model)
# Fixes __temp_weight_ddp_end.ckpt bug
# See https://github.com/PyTorchLightning/pytorch-lightning/issues/1142
class MonkeyPatchedTrainer(pl.Trainer):
def load_spawn_weights(self, original_model):
pass
pl.Trainer = MonkeyPatchedTrainer
class LoggingCallback(pl.Callback):
def on_validation_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
logger.info("***** Validation results *****")
if pl_module.is_logger():
metrics = trainer.callback_metrics
# Log results
for key in sorted(metrics):
if key not in ["log", "progress_bar"]:
logger.info("{} = {}\n".format(key, str(metrics[key])))
def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
logger.info("***** Test results *****")
print(trainer.callback_metrics)
if pl_module.is_logger():
metrics = trainer.callback_metrics
# Log and save results to file
output_dir = pl_module.hparams['output_dir']
test_lang = pl_module.hparams['test_lang']
output_test_results_file = os.path.join(output_dir, 'test_results_{}.txt'.format(test_lang))
with open(output_test_results_file, "w") as writer:
for key in sorted(metrics):
if key not in ["log", "progress_bar"]:
logger.info("{} = {}\n".format(key, str(metrics[key])))
writer.write("{} = {}\n".format(key, str(metrics[key])))
def create_trainer(model, hparams):
    """Build a pl.Trainer configured from hparams (checkpointing, logging, fp16, TPU/GPU)."""
    set_seed(hparams)
# if os.path.exists(hparams['output_dir']) and os.listdir(hparams['output_dir']) and hparams['do_train']:
# raise ValueError('Output directory ({}) already exists and is not empty.'.format(hparams['output_dir']))
checkpoint_callback = pl.callbacks.ModelCheckpoint(
filepath=hparams['output_dir'], prefix='checkpoint', monitor='val_loss', mode='min', save_top_k=5
)
train_params = dict(
accumulate_grad_batches=hparams['gradient_accumulation_steps'],
gpus=hparams['n_gpu'],
max_epochs=hparams['num_train_epochs'],
early_stop_callback=False,
gradient_clip_val=hparams['max_grad_norm'],
checkpoint_callback=checkpoint_callback,
callbacks=[LoggingCallback()],
)
if hparams['fp16']:
train_params['use_amp'] = hparams['fp16']
train_params['amp_level'] = hparams['fp16_opt_level']
if hparams['n_tpu_cores'] > 0:
train_params['tpu_cores'] = hparams['n_tpu_cores']
train_params['gpus'] = 0
if hparams['n_gpu'] > 1:
train_params['distributed_backend'] = 'ddp'
trainer = pl.Trainer(**train_params)
return trainer
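# End-to-end usage sketch (hypothetical dataset name and hparams values; kept as
# comments so this module stays import-only). The keys mirror those read by
# BaseModule, run_module and create_trainer above:
#
#     class TextClassificationModule(BaseModule):
#         mode = 'sequence-classification'
#         output_mode = 'classification'
#         example_type = 'text'
#
#     hparams = {
#         'seed': 42, 'n_gpu': 1, 'n_tpu_cores': 0, 'fp16': False,
#         'dataset': 'my_dataset', 'data_dir': './data', 'output_dir': './out',
#         'model_name_or_path': 'bert-base-multilingual-cased',
#         'config_name': '', 'tokenizer_name': '', 'cache_dir': None,
#         'train_lang': 'en', 'test_lang': 'en', 'max_seq_length': 128,
#         'train_batch_size': 32, 'eval_batch_size': 32, 'learning_rate': 5e-5,
#         'weight_decay': 0.0, 'adam_epsilon': 1e-8, 'warmup_steps': 0,
#         'num_train_epochs': 3, 'gradient_accumulation_steps': 1,
#         'max_grad_norm': 1.0, 'overwrite_cache': False,
#         'do_train': True, 'do_predict': True,
#     }
#     TextClassificationModule(hparams).run_module()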