import argparse
import copy
import glob
import logging
import os
import random
import re

import numpy as np
import pytorch_lightning as pl
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from torch.utils.data import DataLoader, TensorDataset
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoModelForMultipleChoice,
    AutoModelForPreTraining,
    AutoModelForQuestionAnswering,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoModelWithLMHead,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)

from ..data import load_dataset
from ..data.examples import *

logger = logging.getLogger(__name__)

MODEL_MODES = {
    'base': AutoModel,
    'sequence-classification': AutoModelForSequenceClassification,
    'question-answering': AutoModelForQuestionAnswering,
    'pretraining': AutoModelForPreTraining,
    'token-classification': AutoModelForTokenClassification,
    'language-modeling': AutoModelWithLMHead,
    'multiple-choice': AutoModelForMultipleChoice,
}


def get_model_class(model_type, mode):
    # `model_type` is accepted for interface symmetry but is currently unused:
    # the Auto* classes already dispatch on the checkpoint's model type.
    return MODEL_MODES[mode]

def set_seed(hparams):
    random.seed(hparams['seed'])
    np.random.seed(hparams['seed'])
    torch.manual_seed(hparams['seed'])
    if hparams['n_gpu'] > 0:
        torch.cuda.manual_seed_all(hparams['seed'])

class BaseModule(pl.LightningModule):
    """
    The base module has four components: config, tokenizer, transformer
    model, and dataset.

    Loading a dataset proceeds in three steps:
    1. Load instances of the dataset in the form of `Examples`.
    2. Convert all examples into features (this may require the tokenizer).
    3. Create a tensor dataset and loader from the converted features.
    """

    def __init__(self, hparams):
        super().__init__()
        hparams['mode'] = self.mode
        hparams['output_mode'] = self.output_mode
        hparams['example_type'] = self.example_type
        hparams['dev_lang'] = hparams['train_lang']
        self.hparams = hparams  # must be assigned after super().__init__()

        self.dataset = load_dataset(hparams['dataset'], hparams['data_dir'])

        # set up the config object
        config_name = hparams['config_name'] or hparams['model_name_or_path']
        args = {}
        if self.output_mode == 'classification':
            self.labels = self.dataset.get_labels(hparams['train_lang'])
            hparams['num_labels'] = len(self.labels)
            args = {'num_labels': hparams['num_labels']}
        self.config = AutoConfig.from_pretrained(
            config_name,
            **args,
            cache_dir=hparams['cache_dir'],
        )

        # set up the tokenizer object
        tok_name = hparams['tokenizer_name'] or hparams['model_name_or_path']
        self.tokenizer = AutoTokenizer.from_pretrained(
            tok_name,
            config=self.config,
            cache_dir=hparams['cache_dir'],
        )

        # set up the transformer model
        model_class = get_model_class(self.config.model_type, hparams['mode'])
        self.model = model_class.from_pretrained(
            hparams['model_name_or_path'],
            config=self.config,
            cache_dir=hparams['cache_dir'],
        )

    def forward(self, **inputs):
        return self.model(**inputs)
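
    # Note: the training/validation steps below index model outputs
    # positionally (loss first, then logits), which assumes a transformers
    # version whose models return plain tuples from forward.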

    def prepare_data(self):
        """Cache feature files on disk for every mode at the onset."""
        modes = self.dataset.modes()
        for mode in modes:
            cached_features_file = self._feature_file(mode)
            if (not os.path.exists(cached_features_file)
                    or self.hparams['overwrite_cache']):
                self.load_features(mode)

    def load_features(self, mode):
        """Load examples and convert them into features."""
        if mode in ('train', 'dev', 'test'):
            lang = self.hparams['{}_lang'.format(mode)]
        else:
            lang = self.hparams['test_lang']
        examples = self.dataset.get_examples(lang, mode)
        cached_features_file = self._feature_file(mode)
        if (os.path.exists(cached_features_file)
                and not self.hparams['overwrite_cache']):
            features = torch.load(cached_features_file)
        else:
            features = self.convert_examples_to_features(examples)
            torch.save(features, cached_features_file)
        return features

    def convert_examples_to_features(self, examples):
        if self.hparams['example_type'] == 'multiple-choice':
            features = convert_multiple_choice_examples_to_features(
                examples,
                self.tokenizer,
                max_length=self.hparams['max_seq_length'],
                label_list=self.labels,
            )
        elif self.hparams['example_type'] == 'text':
            features = convert_text_examples_to_features(
                examples,
                self.tokenizer,
                max_length=self.hparams['max_seq_length'],
                label_list=self.labels,
                output_mode=self.output_mode,
            )
        elif self.hparams['example_type'] == 'tokens':
            features = convert_tokens_examples_to_features(
                examples,
                self.labels,
                self.hparams['max_seq_length'],
                self.tokenizer,
                cls_token_at_end=bool(self.config.model_type in ['xlnet']),  # xlnet puts the cls token at the end
                cls_token=self.tokenizer.cls_token,
                cls_token_segment_id=2 if self.config.model_type in ['xlnet'] else 0,
                sep_token=self.tokenizer.sep_token,
                sep_token_extra=bool(self.config.model_type in ['roberta']),  # roberta uses an extra separator
                pad_on_left=bool(self.config.model_type in ['xlnet']),  # pad on the left for xlnet
                pad_token=self.tokenizer.pad_token_id,
                pad_token_segment_id=self.tokenizer.pad_token_type_id,
                pad_token_label_id=self.pad_token_label_id,
            )
        else:
            raise ValueError(
                'Unsupported example type: {}'.format(self.hparams['example_type'])
            )
        return features

    def make_loader(self, features, batch_size):
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
        # Some tokenizers do not produce token_type_ids; fall back to zeros so
        # that every example contributes a row of the same length.
        all_token_type_ids = torch.tensor(
            [f.token_type_ids if f.token_type_ids else [0] * self.hparams['max_seq_length']
             for f in features],
            dtype=torch.long,
        )
        # all_candidates = torch.tensor([f.candidates for f in features], dtype=torch.long)
        if self.hparams['output_mode'] == 'classification':
            all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
        elif self.hparams['output_mode'] == 'regression':
            all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
        else:
            raise ValueError('Unsupported output mode: {}'.format(self.hparams['output_mode']))
        return DataLoader(
            TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels),
            # TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels, all_candidates),
            batch_size=batch_size,
        )
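
    # Loader batches are unpacked positionally downstream: batch[0] is
    # input_ids, batch[1] attention_mask, batch[2] token_type_ids, and
    # batch[3] labels.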

    def train_dataloader(self):
        train_batch_size = self.hparams['train_batch_size']
        train_features = self.load_features('train')
        dataloader = self.make_loader(train_features, train_batch_size)

        # Total number of optimizer steps, used to size the linear schedule.
        t_total = (
            (len(dataloader.dataset) // (train_batch_size * max(1, self.hparams['n_gpu'])))
            // self.hparams['gradient_accumulation_steps']
            * float(self.hparams['num_train_epochs'])
        )
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparams['warmup_steps'], num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        dev_features = self.load_features('dev')
        return self.make_loader(dev_features, self.hparams['eval_batch_size'])

    def test_dataloader(self):
        test_features = self.load_features('test')
        return self.make_loader(test_features, self.hparams['eval_batch_size'])

    def training_step(self, batch, batch_idx):
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3]}
        if self.config.model_type != 'distilbert':
            inputs['token_type_ids'] = (
                batch[2] if self.config.model_type in ['bert', 'xlnet', 'albert'] else None
            )  # XLM and RoBERTa don't use token_type_ids
        outputs = self(**inputs)
        loss = outputs[0]
        tensorboard_logs = {'loss': loss, 'rate': self.lr_scheduler.get_last_lr()[-1]}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_nb):
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[3],
        }
        # XLM and RoBERTa don't use token_type_ids
        inputs['token_type_ids'] = None
        if self.config.model_type in ['bert', 'xlnet', 'albert']:
            inputs['token_type_ids'] = batch[2]
        outputs = self(**inputs)
        tmp_eval_loss, logits = outputs[:2]
        preds = logits.detach().cpu().numpy()
        out_label_ids = inputs['labels'].detach().cpu().numpy()
        return {
            'val_loss': tmp_eval_loss.detach().cpu(),
            'pred': preds,
            'target': out_label_ids,
        }

    def test_step(self, batch, batch_nb):
        return self.validation_step(batch, batch_nb)

    def _feature_file(self, mode):
        if mode in ('train', 'dev', 'test'):
            lang = self.hparams['{}_lang'.format(mode)]
        else:
            lang = self.hparams['test_lang']
        return os.path.join(
            self.hparams['data_dir'],
            'cached_{}_{}_{}_{}'.format(
                lang,
                mode,
                list(filter(None, self.hparams['model_name_or_path'].split('/'))).pop(),
                str(self.hparams['max_seq_length']),
            ),
        )
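
    # For example, with a hypothetical configuration of train_lang='en',
    # model_name_or_path='bert-base-multilingual-cased', and
    # max_seq_length=128, the train cache file would be:
    #   <data_dir>/cached_en_train_bert-base-multilingual-cased_128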

    def is_logger(self):
        return self.trainer.global_rank <= 0

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)."""
        model = self.model
        # Standard practice: no weight decay on biases and LayerNorm weights.
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in model.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': self.hparams['weight_decay'],
            },
            {
                'params': [p for n, p in model.named_parameters()
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.hparams['learning_rate'],
                          eps=self.hparams['adam_epsilon'])
        self.opt = optimizer
        return [optimizer]

    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx,
                       second_order_closure=None):
        if self.trainer.use_tpu:
            import torch_xla.core.xla_model as xm
            xm.optimizer_step(optimizer)
        else:
            optimizer.step()
        optimizer.zero_grad()
        self.lr_scheduler.step()

    def get_tqdm_dict(self):
        avg_loss = getattr(self.trainer, 'avg_loss', 0.0)
        tqdm_dict = {'loss': '{:.3f}'.format(avg_loss), 'lr': self.lr_scheduler.get_last_lr()[-1]}
        return tqdm_dict

    def _sorted_checkpoints(self):
        checkpoints = glob.glob(
            os.path.join(self.hparams['output_dir'], 'checkpointepoch=*.ckpt')
        )
        # Sort numerically by epoch: a plain lexicographic sort would place
        # 'epoch=10' before 'epoch=2'.
        return sorted(
            checkpoints,
            key=lambda path: int(re.search(r'epoch=(\d+)', path).group(1)),
        )

    def run_module(self):
        trainer = create_trainer(self, self.hparams)
        hparams_copy = copy.deepcopy(self.hparams)

        if self.hparams['do_train']:
            checkpoints = self._sorted_checkpoints()
            if len(checkpoints) == 0:
                trainer.fit(self)
                checkpoints = self._sorted_checkpoints()
            self.trained_model = self.load_from_checkpoint(checkpoints[-1])
            self.trained_model.hparams = hparams_copy

        # Optionally, predict on the dev set and write to output_dir
        if self.hparams['do_predict']:
            trainer.test(self.trained_model)

# Fixes the __temp_weight_ddp_end.ckpt bug.
# See https://github.com/PyTorchLightning/pytorch-lightning/issues/1142
class MonkeyPatchedTrainer(pl.Trainer):
    def load_spawn_weights(self, original_model):
        pass


pl.Trainer = MonkeyPatchedTrainer

class LoggingCallback(pl.Callback):
    def on_validation_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        logger.info('***** Validation results *****')
        if pl_module.is_logger():
            metrics = trainer.callback_metrics
            # Log results
            for key in sorted(metrics):
                if key not in ['log', 'progress_bar']:
                    logger.info('{} = {}\n'.format(key, str(metrics[key])))

    def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        logger.info('***** Test results *****')
        if pl_module.is_logger():
            metrics = trainer.callback_metrics
            # Log results and save them to a file in output_dir
            output_dir = pl_module.hparams['output_dir']
            test_lang = pl_module.hparams['test_lang']
            output_test_results_file = os.path.join(output_dir, 'test_results_{}.txt'.format(test_lang))
            with open(output_test_results_file, 'w') as writer:
                for key in sorted(metrics):
                    if key not in ['log', 'progress_bar']:
                        logger.info('{} = {}\n'.format(key, str(metrics[key])))
                        writer.write('{} = {}\n'.format(key, str(metrics[key])))

def create_trainer(model, hparams):
    # init model
    set_seed(hparams)

    # if os.path.exists(hparams['output_dir']) and os.listdir(hparams['output_dir']) and hparams['do_train']:
    #     raise ValueError('Output directory ({}) already exists and is not empty.'.format(hparams['output_dir']))

    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        filepath=hparams['output_dir'], prefix='checkpoint', monitor='val_loss', mode='min', save_top_k=5
    )
    train_params = dict(
        accumulate_grad_batches=hparams['gradient_accumulation_steps'],
        gpus=hparams['n_gpu'],
        max_epochs=hparams['num_train_epochs'],
        early_stop_callback=False,
        gradient_clip_val=hparams['max_grad_norm'],
        checkpoint_callback=checkpoint_callback,
        callbacks=[LoggingCallback()],
    )
    if hparams['fp16']:
        train_params['use_amp'] = hparams['fp16']
        train_params['amp_level'] = hparams['fp16_opt_level']
    if hparams['n_tpu_cores'] > 0:
        train_params['tpu_cores'] = hparams['n_tpu_cores']
        train_params['gpus'] = 0
    if hparams['n_gpu'] > 1:
        train_params['distributed_backend'] = 'ddp'
    trainer = pl.Trainer(**train_params)
    return trainer
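
# A minimal end-to-end usage sketch (hypothetical subclass and hparams
# values; the real task modules and full hparams construction live elsewhere
# in this package):
#
#   class XnliModule(BaseModule):
#       mode = 'sequence-classification'
#       output_mode = 'classification'
#       example_type = 'text'
#
#   hparams = {
#       'dataset': 'xnli', 'data_dir': './data', 'output_dir': './out',
#       'model_name_or_path': 'bert-base-multilingual-cased',
#       'config_name': '', 'tokenizer_name': '', 'cache_dir': None,
#       'train_lang': 'en', 'test_lang': 'de', 'seed': 42,
#       'n_gpu': 1, 'n_tpu_cores': 0, 'max_seq_length': 128,
#       'train_batch_size': 32, 'eval_batch_size': 32,
#       'num_train_epochs': 3, 'gradient_accumulation_steps': 1,
#       'warmup_steps': 0, 'learning_rate': 2e-5, 'adam_epsilon': 1e-8,
#       'weight_decay': 0.0, 'max_grad_norm': 1.0, 'fp16': False,
#       'overwrite_cache': False, 'do_train': True, 'do_predict': True,
#   }
#
#   XnliModule(hparams).run_module()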