# NMT-LaVi/modules/prototypes.py
import torch.nn as nn
from torchtext import data
import copy
import layers
class Embedder(nn.Module):
def __init__(self, vocab_size, d_model):
super().__init__()
self.vocab_size = vocab_size
self.d_model = d_model
self.embed = nn.Embedding(vocab_size, d_model)
def forward(self, x):
return self.embed(x)
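# Minimal usage sketch for Embedder, kept as a comment so importing the module is
# unaffected; the vocabulary size, batch size and sequence length are illustrative
# assumptions (requires `import torch`):
#
#   embed = Embedder(vocab_size=1000, d_model=512)
#   tokens = torch.randint(0, 1000, (2, 10))   # [batch_size, seq_len]
#   vectors = embed(tokens)                    # [batch_size, seq_len, d_model] = [2, 10, 512]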
class EncoderLayer(nn.Module):
def __init__(self, d_model, heads, dropout=0.1):
"""An layer of the encoder. Contain a self-attention accepting padding mask
Args:
d_model: the inner dimension size of the layer
heads: number of heads used in the attention
dropout: applied dropout value during training
"""
super().__init__()
self.norm_1 = layers.Norm(d_model)
self.norm_2 = layers.Norm(d_model)
self.attn = layers.MultiHeadAttention(heads, d_model, dropout=dropout)
self.ff = layers.FeedForward(d_model, dropout=dropout)
self.dropout_1 = nn.Dropout(dropout)
self.dropout_2 = nn.Dropout(dropout)
def forward(self, x, src_mask):
"""Run the encoding layer
Args:
x: the input (either embedding values or previous layer output), should be in shape [batch_size, src_len, d_model]
src_mask: the padding mask, should be [batch_size, 1, src_len]
Return:
            an output that has the same shape as the input, [batch_size, src_len, d_model]
            the self-attention weights used, [batch_size, heads, src_len, src_len]
"""
x2 = self.norm_1(x)
# Self attention only
x_sa, sa = self.attn(x2, x2, x2, src_mask)
x = x + self.dropout_1(x_sa)
x2 = self.norm_2(x)
x = x + self.dropout_2(self.ff(x2))
return x, sa
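# Illustrative (commented) call of a single EncoderLayer; shapes follow the docstring
# above, and the boolean padding-mask layout is an assumption about the mask convention
# expected by the sibling `layers.MultiHeadAttention`:
#
#   import torch
#   layer = EncoderLayer(d_model=512, heads=8, dropout=0.1)
#   x = torch.randn(2, 10, 512)                        # [batch_size, src_len, d_model]
#   src_mask = torch.ones(2, 1, 10, dtype=torch.bool)  # [batch_size, 1, src_len]
#   out, self_attn = layer(x, src_mask)                # out keeps the shape [2, 10, 512]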
class DecoderLayer(nn.Module):
def __init__(self, d_model, heads, dropout=0.1):
"""An layer of the decoder. Contain a self-attention that accept no-peeking mask and a normal attention tha t accept padding mask
Args:
d_model: the inner dimension size of the layer
heads: number of heads used in the attention
dropout: applied dropout value during training
"""
super().__init__()
self.norm_1 = layers.Norm(d_model)
self.norm_2 = layers.Norm(d_model)
self.norm_3 = layers.Norm(d_model)
self.dropout_1 = nn.Dropout(dropout)
self.dropout_2 = nn.Dropout(dropout)
self.dropout_3 = nn.Dropout(dropout)
self.attn_1 = layers.MultiHeadAttention(heads, d_model, dropout=dropout)
self.attn_2 = layers.MultiHeadAttention(heads, d_model, dropout=dropout)
self.ff = layers.FeedForward(d_model, dropout=dropout)
def forward(self, x, memory, src_mask, trg_mask):
"""Run the decoding layer
Args:
x: the input (either embedding values or previous layer output), should be in shape [batch_size, tgt_len, d_model]
memory: the outputs of the encoding section, used for normal attention. [batch_size, src_len, d_model]
src_mask: the padding mask for the memory, [batch_size, 1, src_len]
            trg_mask: the no-peeking mask for the decoder, [batch_size, tgt_len, tgt_len]
        Return:
            an output that has the same shape as the input, [batch_size, tgt_len, d_model]
            the self-attention and normal attention weights used, [batch_size, heads, tgt_len, tgt_len] & [batch_size, heads, tgt_len, src_len]
"""
x2 = self.norm_1(x)
# Self-attention
x_sa, sa = self.attn_1(x2, x2, x2, trg_mask)
x = x + self.dropout_1(x_sa)
x2 = self.norm_2(x)
# Normal multi-head attention
x_na, na = self.attn_2(x2, memory, memory, src_mask)
x = x + self.dropout_2(x_na)
x2 = self.norm_3(x)
x = x + self.dropout_3(self.ff(x2))
return x, (sa, na)
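# Illustrative (commented) call of a single DecoderLayer; the lower-triangular
# no-peeking mask built with torch.tril is a common construction and, like the
# boolean dtype, an assumption about the convention used by `layers.MultiHeadAttention`:
#
#   import torch
#   layer = DecoderLayer(d_model=512, heads=8, dropout=0.1)
#   x = torch.randn(2, 7, 512)                          # [batch_size, tgt_len, d_model]
#   memory = torch.randn(2, 10, 512)                    # encoder output [batch_size, src_len, d_model]
#   src_mask = torch.ones(2, 1, 10, dtype=torch.bool)   # [batch_size, 1, src_len]
#   trg_mask = torch.tril(torch.ones(2, 7, 7)).bool()   # [batch_size, tgt_len, tgt_len]
#   out, (self_attn, normal_attn) = layer(x, memory, src_mask, trg_mask)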
def get_clones(module, N, keep_module=True):
    """Produce a ModuleList of N identical layers, optionally reusing `module` itself as the first entry."""
    if(keep_module and N >= 1):
        # reuse the original module and create N-1 deep copies in addition to it
        return nn.ModuleList([module] + [copy.deepcopy(module) for i in range(N-1)])
    else:
        # create N new copies
        return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
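# Note on keep_module: with keep_module=True the list holds the original `module`
# followed by N-1 deep copies, with keep_module=False it holds N fresh deep copies;
# either way the N entries have independent (non-shared) parameters. A quick
# commented check, for illustration only:
#
#   stack = get_clones(nn.Linear(4, 4), 3)
#   assert len(stack) == 3
#   assert stack[0].weight.data_ptr() != stack[1].weight.data_ptr()  # no parameter sharing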
class Encoder(nn.Module):
"""A wrapper that embed, positional encode, and self-attention encode the inputs.
Args:
vocab_size: the size of the vocab. Used for embedding
d_model: the inner dim of the module
N: number of layers used
heads: number of heads used in the attention
dropout: applied dropout value during training
        max_seq_length: the maximum sequence length handled by this encoder. Needed by PositionalEncoder due to caching
"""
def __init__(self, vocab_size, d_model, N, heads, dropout, max_seq_length=200):
super().__init__()
self.N = N
self.embed = nn.Embedding(vocab_size, d_model)
self.pe = layers.PositionalEncoder(d_model, dropout=dropout, max_seq_length=max_seq_length)
self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
self.norm = layers.Norm(d_model)
self._max_seq_length = max_seq_length
def forward(self, src, src_mask, output_attention=False, seq_length_check=False):
"""Accepts a batch of indexed tokens, return the encoded values.
Args:
src: int Tensor of [batch_size, src_len]
src_mask: the padding mask, [batch_size, 1, src_len]
            output_attention: if set, also return a list containing the attention weights used
seq_length_check: if set, automatically trim the input if it goes past the expected sequence length.
Returns:
the encoded values [batch_size, src_len, d_model]
            if requested, a list of the N self-attention weights calculated, each in form [batch_size, heads, src_len, src_len]
"""
if(seq_length_check and src.shape[1] > self._max_seq_length):
src = src[:, :self._max_seq_length]
src_mask = src_mask[:, :, :self._max_seq_length]
x = self.embed(src)
x = self.pe(x)
attentions = [None] * self.N
for i in range(self.N):
x, attn = self.layers[i](x, src_mask)
attentions[i] = attn
x = self.norm(x)
return x if(not output_attention) else (x, attentions)
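# Commented end-to-end sketch of the Encoder; vocabulary size, batch size and lengths
# are illustrative assumptions, and the attention shapes follow the docstring above:
#
#   import torch
#   enc = Encoder(vocab_size=1000, d_model=512, N=6, heads=8, dropout=0.1, max_seq_length=200)
#   src = torch.randint(0, 1000, (2, 10))               # [batch_size, src_len]
#   src_mask = torch.ones(2, 1, 10, dtype=torch.bool)   # [batch_size, 1, src_len]
#   memory, attns = enc(src, src_mask, output_attention=True)
#   # memory: [2, 10, 512]; attns: list of 6 tensors, each [2, 8, 10, 10]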
class Decoder(nn.Module):
"""A wrapper that receive the encoder outputs, run through the decoder process for a determined input
Args:
vocab_size: the size of the vocab. Used for embedding
d_model: the inner dim of the module
N: number of layers used
heads: number of heads used in the attention
dropout: applied dropout value during training
        max_seq_length: the maximum sequence length handled by this decoder. Needed by PositionalEncoder due to caching
"""
def __init__(self, vocab_size, d_model, N, heads, dropout, max_seq_length=200):
super().__init__()
self.N = N
self.embed = nn.Embedding(vocab_size, d_model)
self.pe = layers.PositionalEncoder(d_model, dropout=dropout, max_seq_length=max_seq_length)
self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
self.norm = layers.Norm(d_model)
self._max_seq_length = max_seq_length
def forward(self, trg, memory, src_mask, trg_mask, output_attention=False):
"""Accepts a batch of indexed tokens and the encoding outputs, return the decoded values.
Args:
            trg: int Tensor of [batch_size, trg_len]
memory: output of Encoder [batch_size, src_len, d_model]
src_mask: the padding mask, [batch_size, 1, src_len]
trg_mask: the no-peeking mask, [batch_size, tgt_len, tgt_len]
            output_attention: if set, also return a list containing the attention weights used
Returns:
the decoded values [batch_size, tgt_len, d_model]
            if requested, a list of the N (self-attention, normal attention) pairs calculated, in form [batch_size, heads, tgt_len, tgt_len] & [batch_size, heads, tgt_len, src_len]
"""
x = self.embed(trg)
x = self.pe(x)
attentions = [None] * self.N
for i in range(self.N):
x, attn = self.layers[i](x, memory, src_mask, trg_mask)
attentions[i] = attn
x = self.norm(x)
return x if(not output_attention) else (x, attentions)
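# Commented sketch chaining Encoder and Decoder; the decoded values would normally be
# fed to a projection/generator layer that is not defined in this file. Sizes and the
# mask construction are illustrative assumptions consistent with the docstrings above:
#
#   import torch
#   enc = Encoder(vocab_size=1000, d_model=512, N=6, heads=8, dropout=0.1)
#   dec = Decoder(vocab_size=1200, d_model=512, N=6, heads=8, dropout=0.1)
#   src = torch.randint(0, 1000, (2, 10))
#   trg = torch.randint(0, 1200, (2, 7))
#   src_mask = torch.ones(2, 1, 10, dtype=torch.bool)   # [batch_size, 1, src_len]
#   trg_mask = torch.tril(torch.ones(2, 7, 7)).bool()   # [batch_size, tgt_len, tgt_len]
#   memory = enc(src, src_mask)                         # [2, 10, 512]
#   decoded = dec(trg, memory, src_mask, trg_mask)      # [2, 7, 512]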
class Config:
"""Deprecated"""
def __init__(self):
self.opt = {
'train_src_data':'/workspace/khoai23/opennmt/data/iwslt_en_vi/train.en',
'train_trg_data':'/workspace/khoai23/opennmt/data/iwslt_en_vi/train.vi',
'valid_src_data':'/workspace/khoai23/opennmt/data/iwslt_en_vi/tst2013.en',
'valid_trg_data':'/workspace/khoai23/opennmt/data/iwslt_en_vi/tst2013.vi',
            'src_lang':'en', # unused at the moment
            'trg_lang':'en', #'vi_spacy_model', # unused at the moment
'max_strlen':160,
'batchsize':1500,
'device':'cuda',
'd_model': 512,
'n_layers': 6,
'heads': 8,
'dropout': 0.1,
'lr':0.0001,
'epochs':30,
'printevery': 200,
'k':5,
}
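# The deprecated Config above only bundles hyperparameters; a hedged sketch of how its
# `opt` dict could map onto the modules in this file (src_vocab_size / trg_vocab_size
# are hypothetical placeholders, everything else comes from the dict):
#
#   opt = Config().opt
#   enc = Encoder(src_vocab_size, d_model=opt['d_model'], N=opt['n_layers'],
#                 heads=opt['heads'], dropout=opt['dropout'], max_seq_length=opt['max_strlen'])
#   dec = Decoder(trg_vocab_size, d_model=opt['d_model'], N=opt['n_layers'],
#                 heads=opt['heads'], dropout=opt['dropout'], max_seq_length=opt['max_strlen'])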