# NMT-LaVi/modules/prototypes.py
import torch.nn as nn
from torchtext import data
import copy
import layers
class Embedder(nn.Module):
def __init__(self, vocab_size, d_model):
super().__init__()
self.vocab_size = vocab_size
self.d_model = d_model
self.embed = nn.Embedding(vocab_size, d_model)
def forward(self, x):
return self.embed(x)
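# Minimal usage sketch for Embedder, kept as a comment so importing the module is
# unaffected; the vocabulary size, batch size and sequence length are illustrative
# assumptions (requires `import torch`):
#
#   embed = Embedder(vocab_size=1000, d_model=512)
#   tokens = torch.randint(0, 1000, (2, 10))   # [batch_size, seq_len]
#   vectors = embed(tokens)                    # [batch_size, seq_len, d_model] = [2, 10, 512]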
class EncoderLayer(nn.Module):
def __init__(self, d_model, heads, dropout=0.1):
"""An layer of the encoder. Contain a self-attention accepting padding mask
Args:
d_model: the inner dimension size of the layer
heads: number of heads used in the attention
dropout: applied dropout value during training
"""
super().__init__()
self.norm_1 = layers.Norm(d_model)
self.norm_2 = layers.Norm(d_model)
self.attn = layers.MultiHeadAttention(heads, d_model, dropout=dropout)
self.ff = layers.FeedForward(d_model, dropout=dropout)
self.dropout_1 = nn.Dropout(dropout)
self.dropout_2 = nn.Dropout(dropout)
def forward(self, x, src_mask):
"""Run the encoding layer
Args:
x: the input (either embedding values or previous layer output), should be in shape [batch_size, src_len, d_model]
src_mask: the padding mask, should be [batch_size, 1, src_len]
Return:
            an output that has the same shape as the input, [batch_size, src_len, d_model]
            the self-attention weights used, [batch_size, heads, src_len, src_len]
"""
x2 = self.norm_1(x)
# Self attention only
x_sa, sa = self.attn(x2, x2, x2, src_mask)
x = x + self.dropout_1(x_sa)
x2 = self.norm_2(x)
x = x + self.dropout_2(self.ff(x2))
return x, sa
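# Illustrative (commented) call of a single EncoderLayer; shapes follow the docstring
# above, and the boolean padding-mask layout is an assumption about the mask convention
# expected by the sibling `layers.MultiHeadAttention`:
#
#   import torch
#   layer = EncoderLayer(d_model=512, heads=8, dropout=0.1)
#   x = torch.randn(2, 10, 512)                        # [batch_size, src_len, d_model]
#   src_mask = torch.ones(2, 1, 10, dtype=torch.bool)  # [batch_size, 1, src_len]
#   out, self_attn = layer(x, src_mask)                # out keeps the shape [2, 10, 512]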
class DecoderLayer(nn.Module):
def __init__(self, d_model, heads, dropout=0.1):
"""An layer of the decoder. Contain a self-attention that accept no-peeking mask and a normal attention tha t accept padding mask
Args:
d_model: the inner dimension size of the layer
heads: number of heads used in the attention
dropout: applied dropout value during training
"""
super().__init__()
self.norm_1 = layers.Norm(d_model)
self.norm_2 = layers.Norm(d_model)
self.norm_3 = layers.Norm(d_model)
self.dropout_1 = nn.Dropout(dropout)
self.dropout_2 = nn.Dropout(dropout)
self.dropout_3 = nn.Dropout(dropout)
self.attn_1 = layers.MultiHeadAttention(heads, d_model, dropout=dropout)
self.attn_2 = layers.MultiHeadAttention(heads, d_model, dropout=dropout)
self.ff = layers.FeedForward(d_model, dropout=dropout)
def forward(self, x, memory, src_mask, trg_mask):
"""Run the decoding layer
Args:
x: the input (either embedding values or previous layer output), should be in shape [batch_size, tgt_len, d_model]
memory: the outputs of the encoding section, used for normal attention. [batch_size, src_len, d_model]
src_mask: the padding mask for the memory, [batch_size, 1, src_len]
            trg_mask: the no-peeking mask for the decoder, [batch_size, tgt_len, tgt_len]
        Return:
            an output that has the same shape as the input, [batch_size, tgt_len, d_model]
            the self-attention and normal attention weights used, [batch_size, heads, tgt_len, tgt_len] & [batch_size, heads, tgt_len, src_len]
"""
x2 = self.norm_1(x)
# Self-attention
x_sa, sa = self.attn_1(x2, x2, x2, trg_mask)
x = x + self.dropout_1(x_sa)
x2 = self.norm_2(x)
# Normal multi-head attention
x_na, na = self.attn_2(x2, memory, memory, src_mask)
x = x + self.dropout_2(x_na)
x2 = self.norm_3(x)
x = x + self.dropout_3(self.ff(x2))
return x, (sa, na)
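# Illustrative (commented) call of a single DecoderLayer; the lower-triangular
# no-peeking mask built with torch.tril is a common construction and, like the
# boolean dtype, an assumption about the convention used by `layers.MultiHeadAttention`:
#
#   import torch
#   layer = DecoderLayer(d_model=512, heads=8, dropout=0.1)
#   x = torch.randn(2, 7, 512)                          # [batch_size, tgt_len, d_model]
#   memory = torch.randn(2, 10, 512)                    # encoder output [batch_size, src_len, d_model]
#   src_mask = torch.ones(2, 1, 10, dtype=torch.bool)   # [batch_size, 1, src_len]
#   trg_mask = torch.tril(torch.ones(2, 7, 7)).bool()   # [batch_size, tgt_len, tgt_len]
#   out, (self_attn, normal_attn) = layer(x, memory, src_mask, trg_mask)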
def get_clones(module, N, keep_module=True):
    """Produce a ModuleList of N identical layers, optionally reusing `module` itself as the first entry."""
    if(keep_module and N >= 1):
        # reuse the original module and create N-1 deep copies in addition to it
        return nn.ModuleList([module] + [copy.deepcopy(module) for i in range(N-1)])
    else:
        # create N new copies
        return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
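# Note on keep_module: with keep_module=True the list holds the original `module`
# followed by N-1 deep copies, with keep_module=False it holds N fresh deep copies;
# either way the N entries have independent (non-shared) parameters. A quick
# commented check, for illustration only:
#
#   stack = get_clones(nn.Linear(4, 4), 3)
#   assert len(stack) == 3
#   assert stack[0].weight.data_ptr() != stack[1].weight.data_ptr()  # no parameter sharing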
class Encoder(nn.Module):
"""A wrapper that embed, positional encode, and self-attention encode the inputs.
Args:
vocab_size: the size of the vocab. Used for embedding
d_model: the inner dim of the module
N: number of layers used
heads: number of heads used in the attention
dropout: applied dropout value during training
        max_seq_length: the maximum sequence length handled by this encoder. Needed by PositionalEncoder due to caching
"""
def __init__(self, vocab_size, d_model, N, heads, dropout, max_seq_length=200):
super().__init__()
self.N = N
self.embed = nn.Embedding(vocab_size, d_model)
self.pe = layers.PositionalEncoder(d_model, dropout=dropout, max_seq_length=max_seq_length)
self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
self.norm = layers.Norm(d_model)
self._max_seq_length = max_seq_length
def forward(self, src, src_mask, output_attention=False, seq_length_check=False):
"""Accepts a batch of indexed tokens, return the encoded values.
Args:
src: int Tensor of [batch_size, src_len]
src_mask: the padding mask, [batch_size, 1, src_len]
            output_attention: if set, also return a list containing the attention weights used
seq_length_check: if set, automatically trim the input if it goes past the expected sequence length.
Returns:
the encoded values [batch_size, src_len, d_model]
            if requested, a list of the N self-attention weights calculated, each in form [batch_size, heads, src_len, src_len]
"""
if(seq_length_check and src.shape[1] > self._max_seq_length):
src = src[:, :self._max_seq_length]
src_mask = src_mask[:, :, :self._max_seq_length]
x = self.embed(src)
x = self.pe(x)
attentions = [None] * self.N
for i in range(self.N):
x, attn = self.layers[i](x, src_mask)
attentions[i] = attn
x = self.norm(x)
return x if(not output_attention) else (x, attentions)
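# Commented end-to-end sketch of the Encoder; vocabulary size, batch size and lengths
# are illustrative assumptions, and the attention shapes follow the docstring above:
#
#   import torch
#   enc = Encoder(vocab_size=1000, d_model=512, N=6, heads=8, dropout=0.1, max_seq_length=200)
#   src = torch.randint(0, 1000, (2, 10))               # [batch_size, src_len]
#   src_mask = torch.ones(2, 1, 10, dtype=torch.bool)   # [batch_size, 1, src_len]
#   memory, attns = enc(src, src_mask, output_attention=True)
#   # memory: [2, 10, 512]; attns: list of 6 tensors, each [2, 8, 10, 10]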
class Decoder(nn.Module):
"""A wrapper that receive the encoder outputs, run through the decoder process for a determined input
Args:
vocab_size: the size of the vocab. Used for embedding
d_model: the inner dim of the module
N: number of layers used
heads: number of heads used in the attention
dropout: applied dropout value during training
        max_seq_length: the maximum sequence length handled by this decoder. Needed by PositionalEncoder due to caching
"""
def __init__(self, vocab_size, d_model, N, heads, dropout, max_seq_length=200):
super().__init__()
self.N = N
self.embed = nn.Embedding(vocab_size, d_model)
self.pe = layers.PositionalEncoder(d_model, dropout=dropout, max_seq_length=max_seq_length)
self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
self.norm = layers.Norm(d_model)
self._max_seq_length = max_seq_length
def forward(self, trg, memory, src_mask, trg_mask, output_attention=False):
"""Accepts a batch of indexed tokens and the encoding outputs, return the decoded values.
Args:
            trg: int Tensor of [batch_size, trg_len]
memory: output of Encoder [batch_size, src_len, d_model]
src_mask: the padding mask, [batch_size, 1, src_len]
trg_mask: the no-peeking mask, [batch_size, tgt_len, tgt_len]
            output_attention: if set, also return a list containing the attention weights used
Returns:
the decoded values [batch_size, tgt_len, d_model]
            if requested, a list of the N (self-attention, normal attention) pairs calculated, in form [batch_size, heads, tgt_len, tgt_len] & [batch_size, heads, tgt_len, src_len]
"""
x = self.embed(trg)
x = self.pe(x)
attentions = [None] * self.N
for i in range(self.N):
x, attn = self.layers[i](x, memory, src_mask, trg_mask)
attentions[i] = attn
x = self.norm(x)
return x if(not output_attention) else (x, attentions)
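# Commented sketch chaining Encoder and Decoder; the decoded values would normally be
# fed to a projection/generator layer that is not defined in this file. Sizes and the
# mask construction are illustrative assumptions consistent with the docstrings above:
#
#   import torch
#   enc = Encoder(vocab_size=1000, d_model=512, N=6, heads=8, dropout=0.1)
#   dec = Decoder(vocab_size=1200, d_model=512, N=6, heads=8, dropout=0.1)
#   src = torch.randint(0, 1000, (2, 10))
#   trg = torch.randint(0, 1200, (2, 7))
#   src_mask = torch.ones(2, 1, 10, dtype=torch.bool)   # [batch_size, 1, src_len]
#   trg_mask = torch.tril(torch.ones(2, 7, 7)).bool()   # [batch_size, tgt_len, tgt_len]
#   memory = enc(src, src_mask)                         # [2, 10, 512]
#   decoded = dec(trg, memory, src_mask, trg_mask)      # [2, 7, 512]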
class Config:
"""Deprecated"""
def __init__(self):
self.opt = {
'train_src_data':'/workspace/khoai23/opennmt/data/iwslt_en_vi/train.en',
'train_trg_data':'/workspace/khoai23/opennmt/data/iwslt_en_vi/train.vi',
'valid_src_data':'/workspace/khoai23/opennmt/data/iwslt_en_vi/tst2013.en',
'valid_trg_data':'/workspace/khoai23/opennmt/data/iwslt_en_vi/tst2013.vi',
            'src_lang':'en', # unused at the moment
            'trg_lang':'en', #'vi_spacy_model', # unused at the moment
'max_strlen':160,
'batchsize':1500,
'device':'cuda',
'd_model': 512,
'n_layers': 6,
'heads': 8,
'dropout': 0.1,
'lr':0.0001,
'epochs':30,
'printevery': 200,
'k':5,
}
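# The deprecated Config above only bundles hyperparameters; a hedged sketch of how its
# `opt` dict could map onto the modules in this file (src_vocab_size / trg_vocab_size
# are hypothetical placeholders, everything else comes from the dict):
#
#   opt = Config().opt
#   enc = Encoder(src_vocab_size, d_model=opt['d_model'], N=opt['n_layers'],
#                 heads=opt['heads'], dropout=opt['dropout'], max_seq_length=opt['max_strlen'])
#   dec = Decoder(trg_vocab_size, d_model=opt['d_model'], N=opt['n_layers'],
#                 heads=opt['heads'], dropout=opt['dropout'], max_seq_length=opt['max_strlen'])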