|
|
|
import copy
import math
import os
import random
import time

import numpy as np
import spacy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import Tensor
|
|
|
class MultiHeadAttention(nn.Module): |
|
""" |
|
We can refer to the following blog to understand in depth about the transformer and MHA |
|
https://medium.com/@hunter-j-phillips/multi-head-attention-7924371d477a |
|
|
|
Here we are clubbing all the linear layers together and duplicating the inputs and |
|
then performing matrix multiplications |
|
""" |
|
def __init__(self, dk, dv, h, pdropout=0.1): |
|
""" |
|
Input Args: |
|
|
|
dk(int): Key dimensions used for generating Key weight matrix |
|
dv(int): Val dimensions used for generating val weight matrix |
|
h(int) : Number of heads in MHA |
|
""" |
|
        super().__init__()
        assert dk == dv, "This implementation assumes equal key and value dimensions"
        self.dk = dk
        self.dv = dv
        self.h = h
        self.dmodel = self.dk * self.h

        # Fused projections: one (dmodel, dmodel) linear layer per Q/K/V
        # instead of h separate (dmodel, dk) layers per head.
        self.WQ = nn.Linear(self.dmodel, self.dmodel)
        self.WK = nn.Linear(self.dmodel, self.dmodel)
        self.WV = nn.Linear(self.dmodel, self.dmodel)

        # Output projection applied after the heads are concatenated.
        self.WO = nn.Linear(self.h * self.dv, self.dmodel)
        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(p=pdropout)
|
|
|
def forward(self, query, key, val, mask=None): |
|
""" |
|
Forward pass for MHA |
|
|
|
X has a size of (batch_size, seq_length, d_model) |
|
Wq, Wk, and Wv have a size of (d_model, d_model) |
|
|
|
Perform Scaled Dot Product Attention on multi head attention. |
|
|
|
Notation: B - batch size, S/T - max src/trg token-sequence length |
|
query shape = (B, S, dmodel) |
|
key shape = (B, S, dmodel) |
|
val shape = (B, S, dmodel) |
|
""" |
|
|
|
        # Project the inputs with the fused linear layers: (B, S, dmodel) each.
        Q = self.WQ(query)
        K = self.WK(key)
        V = self.WV(val)

        # Split the last dimension into heads: (B, S, h, dk).
        batch_size = Q.size(0)
        Q = Q.view(batch_size, -1, self.h, self.dk)
        K = K.view(batch_size, -1, self.h, self.dk)
        V = V.view(batch_size, -1, self.h, self.dv)
|
|
|
|
|
|
|
        # Move the head dimension forward: (B, h, S, dk).
        Q = Q.permute(0, 2, 1, 3)
        K = K.permute(0, 2, 1, 3)
        V = V.permute(0, 2, 1, 3)

        # Attention scores: (B, h, S, S), scaled by sqrt(dk).
        scaled_dot_product = torch.matmul(Q, K.permute(0, 1, 3, 2)) / math.sqrt(self.dk)
|
|
|
|
|
        if mask is not None:
            # Fill disallowed positions with a large negative value so they
            # get (near-)zero weight after the softmax.
            scaled_dot_product = scaled_dot_product.masked_fill(mask == 0, -1e10)

        attn_probs = self.softmax(scaled_dot_product)

        # Weighted sum of the values per head: (B, h, S, dv).
        head = torch.matmul(self.dropout(attn_probs), V)

        # Re-join the heads: (B, S, h * dv).
        head = head.permute(0, 2, 1, 3).contiguous()
        head = head.view(batch_size, -1, self.h * self.dv)

        # Final output projection back to (B, S, dmodel).
        token_representation = self.WO(head)
        return token_representation, attn_probs
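
# A minimal smoke test for MultiHeadAttention (a sketch; dk=dv=64, h=8, and
# the causal mask below are illustrative assumptions, not values fixed by
# this module):
#
#     mha = MultiHeadAttention(dk=64, dv=64, h=8)
#     x = torch.randn(2, 10, 512)                   # (B=2, S=10, dmodel=64*8)
#     out, attn = mha(x, x, x)                      # out: (2, 10, 512)
#     print(attn.shape)                             # torch.Size([2, 8, 10, 10])
#
#     # With a causal mask (1 = attend, 0 = block), broadcast over B and h:
#     causal = torch.tril(torch.ones(10, 10)).view(1, 1, 10, 10)
#     out, attn = mha(x, x, x, mask=causal)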
|
|
|
|
|
class Embedding(nn.Module): |
|
""" |
|
Embedding lookup table which is used by the positional |
|
embedding block. |
|
Embedding lookup table is shared across input and output |
|
""" |
|
def __init__(self, vocab_size, dmodel): |
|
""" |
|
Embedding lookup needs a vocab size and model |
|
dimension size matrix for creating lookups |
|
""" |
|
        super().__init__()
        self.embedding_lookup = nn.Embedding(vocab_size, dmodel)
        self.vocab_size = vocab_size
        self.dmodel = dmodel
|
|
|
def forward(self, token_ids): |
|
""" |
|
For a given token lookup the embedding vector |
|
|
|
As per the paper, we also multiply the embedding vector with sqrt of dmodel |
|
""" |
|
        assert token_ids.ndim == 2, \
            f'Expected: (batch size, max token sequence length), got {token_ids.shape}'

        embedding_vector = self.embedding_lookup(token_ids)

        return embedding_vector * math.sqrt(self.dmodel)
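
# A quick illustration (vocab_size=1000 and dmodel=512 are assumed values):
#
#     emb = Embedding(vocab_size=1000, dmodel=512)
#     token_ids = torch.randint(0, 1000, (2, 10))   # (B=2, S=10)
#     vectors = emb(token_ids)
#     print(vectors.shape)                          # torch.Size([2, 10, 512])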
|
|
|
|
|
class PositionalEncoding(nn.Module): |
|
    def __init__(self, dmodel, max_seq_length=5000, pdropout=0.1):
|
""" |
|
dmodel(int): model dimensions |
|
max_seq_length(int): Maximum input sequence length |
|
pdropout(float): Dropout probability |
|
""" |
|
        super().__init__()
        self.dropout = nn.Dropout(p=pdropout)
|
|
|
|
|
        # Column vector of positions 0..max_seq_length-1: (max_seq_length, 1).
        position_ids = torch.arange(0, max_seq_length).unsqueeze(1)

        # Per-dimension frequencies 10000^(-2i/dmodel) for even indices 2i.
        frequencies = torch.pow(10000, -torch.arange(0, dmodel, 2, dtype=torch.float) / dmodel)

        # Interleave sine (even dims) and cosine (odd dims), as in the paper:
        # PE(pos, 2i) = sin(pos * f_i), PE(pos, 2i+1) = cos(pos * f_i).
        positional_encoding_table = torch.zeros(max_seq_length, dmodel)
        positional_encoding_table[:, 0::2] = torch.sin(position_ids * frequencies)
        positional_encoding_table[:, 1::2] = torch.cos(position_ids * frequencies)

        # Register as a buffer: saved with the module and moved with .to(device),
        # but not a trainable parameter.
        self.register_buffer("positional_encoding_table", positional_encoding_table)
|
|
|
def forward(self, embeddings_batch): |
|
""" |
|
embeddings_batch shape = (batch size, seq_length, dmodel) |
|
positional_encoding_table shape = (max_seq_length, dmodel) |
|
""" |
|
        assert embeddings_batch.ndim == 3, \
            f"Embeddings batch should have 3 dimensions but got {embeddings_batch.ndim}"
        assert embeddings_batch.shape[-1] == self.positional_encoding_table.shape[-1], \
            f"Last dimensions should match: embeddings batch has {embeddings_batch.shape[-1]} " \
            f"but positional_encoding_table has {self.positional_encoding_table.shape[-1]}"
|
|
|
|
|
        # Take the first seq_length rows of the table; broadcasting over the
        # batch dimension adds the same encodings to every sequence.
        pos_encodings = self.positional_encoding_table[:embeddings_batch.shape[1]]

        out = embeddings_batch + pos_encodings
        out = self.dropout(out)
        return out
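
# A minimal sketch chaining Embedding and PositionalEncoding (sizes assumed):
#
#     emb = Embedding(vocab_size=1000, dmodel=512)
#     pos = PositionalEncoding(dmodel=512)
#     token_ids = torch.randint(0, 1000, (2, 10))
#     x = pos(emb(token_ids))                       # (2, 10, 512), dropout applied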
|
|
|
|
|
class PositionwiseFeedForward(nn.Module): |
|
    def __init__(self, dmodel, dff, pdropout=0.1):
        super().__init__()

        self.dropout = nn.Dropout(p=pdropout)

        # Inner layer expands dmodel -> dff; outer layer projects back.
        self.W1 = nn.Linear(dmodel, dff)
        self.W2 = nn.Linear(dff, dmodel)

        self.relu = nn.ReLU()
|
|
|
def forward(self, x): |
|
""" |
|
Perform Feedforward calculation |
|
|
|
x shape = (B - batch size, S/T - max token sequence length, D- model dimension). |
|
""" |
|
out = self.W2(self.relu(self.dropout(self.W1(x)))) |
|
return out |
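
# Example usage (dff=2048 matches the paper's base configuration; the other
# sizes here are assumptions):
#
#     ffn = PositionwiseFeedForward(dmodel=512, dff=2048)
#     x = torch.randn(2, 10, 512)
#     print(ffn(x).shape)                           # torch.Size([2, 10, 512])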
|
|
|
|