# importing required libraries
import math
import copy
import time
import random
import spacy
import numpy as np
import os
# torch packages
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
import torch.optim as optim
class MultiHeadAttention(nn.Module):
"""
We can refer to the following blog to understand in depth about the transformer and MHA
https://medium.com/@hunter-j-phillips/multi-head-attention-7924371d477a
Here we are clubbing all the linear layers together and duplicating the inputs and
then performing matrix multiplications
"""
def __init__(self, dk, dv, h, pdropout=0.1):
"""
Input Args:
dk(int): Key dimensions used for generating Key weight matrix
dv(int): Val dimensions used for generating val weight matrix
h(int) : Number of heads in MHA
"""
super().__init__()
assert dk == dv
self.dk = dk
self.dv = dv
self.h = h
self.dmodel = self.dk * self.h # model dimension
# Fused projection weights: rather than keeping h separate (dmodel, dk) matrices,
# WQ, WK, WV each project dmodel -> dmodel and the result is split across heads later
self.WQ = nn.Linear(self.dmodel, self.dmodel) # shape -> (dmodel, dmodel)
self.WK = nn.Linear(self.dmodel, self.dmodel) # shape -> (dmodel, dmodel)
self.WV = nn.Linear(self.dmodel, self.dmodel) # shape -> (dmodel, dmodel)
# Output Weights
self.WO = nn.Linear(self.h*self.dv, self.dmodel) # shape -> (dmodel, dmodel)
self.softmax = nn.Softmax(dim=-1)
self.dropout = nn.Dropout(p = pdropout)
def forward(self, query, key, val, mask=None):
"""
Forward pass for MHA
X has a size of (batch_size, seq_length, d_model)
Wq, Wk, and Wv have a size of (d_model, d_model)
Perform Scaled Dot Product Attention on multi head attention.
Notation: B - batch size, S/T - max src/trg token-sequence length
query shape = (B, S, dmodel)
key shape = (B, S, dmodel)
val shape = (B, S, dmodel)
"""
# Weight the queries
Q = self.WQ(query) # shape -> (B, S, dmodel)
K = self.WK(key) # shape -> (B, S, dmodel)
V = self.WV(val) # shape -> (B, S, dmodel)
# Split the last dimension into the number of heads and dk
batch_size = Q.size(0)
Q = Q.view(batch_size, -1, self.h, self.dk) # shape -> (B, S, h, dk)
K = K.view(batch_size, -1, self.h, self.dk) # shape -> (B, S, h, dk)
V = V.view(batch_size, -1, self.h, self.dk) # shape -> (B, S, h, dk)
# each sequence is split across n_heads, with each head receiving seq_length tokens
# with d_key elements in each token instead of d_model.
Q = Q.permute(0, 2, 1, 3) # shape -> (B, h, S, dk)
K = K.permute(0, 2, 1, 3) # shape -> (B, h, S, dk)
V = V.permute(0, 2, 1, 3) # shape -> (B, h, S, dk)
# dot product of Q and K
scaled_dot_product = torch.matmul(Q, K.permute(0, 1, 3, 2)) / math.sqrt(self.dk)
# fill positions where the mask is 0 with a large negative value (-1e10) so softmax assigns them ~zero weight
if mask is not None:
scaled_dot_product = scaled_dot_product.masked_fill(mask == 0, -1e10)
attn_probs = self.softmax(scaled_dot_product)
# Create head
head = torch.matmul(self.dropout(attn_probs), V) # shape -> (B, h, S, S) * (B, h, S, dk) = (B, h, S, dk)
# Prepare the head to pass it through output linear layer
head = head.permute(0, 2, 1, 3).contiguous() # shape -> (B, S, h, dk)
# Concatenate the heads together
head = head.view(batch_size, -1, self.h* self.dk) # shape -> (B, S, (h*dk = dmodel))
# Pass through output layer
token_representation = self.WO(head)
return token_representation, attn_probs
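# --- Hedged usage sketch (illustration only, not part of the original model code) ---
# Assuming dk = dv = 64 and h = 8 (so dmodel = 512), the module maps (B, S, 512)
# query/key/value tensors to a (B, S, 512) representation and also returns the
# (B, h, S, S) attention weights.
def _mha_shape_example():
    mha = MultiHeadAttention(dk=64, dv=64, h=8)
    x = torch.randn(2, 10, 64 * 8)               # (B=2, S=10, dmodel=512)
    causal_mask = torch.tril(torch.ones(10, 10))  # (S, S), broadcast over batch and heads
    out, attn = mha(x, x, x, mask=causal_mask)
    assert out.shape == (2, 10, 512)
    assert attn.shape == (2, 8, 10, 10)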
class Embedding(nn.Module):
"""
Embedding lookup table which is used by the positional
embedding block.
Embedding lookup table is shared across input and output
"""
def __init__(self, vocab_size, dmodel):
"""
Embedding lookup needs a vocab size and model
dimension size matrix for creating lookups
"""
super().__init__()
self.embedding_lookup = nn.Embedding(vocab_size, dmodel)
self.vocab_size = vocab_size
self.dmodel = dmodel
def forward(self, token_ids):
"""
For a given token lookup the embedding vector
As per the paper, we also multiply the embedding vector with sqrt of dmodel
"""
assert token_ids.ndim == 2, \
f'Expected: (batch size, max token sequence length), got {token_ids.shape}'
embedding_vector = self.embedding_lookup(token_ids)
return embedding_vector * math.sqrt(self.dmodel)
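# --- Hedged usage sketch (illustration only, not part of the original model code) ---
# With an assumed vocabulary of 1000 tokens and dmodel = 512, a (B, S) batch of
# token ids maps to (B, S, 512) embeddings scaled by sqrt(512).
def _embedding_example():
    emb = Embedding(vocab_size=1000, dmodel=512)
    token_ids = torch.randint(0, 1000, (2, 10))  # (B=2, S=10)
    vectors = emb(token_ids)
    assert vectors.shape == (2, 10, 512)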
class PositionalEncoding(nn.Module):
def __init__(self, dmodel, max_seq_length = 5000, pdropout = 0.1):
"""
dmodel(int): model dimensions
max_seq_length(int): Maximum input sequence length
pdropout(float): Dropout probability
"""
super().__init__()
self.dropout = nn.Dropout(p = pdropout)
# Calculate frequencies
position_ids = torch.arange(0, max_seq_length).unsqueeze(1)
# The negative sign inverts the exponent: 10000^(-2i/dmodel) = 1 / 10000^(2i/dmodel)
frequencies = torch.pow(10000, -torch.arange(0, dmodel, 2, dtype = torch.float)/ dmodel)
# Create positional encoding table
positional_encoding_table = torch.zeros(max_seq_length, dmodel)
# Fill the even dimensions with sin and the odd dimensions with cos
positional_encoding_table[:, 0::2] = torch.sin(position_ids * frequencies)
positional_encoding_table[:, 1::2] = torch.cos(position_ids * frequencies)
# Register the positional encoding as a buffer: it is stored in the state_dict
# but not returned by named_parameters(), since it is not trainable
self.register_buffer("positional_encoding_table", positional_encoding_table)
def forward(self, embeddings_batch):
"""
embeddings_batch shape = (batch size, seq_length, dmodel)
positional_encoding_table shape = (max_seq_length, dmodel)
"""
assert embeddings_batch.ndim == 3, \
f"Embeddings batch should have dimension of 3 but got {embeddings_batch.ndim}"
assert embeddings_batch.size()[-1] == self.positional_encoding_table.size()[-1], \
f"Embedding batch last dimension ({embeddings_batch.shape[-1]}) should match the positional_encoding_table last dimension ({self.positional_encoding_table.shape[-1]})"
# Get encodings for the given input sequence length
pos_encodings = self.positional_encoding_table[:embeddings_batch.shape[1]] # Choose only seq_length out of max_seq_length
# Final output
out = embeddings_batch + pos_encodings
out = self.dropout(out)
return out
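# --- Hedged usage sketch (illustration only, not part of the original model code) ---
# Positional encodings of shape (seq_length, dmodel) are broadcast-added to a
# (B, seq_length, dmodel) batch of embeddings; with zero inputs and pdropout = 0.0
# the output is exactly the first seq_length rows of the encoding table.
def _positional_encoding_example():
    pe = PositionalEncoding(dmodel=512, max_seq_length=100, pdropout=0.0)
    embeddings = torch.zeros(2, 10, 512)  # (B=2, S=10, dmodel=512)
    encoded = pe(embeddings)
    assert encoded.shape == (2, 10, 512)
    assert torch.allclose(encoded[0], pe.positional_encoding_table[:10])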
class PositionwiseFeedForward(nn.Module):
def __init__(self, dmodel, dff, pdropout = 0.1):
"""
dmodel(int): model dimensions
dff(int): hidden (inner) feed-forward dimension; the paper uses 2048 with dmodel = 512
pdropout(float): Dropout probability
"""
super().__init__()
self.dropout = nn.Dropout(p = pdropout)
self.W1 = nn.Linear(dmodel, dff) # Intermediate layer
self.W2 = nn.Linear(dff, dmodel) # Output layer
self.relu = nn.ReLU()
def forward(self, x):
"""
Perform Feedforward calculation
x shape = (B - batch size, S/T - max token sequence length, D- model dimension).
"""
out = self.W2(self.relu(self.dropout(self.W1(x))))
return out
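# --- Hedged usage sketch (illustration only, not part of the original model code) ---
# The feed-forward block expands dmodel -> dff and projects back, position by position,
# so the sequence length is unchanged. dff = 2048 with dmodel = 512 matches the paper.
def _feed_forward_example():
    ffn = PositionwiseFeedForward(dmodel=512, dff=2048)
    x = torch.randn(2, 10, 512)  # (B=2, S=10, dmodel=512)
    out = ffn(x)
    assert out.shape == (2, 10, 512)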