import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from safetensors.torch import load_file
from transformers import RobertaModel as HFRobertaModel
from utils import RobertaConfig
from pprintpp import pprint
class RobertaEmbeddings(nn.Module):
"""
Converts our tokens to embedding vectors and then adds positional embeddings (and potentially token type embeddings)
to our data! We wont need to token type embeddings until we do our QA finetuning.
"""
def __init__(self, config):
super(RobertaEmbeddings, self).__init__()
### Embeddings for Tokens ###
self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_dimension, padding_idx=config.pad_token)
### Positional Embeddings ###
self.position_embeddings = nn.Embedding(config.context_length, config.embedding_dimension)
### Layernorm and Dropout ###
self.layernorm = nn.LayerNorm(config.embedding_dimension, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_p)
def forward(self, input_ids):
batch_size, seq_length = input_ids.shape
### Convert Tokens to Embeddings ###
x = self.word_embeddings(input_ids)
### Add Positional Information ###
avail_idx = torch.arange(0, seq_length, dtype=torch.long, device=input_ids.device)
pos_embed = self.position_embeddings(avail_idx)
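        ### pos_embed has shape (Sequence Length x Embedding Dim) and broadcasts across the batch when added to x ###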
x = x + pos_embed
x = self.layernorm(x)
x = self.dropout(x)
return x
class RobertaAttention(nn.Module):
"""
Regular Self-Attention but in this case we utilize flash_attention
incorporated in the F.scaled_dot_product_attention to speed up our training.
"""
def __init__(self, config):
super(RobertaAttention, self).__init__()
### Store Config ###
self.config = config
### Sanity Checks ###
assert config.embedding_dimension % config.num_attention_heads == 0, "Double check embedding dim divisible by number of heads"
### Attention Head Dim ###
self.head_dim = config.embedding_dimension // config.num_attention_heads
### Attention Projections ###
self.q_proj = nn.Linear(config.embedding_dimension, config.embedding_dimension)
self.k_proj = nn.Linear(config.embedding_dimension, config.embedding_dimension)
self.v_proj = nn.Linear(config.embedding_dimension, config.embedding_dimension)
### Post Attention Projection ###
self.out_proj = nn.Linear(config.embedding_dimension, config.embedding_dimension)
def forward(self, x, attention_mask=None):
### Store Shape ###
batch, seq_len, embed_dim = x.shape
### Compute Attention with Flash Attention ###
        q = self.q_proj(x).reshape(batch, seq_len, self.config.num_attention_heads, self.head_dim).transpose(1, 2).contiguous()
        k = self.k_proj(x).reshape(batch, seq_len, self.config.num_attention_heads, self.head_dim).transpose(1, 2).contiguous()
        v = self.v_proj(x).reshape(batch, seq_len, self.config.num_attention_heads, self.head_dim).transpose(1, 2).contiguous()
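        ### q, k and v each have shape (Batch x Heads x Sequence Length x Head Dim) ###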
### Compute Attention (Attention Mask has shape Batch x Sequence len x Sequence len) ###
attention_out = F.scaled_dot_product_attention(q, k, v,
attn_mask=attention_mask,
dropout_p=self.config.attention_dropout_p if self.training else 0.0)
### Compute Output Projection ###
attention_out = attention_out.transpose(1, 2).flatten(2)
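        ### Back to (Batch x Sequence Length x Embedding Dim) before the output projection ###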
attention_out = self.out_proj(attention_out)
return attention_out
class RobertaFeedForward(nn.Module):
"""
Regular MLP module after our attention computation.
"""
def __init__(self, config):
super(RobertaFeedForward, self).__init__()
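        ### Standard transformer MLP: expand by mlp_ratio, apply GELU, then project back to the embedding dimension ###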
hidden_size = config.embedding_dimension * config.mlp_ratio
self.intermediate_dense = nn.Linear(config.embedding_dimension, hidden_size)
self.activation = nn.GELU()
self.intermediate_dropout = nn.Dropout(config.hidden_dropout_p)
self.output_dense = nn.Linear(hidden_size, config.embedding_dimension)
self.output_dropout = nn.Dropout(config.hidden_dropout_p)
def forward(self, x):
x = self.intermediate_dense(x)
x = self.activation(x)
x = self.intermediate_dropout(x)
x = self.output_dense(x)
x = self.output_dropout(x)
return x
class RobertaEncoderLayer(nn.Module):
"""
Single transformer block stacking together Attention and our FeedForward
layers, with normalization and residual connections.
"""
def __init__(self, config):
super(RobertaEncoderLayer, self).__init__()
self.attention = RobertaAttention(config)
self.dropout = nn.Dropout(config.hidden_dropout_p)
self.layer_norm = nn.LayerNorm(config.embedding_dimension, eps=config.layer_norm_eps)
self.feed_forward = RobertaFeedForward(config)
self.final_layer_norm = nn.LayerNorm(config.embedding_dimension, eps=config.layer_norm_eps)
def forward(self, x, attention_mask=None):
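        ### Post-LN block (residual add, then LayerNorm), matching the original BERT/RoBERTa layout ###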
x = x + self.dropout(self.attention(x, attention_mask=attention_mask))
x = self.layer_norm(x)
x = x + self.feed_forward(x)
x = self.final_layer_norm(x)
return x
class RobertaEncoder(nn.Module):
"""
This will be the stack of all of our transformer blocks
"""
def __init__(self, config):
super(RobertaEncoder, self).__init__()
self.config = config
### Transformer Layers ###
self.layers = nn.ModuleList(
[
RobertaEncoderLayer(config) for _ in range(config.num_transformer_blocks)
]
)
def forward(
self,
x,
attention_mask=None,
):
batch_size, seq_len, embed_dim = x.shape
if attention_mask is not None:
### Make Sure Attention Mask is a Boolean Tensor ###
attention_mask = attention_mask.bool()
### Now our Attention Mask is in (Batch x Sequence Length) where we have 0 for tokens we don't want to attend to ###
### F.scaled_dot_product_attention expects a mask of the shape (Batch x ..., x Seq_len x Seq_len) ###
### the "..." in this case is any extra dimensions (such as heads of attention). lets expand our mask to (Batch x 1 x Seq_len x Seq_len) ###
### The 1 in this case refers to the number of heads of attention we want, so it is a dummy index to broadcast over ###
### In each (Seq_len x Seq_len) matrix for every batch, we want False for all columns corresponding to padding tokens ###
attention_mask = attention_mask.unsqueeze(1).unsqueeze(1).repeat(1, 1, seq_len, 1)
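            ### Final mask shape: (Batch x 1 x Seq_len x Seq_len); the singleton head dimension broadcasts over all attention heads ###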
for layer in self.layers:
x = layer(x, attention_mask=attention_mask)
return x
class RobertaMLMHead(nn.Module):
"""
The Masked Language model head is a stack of two linear layers with an activation in between!
"""
def __init__(self, config):
super(RobertaMLMHead, self).__init__()
self.config = config
### Projection Layer for Hidden States ###
self.dense = nn.Linear(config.embedding_dimension, config.embedding_dimension)
self.layer_norm = nn.LayerNorm(config.embedding_dimension, eps=config.layer_norm_eps)
self.activation = nn.GELU()
### Mapping to Vocabulary ###
self.decoder = nn.Linear(config.embedding_dimension, config.vocab_size)
def forward(self, inputs):
### Pass through Projection/Activation/Norm ###
x = self.dense(inputs)
x = self.activation(x)
x = self.layer_norm(x)
### Prediction of Masked Tokens ###
x = self.decoder(x)
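        ### Logits over the vocabulary: (Batch x Sequence Length x Vocab Size) ###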
return x
class RobertaModel(nn.Module):
"""
Backbone of our model, has to be pretrained via MLM on a ton of data!
"""
def __init__(self, config):
super(RobertaModel, self).__init__()
self.config = config
### Define all Parts of the Model ###
self.embeddings = RobertaEmbeddings(config)
self.encoder = RobertaEncoder(config)
def forward(self, input_ids, attention_mask=None):
embeddings = self.embeddings(input_ids)
output = self.encoder(embeddings, attention_mask)
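        ### Contextualized token representations of shape (Batch x Sequence Length x Embedding Dim) ###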
return output
class RobertaForMaskedLM(nn.Module):
"""
This model will perform the masked language modeling task.
"""
def __init__(self, config):
super(RobertaForMaskedLM, self).__init__()
self.config = config
### Define Model and MLM Head ###
self.roberta = RobertaModel(config)
self.mlm_head = RobertaMLMHead(config)
self.apply(_init_weights_)
def forward(self,
input_ids,
attention_mask=None,
labels=None):
### Pass data through model ###
        hidden_states = self.roberta(input_ids, attention_mask=attention_mask)
preds = self.mlm_head(hidden_states)
### Compute Loss if Labels are Available ###
loss = None
if labels is not None:
### Flatten Logits to (B*S x N) and Labels to (B*S) ###
preds = preds.flatten(end_dim=1)
labels = labels.flatten()
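            ### F.cross_entropy skips positions labeled -100 (its default ignore_index), so we assume unmasked tokens carry a label of -100 ###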
loss = F.cross_entropy(preds, labels)
return hidden_states, preds, loss
else:
return hidden_states, preds
class RobertaForQuestionAnswering(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.load_backbone()
self.qa_head = nn.Linear(config.embedding_dimension, 2)
def load_backbone(self):
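        ### "pretrained_huggingface" loads the HF checkpoint, "pretrained" loads our own safetensors weights, anything else leaves the backbone randomly initialized ###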
if self.config.pretrained_backbone == "pretrained_huggingface":
print("Loading Huggingface RoBERTa Model")
self.roberta = HFRobertaModel.from_pretrained(self.config.hf_model_name)
else:
self.roberta = RobertaModel(self.config)
if self.config.pretrained_backbone == "pretrained":
                if self.config.path_to_pretrained_weights is None:
                    raise Exception(
                        "Provide the argument `path_to_pretrained_weights` in the config, else we can't load the weights!")
else:
if not os.path.isfile(self.config.path_to_pretrained_weights):
raise Exception(
f"Provided path to safetensors weights {self.config.path_to_pretrained_weights} is invalid!")
print(f"Loading RobertaModel Backbone from {self.config.path_to_pretrained_weights}")
state_dict = load_file(self.config.path_to_pretrained_weights)
# Filter and rename keys
backbone_keys = {}
                    for key in state_dict.keys():
                        if "roberta" in key:
                            new_key = key.replace("roberta.", "")
                            backbone_keys[new_key] = state_dict[key]
self.roberta.load_state_dict(backbone_keys)
def forward(self,
input_ids,
attention_mask=None,
start_positions=None,
end_positions=None):
if self.config.pretrained_backbone == "pretrained_huggingface":
output = self.roberta(input_ids, attention_mask=attention_mask).last_hidden_state
else:
output = self.roberta(input_ids, attention_mask=attention_mask)
        logits = self.qa_head(output)
        ### Split the two output channels into start and end logits, each of shape (Batch x Sequence Length) ###
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
        if start_positions is not None and end_positions is not None:
            ### If positions come in with an extra trailing dim (Batch x 1), squeeze them down to (Batch,) ###
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
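            ### Positions that fall outside the sequence get clamped to seq_len and are then excluded from the loss via ignore_index ###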
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
start_loss = F.cross_entropy(start_logits, start_positions, ignore_index=ignored_index)
end_loss = F.cross_entropy(end_logits, end_positions, ignore_index=ignored_index)
total_loss = (start_loss + end_loss) / 2
return total_loss, start_logits, end_logits
return start_logits, end_logits
def _init_weights_(module):
"""
Simple weight intialization taken directly from the huggingface
`modeling_roberta.py` implementation!
"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=0.02)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=0.02)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if __name__ == "__main__":
    config = RobertaConfig(pretrained_backbone="pretrained",
path_to_pretrained_weights="/home/tangsan/AllNlpProject/CoQAChat/model/RoBERTa/finetune_qa_hf_roberta_backbone/checkpoint-27162/model.safetensors")
model = RobertaForQuestionAnswering(config=config)
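    ### Quick smoke test (assumes the config/weights above exist): random token ids for a batch of 4 sequences of length 8 ###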
    rand = torch.randint(0, 100, size=(4, 8))
    start_positions = torch.tensor([1, 2, 3, 4])
    end_positions = torch.tensor([5, 6, 7, 8])
model(rand, start_positions=start_positions, end_positions=end_positions)