import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from safetensors.torch import load_file
from transformers import RobertaModel as HFRobertaModel
from utils import RobertaConfig
from pprintpp import pprint
class RobertaEmbeddings(nn.Module):
"""
Converts our tokens to embedding vectors and then adds positional embeddings (and potentially token type embeddings)
to our data! We wont need to token type embeddings until we do our QA finetuning.
"""
def __init__(self, config):
super(RobertaEmbeddings, self).__init__()
### Embeddings for Tokens ###
self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_dimension, padding_idx=config.pad_token)
### Positional Embeddings ###
self.position_embeddings = nn.Embedding(config.context_length, config.embedding_dimension)
### Layernorm and Dropout ###
self.layernorm = nn.LayerNorm(config.embedding_dimension, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_p)
def forward(self, input_ids):
batch_size, seq_length = input_ids.shape
### Convert Tokens to Embeddings ###
x = self.word_embeddings(input_ids)
### Add Positional Information ###
avail_idx = torch.arange(0, seq_length, dtype=torch.long, device=input_ids.device)
pos_embed = self.position_embeddings(avail_idx)
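        ### pos_embed has shape (Sequence Length x Embedding Dim) and broadcasts across the batch when added to x ###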
x = x + pos_embed
x = self.layernorm(x)
x = self.dropout(x)
return x
class RobertaAttention(nn.Module):
"""
Regular Self-Attention but in this case we utilize flash_attention
incorporated in the F.scaled_dot_product_attention to speed up our training.
"""
def __init__(self, config):
super(RobertaAttention, self).__init__()
### Store Config ###
self.config = config
### Sanity Checks ###
assert config.embedding_dimension % config.num_attention_heads == 0, "Double check embedding dim divisible by number of heads"
### Attention Head Dim ###
self.head_dim = config.embedding_dimension // config.num_attention_heads
### Attention Projections ###
self.q_proj = nn.Linear(config.embedding_dimension, config.embedding_dimension)
self.k_proj = nn.Linear(config.embedding_dimension, config.embedding_dimension)
self.v_proj = nn.Linear(config.embedding_dimension, config.embedding_dimension)
### Post Attention Projection ###
self.out_proj = nn.Linear(config.embedding_dimension, config.embedding_dimension)
def forward(self, x, attention_mask=None):
### Store Shape ###
batch, seq_len, embed_dim = x.shape
### Compute Attention with Flash Attention ###
        q = self.q_proj(x).reshape(batch, seq_len, self.config.num_attention_heads, self.head_dim).transpose(1, 2).contiguous()
        k = self.k_proj(x).reshape(batch, seq_len, self.config.num_attention_heads, self.head_dim).transpose(1, 2).contiguous()
        v = self.v_proj(x).reshape(batch, seq_len, self.config.num_attention_heads, self.head_dim).transpose(1, 2).contiguous()
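        ### q, k and v each have shape (Batch x Heads x Sequence Length x Head Dim) ###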
### Compute Attention (Attention Mask has shape Batch x Sequence len x Sequence len) ###
attention_out = F.scaled_dot_product_attention(q, k, v,
attn_mask=attention_mask,
dropout_p=self.config.attention_dropout_p if self.training else 0.0)
### Compute Output Projection ###
attention_out = attention_out.transpose(1, 2).flatten(2)
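        ### Back to (Batch x Sequence Length x Embedding Dim) before the output projection ###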
attention_out = self.out_proj(attention_out)
return attention_out
class RobertaFeedForward(nn.Module):
"""
Regular MLP module after our attention computation.
"""
def __init__(self, config):
super(RobertaFeedForward, self).__init__()
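        ### Standard transformer MLP: expand by mlp_ratio, apply GELU, then project back to the embedding dimension ###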
hidden_size = config.embedding_dimension * config.mlp_ratio
self.intermediate_dense = nn.Linear(config.embedding_dimension, hidden_size)
self.activation = nn.GELU()
self.intermediate_dropout = nn.Dropout(config.hidden_dropout_p)
self.output_dense = nn.Linear(hidden_size, config.embedding_dimension)
self.output_dropout = nn.Dropout(config.hidden_dropout_p)
def forward(self, x):
x = self.intermediate_dense(x)
x = self.activation(x)
x = self.intermediate_dropout(x)
x = self.output_dense(x)
x = self.output_dropout(x)
return x
class RobertaEncoderLayer(nn.Module):
"""
Single transformer block stacking together Attention and our FeedForward
layers, with normalization and residual connections.
"""
def __init__(self, config):
super(RobertaEncoderLayer, self).__init__()
self.attention = RobertaAttention(config)
self.dropout = nn.Dropout(config.hidden_dropout_p)
self.layer_norm = nn.LayerNorm(config.embedding_dimension, eps=config.layer_norm_eps)
self.feed_forward = RobertaFeedForward(config)
self.final_layer_norm = nn.LayerNorm(config.embedding_dimension, eps=config.layer_norm_eps)
def forward(self, x, attention_mask=None):
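        ### Post-LN block (residual add, then LayerNorm), matching the original BERT/RoBERTa layout ###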
x = x + self.dropout(self.attention(x, attention_mask=attention_mask))
x = self.layer_norm(x)
x = x + self.feed_forward(x)
x = self.final_layer_norm(x)
return x
class RobertaEncoder(nn.Module):
"""
This will be the stack of all of our transformer blocks
"""
def __init__(self, config):
super(RobertaEncoder, self).__init__()
self.config = config
### Transformer Layers ###
self.layers = nn.ModuleList(
[
RobertaEncoderLayer(config) for _ in range(config.num_transformer_blocks)
]
)
def forward(
self,
x,
attention_mask=None,
):
batch_size, seq_len, embed_dim = x.shape
if attention_mask is not None:
### Make Sure Attention Mask is a Boolean Tensor ###
attention_mask = attention_mask.bool()
### Now our Attention Mask is in (Batch x Sequence Length) where we have 0 for tokens we don't want to attend to ###
### F.scaled_dot_product_attention expects a mask of the shape (Batch x ..., x Seq_len x Seq_len) ###
### the "..." in this case is any extra dimensions (such as heads of attention). lets expand our mask to (Batch x 1 x Seq_len x Seq_len) ###
### The 1 in this case refers to the number of heads of attention we want, so it is a dummy index to broadcast over ###
### In each (Seq_len x Seq_len) matrix for every batch, we want False for all columns corresponding to padding tokens ###
attention_mask = attention_mask.unsqueeze(1).unsqueeze(1).repeat(1, 1, seq_len, 1)
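            ### Final mask shape: (Batch x 1 x Seq_len x Seq_len); the singleton head dimension broadcasts over all attention heads ###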
for layer in self.layers:
x = layer(x, attention_mask=attention_mask)
return x
class RobertaMLMHead(nn.Module):
"""
The Masked Language model head is a stack of two linear layers with an activation in between!
"""
def __init__(self, config):
super(RobertaMLMHead, self).__init__()
self.config = config
### Projection Layer for Hidden States ###
self.dense = nn.Linear(config.embedding_dimension, config.embedding_dimension)
self.layer_norm = nn.LayerNorm(config.embedding_dimension, eps=config.layer_norm_eps)
self.activation = nn.GELU()
### Mapping to Vocabulary ###
self.decoder = nn.Linear(config.embedding_dimension, config.vocab_size)
def forward(self, inputs):
### Pass through Projection/Activation/Norm ###
x = self.dense(inputs)
x = self.activation(x)
x = self.layer_norm(x)
### Prediction of Masked Tokens ###
x = self.decoder(x)
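        ### Logits over the vocabulary: (Batch x Sequence Length x Vocab Size) ###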
return x
class RobertaModel(nn.Module):
"""
Backbone of our model, has to be pretrained via MLM on a ton of data!
"""
def __init__(self, config):
super(RobertaModel, self).__init__()
self.config = config
### Define all Parts of the Model ###
self.embeddings = RobertaEmbeddings(config)
self.encoder = RobertaEncoder(config)
def forward(self, input_ids, attention_mask=None):
embeddings = self.embeddings(input_ids)
output = self.encoder(embeddings, attention_mask)
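        ### Contextualized token representations of shape (Batch x Sequence Length x Embedding Dim) ###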
return output
class RobertaForMaskedLM(nn.Module):
"""
This model will perform the masked language modeling task.
"""
def __init__(self, config):
super(RobertaForMaskedLM, self).__init__()
self.config = config
### Define Model and MLM Head ###
self.roberta = RobertaModel(config)
self.mlm_head = RobertaMLMHead(config)
self.apply(_init_weights_)
def forward(self,
input_ids,
attention_mask=None,
labels=None):
### Pass data through model ###
        hidden_states = self.roberta(input_ids, attention_mask=attention_mask)
preds = self.mlm_head(hidden_states)
### Compute Loss if Labels are Available ###
loss = None
if labels is not None:
### Flatten Logits to (B*S x N) and Labels to (B*S) ###
preds = preds.flatten(end_dim=1)
labels = labels.flatten()
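            ### F.cross_entropy skips positions labeled -100 (its default ignore_index), so we assume unmasked tokens carry a label of -100 ###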
loss = F.cross_entropy(preds, labels)
return hidden_states, preds, loss
else:
return hidden_states, preds
class RobertaForQuestionAnswering(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.load_backbone()
self.qa_head = nn.Linear(config.embedding_dimension, 2)
def load_backbone(self):
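        ### "pretrained_huggingface" loads the HF checkpoint, "pretrained" loads our own safetensors weights, anything else leaves the backbone randomly initialized ###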
if self.config.pretrained_backbone == "pretrained_huggingface":
print("Loading Huggingface RoBERTa Model")
self.roberta = HFRobertaModel.from_pretrained(self.config.hf_model_name)
else:
self.roberta = RobertaModel(self.config)
if self.config.pretrained_backbone == "pretrained":
                if self.config.path_to_pretrained_weights is None:
                    raise Exception(
                        "Provide the argument `path_to_pretrained_weights` in the config, else we can't load the weights!")
else:
if not os.path.isfile(self.config.path_to_pretrained_weights):
raise Exception(
f"Provided path to safetensors weights {self.config.path_to_pretrained_weights} is invalid!")
print(f"Loading RobertaModel Backbone from {self.config.path_to_pretrained_weights}")
state_dict = load_file(self.config.path_to_pretrained_weights)
# Filter and rename keys
backbone_keys = {}
                    for key in state_dict.keys():
                        if "roberta" in key:
                            new_key = key.replace("roberta.", "")
                            backbone_keys[new_key] = state_dict[key]
self.roberta.load_state_dict(backbone_keys)
def forward(self,
input_ids,
attention_mask=None,
start_positions=None,
end_positions=None):
if self.config.pretrained_backbone == "pretrained_huggingface":
output = self.roberta(input_ids, attention_mask=attention_mask).last_hidden_state
else:
output = self.roberta(input_ids, attention_mask=attention_mask)
        logits = self.qa_head(output)
        ### Split the two output channels into start and end logits, each of shape (Batch x Sequence Length) ###
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
        if start_positions is not None and end_positions is not None:
            ### If positions come in with an extra trailing dim (Batch x 1), squeeze them down to (Batch,) ###
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
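            ### Positions that fall outside the sequence get clamped to seq_len and are then excluded from the loss via ignore_index ###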
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
start_loss = F.cross_entropy(start_logits, start_positions, ignore_index=ignored_index)
end_loss = F.cross_entropy(end_logits, end_positions, ignore_index=ignored_index)
total_loss = (start_loss + end_loss) / 2
return total_loss, start_logits, end_logits
return start_logits, end_logits
def _init_weights_(module):
"""
Simple weight intialization taken directly from the huggingface
`modeling_roberta.py` implementation!
"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=0.02)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=0.02)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if __name__ == "__main__":
    config = RobertaConfig(pretrained_backbone="pretrained",
path_to_pretrained_weights="/home/tangsan/AllNlpProject/CoQAChat/model/RoBERTa/finetune_qa_hf_roberta_backbone/checkpoint-27162/model.safetensors")
model = RobertaForQuestionAnswering(config=config)
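    ### Quick smoke test (assumes the config/weights above exist): random token ids for a batch of 4 sequences of length 8 ###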
    rand = torch.randint(0, 100, size=(4, 8))
    start_positions = torch.tensor([1, 2, 3, 4])
    end_positions = torch.tensor([5, 6, 7, 8])
model(rand, start_positions=start_positions, end_positions=end_positions)