# flake8: noqa
"""
 * Copyright (c) 2023, salesforce.com, inc.
"""
from typing import Tuple

import torch
import torch.utils.checkpoint
from torch import Tensor, device, nn
from torch.nn import CrossEntropyLoss
from transformers.activations import ACT2FN
from transformers.modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions)
from transformers.modeling_utils import apply_chunking_to_forward
from transformers.models.bert.configuration_bert import BertConfig
from transformers.utils import logging

from mmpretrain.registry import MODELS
from ..blip.language_model import (BertAttention, BertIntermediate,
                                   BertOnlyMLMHead, BertOutput, BertPooler,
                                   BertPreTrainedModel)

logger = logging.get_logger(__name__)


class BertEmbeddings(nn.Module):
    """Construct the embeddings from word and position embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(
            config.vocab_size,
            config.hidden_size,
            padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                                config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model
        # variable name and be able to load any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(
            config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and
        # exported when serialized
        self.register_buffer(
            'position_ids',
            torch.arange(config.max_position_embeddings).expand((1, -1)))
        self.position_embedding_type = getattr(config,
                                               'position_embedding_type',
                                               'absolute')

        self.config = config

    def forward(
        self,
        input_ids=None,
        position_ids=None,
        query_embeds=None,
        past_key_values_length=0,
    ):
        if input_ids is not None:
            seq_length = input_ids.size()[1]
        else:
            seq_length = 0

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length:
                                             seq_length +
                                             past_key_values_length].clone()

        if input_ids is not None:
            embeddings = self.word_embeddings(input_ids)
            if self.position_embedding_type == 'absolute':
                position_embeddings = self.position_embeddings(position_ids)
                embeddings = embeddings + position_embeddings

            if query_embeds is not None:
                embeddings = torch.cat((query_embeds, embeddings), dim=1)
        else:
            embeddings = query_embeds

        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
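
# Shape sketch for BertEmbeddings.forward (illustrative values, not taken from
# this file): with query_embeds of shape (B, 32, H) and input_ids of shape
# (B, 20), the returned embeddings have shape (B, 52, H) -- the learned query
# tokens are prepended to the word+position embeddings of the text tokens.
# Position embeddings are only added to the text part; the query tokens carry
# no positional information here. LayerNorm and dropout are applied to the
# concatenated sequence as a whole.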


class BertLayer(nn.Module):

    def __init__(self, config, layer_num):
        super().__init__()
        self.config = config
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = BertAttention(config)
        self.layer_num = layer_num
        if (self.config.add_cross_attention
                and layer_num % self.config.cross_attention_freq == 0):
            self.crossattention = BertAttention(
                config, is_cross_attention=self.config.add_cross_attention)
            self.has_cross_attention = True
        else:
            self.has_cross_attention = False
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

        self.intermediate_query = BertIntermediate(config)
        self.output_query = BertOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
        query_length=0,
    ):
        # decoder uni-directional self-attention cached key/values tuple is
        # at positions 1, 2
        self_attn_past_key_value = (
            past_key_value[:2] if past_key_value is not None else None)
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:-1]

        present_key_value = self_attention_outputs[-1]

        if query_length > 0:
            query_attention_output = attention_output[:, :query_length, :]

            if self.has_cross_attention:
                assert (
                    encoder_hidden_states is not None
                ), 'encoder_hidden_states must be given for cross-attention layers'
                cross_attention_outputs = self.crossattention(
                    query_attention_output,
                    attention_mask,
                    head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    output_attentions=output_attentions,
                )
                query_attention_output = cross_attention_outputs[0]
                outputs = (
                    outputs + cross_attention_outputs[1:-1]
                )  # add cross attentions if we output attention weights

            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk_query,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                query_attention_output,
            )
            if attention_output.shape[1] > query_length:
                layer_output_text = apply_chunking_to_forward(
                    self.feed_forward_chunk,
                    self.chunk_size_feed_forward,
                    self.seq_len_dim,
                    attention_output[:, query_length:, :],
                )
                layer_output = torch.cat([layer_output, layer_output_text],
                                         dim=1)
        else:
            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                attention_output,
            )
        outputs = (layer_output, ) + outputs

        outputs = outputs + (present_key_value, )

        return outputs

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output

    def feed_forward_chunk_query(self, attention_output):
        intermediate_output = self.intermediate_query(attention_output)
        layer_output = self.output_query(intermediate_output, attention_output)
        return layer_output
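
# Routing sketch for BertLayer.forward (a reading aid, not part of the original
# file): the first `query_length` positions of the sequence are treated as
# query tokens. They attend to `encoder_hidden_states` via cross-attention
# (only on layers where `add_cross_attention` is enabled and
# `layer_num % cross_attention_freq == 0`) and are processed by the
# query-specific feed-forward pair (intermediate_query / output_query), while
# the remaining text positions go through the standard feed-forward pair
# (intermediate / output). The two results are concatenated back along the
# sequence dimension before being returned.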


class BertEncoder(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList(
            [BertLayer(config, i) for i in range(config.num_hidden_layers)])

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        query_length=0,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = (() if output_attentions
                                and self.config.add_cross_attention else None)

        next_decoder_cache = () if use_cache else None

        for i in range(self.config.num_hidden_layers):
            layer_module = self.layer[i]
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states, )

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[
                i] if past_key_values is not None else None

            if getattr(self.config, 'gradient_checkpointing',
                       False) and self.training:

                if use_cache:
                    logger.warn(
                        '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
                    )
                    use_cache = False

                def create_custom_forward(module):

                    def custom_forward(*inputs):
                        return module(*inputs, past_key_value,
                                      output_attentions, query_length)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                    query_length,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1], )
            if output_attentions:
                all_self_attentions = all_self_attentions + (
                    layer_outputs[1], )
                all_cross_attentions = all_cross_attentions + (
                    layer_outputs[2], )

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states, )

        if not return_dict:
            return tuple(v for v in [
                hidden_states,
                next_decoder_cache,
                all_hidden_states,
                all_self_attentions,
                all_cross_attentions,
            ] if v is not None)
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class BertModel(BertPreTrainedModel):
    """The model can behave as an encoder (with only self-attention) as well
    as a decoder, in which case a layer of cross-attention is added between
    the self-attention layers, following the architecture described in
    `Attention is all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish
    Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
    Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the
    :obj:`is_decoder` argument of the configuration set to :obj:`True`. To be
    used in a Seq2Seq model, the model needs to be initialized with both the
    :obj:`is_decoder` argument and :obj:`add_cross_attention` set to
    :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an input
    to the forward pass.
    """

    def __init__(self, config, add_pooling_layer=False):
        super().__init__(config)
        self.config = config

        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config) if add_pooling_layer else None

        self.init_weights()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """Prunes heads of the model.

        heads_to_prune: dict of {layer_num: list of heads to prune in this
        layer}. See base class PreTrainedModel.
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def get_extended_attention_mask(
        self,
        attention_mask: Tensor,
        input_shape: Tuple[int],
        device: device,
        is_decoder: bool,
        has_query: bool = False,
    ) -> Tensor:
        """Makes broadcastable attention and causal masks so that future and
        masked tokens are ignored.

        Arguments:
            attention_mask (:obj:`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for
                tokens to ignore.
            input_shape (:obj:`Tuple[int]`):
                The shape of the input to the model.
            device (:obj:`torch.device`):
                The device of the input to the model.

        Returns:
            :obj:`torch.Tensor` The extended attention mask, with the same
            dtype as :obj:`attention_mask.dtype`.
        """
        # We can provide a self-attention mask of dimensions
        # [batch_size, from_seq_length, to_seq_length] ourselves in which case
        # we just need to make it broadcastable to all heads.
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # Provided a padding mask of dimensions [batch_size, seq_length]
            # - if the model is a decoder, apply a causal mask in addition to
            #   the padding mask
            # - if the model is an encoder, make the mask broadcastable to
            #   [batch_size, num_heads, seq_length, seq_length]
            if is_decoder:
                batch_size, seq_length = input_shape

                seq_ids = torch.arange(seq_length, device=device)
                causal_mask = (
                    seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <=
                    seq_ids[None, :, None])

                # add a prefix ones mask to the causal mask
                # causal and attention masks must have same type with
                # pytorch version < 1.3
                causal_mask = causal_mask.to(attention_mask.dtype)

                if causal_mask.shape[1] < attention_mask.shape[1]:
                    prefix_seq_len = attention_mask.shape[
                        1] - causal_mask.shape[1]
                    if has_query:  # UniLM style attention mask
                        causal_mask = torch.cat(
                            [
                                torch.zeros(
                                    (batch_size, prefix_seq_len, seq_length),
                                    device=device,
                                    dtype=causal_mask.dtype,
                                ),
                                causal_mask,
                            ],
                            axis=1,
                        )
                    causal_mask = torch.cat(
                        [
                            torch.ones(
                                (batch_size, causal_mask.shape[1],
                                 prefix_seq_len),
                                device=device,
                                dtype=causal_mask.dtype,
                            ),
                            causal_mask,
                        ],
                        axis=-1,
                    )
                extended_attention_mask = (
                    causal_mask[:, None, :, :] *
                    attention_mask[:, None, None, :])
            else:
                extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                'Wrong shape for input_ids (shape {}) or attention_mask (shape {})'
                .format(input_shape, attention_mask.shape))

        # Since attention_mask is 1.0 for positions we want to attend and 0.0
        # for masked positions, this operation will create a tensor which is
        # 0.0 for positions we want to attend and -10000.0 for masked
        # positions. Since we are adding it to the raw scores before the
        # softmax, this is effectively the same as removing these entirely.
        extended_attention_mask = extended_attention_mask.to(
            dtype=self.dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        return extended_attention_mask
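
    # Worked example for get_extended_attention_mask in decoder mode (values
    # are illustrative, not taken from this file): with 2 query tokens and 3
    # text tokens, attention_mask has length 5 while input_shape covers only
    # the 3 text tokens, so prefix_seq_len is 2. With has_query=True the
    # resulting UniLM-style mask lets query positions attend only to query
    # positions, while each text position attends to all query positions plus
    # the text positions up to and including itself. After multiplying in the
    # padding mask and rescaling, allowed positions become 0.0 and blocked
    # positions -10000.0.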

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        head_mask=None,
        query_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        is_decoder=False,
    ):
        r"""
        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
        """
        output_attentions = (
            output_attentions if output_attentions is not None else
            self.config.output_attentions)
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else
            self.config.output_hidden_states)
        return_dict = (
            return_dict
            if return_dict is not None else self.config.use_return_dict)

        # use_cache = use_cache if use_cache is not None else self.config.use_cache

        if input_ids is None:
            assert (
                query_embeds is not None
            ), 'You have to specify query_embeds when input_ids is None'

        # past_key_values_length
        past_key_values_length = (
            past_key_values[0][0].shape[2] -
            self.config.query_length if past_key_values is not None else 0)

        query_length = query_embeds.shape[1] if query_embeds is not None else 0

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            query_embeds=query_embeds,
            past_key_values_length=past_key_values_length,
        )

        input_shape = embedding_output.size()[:-1]
        batch_size, seq_length = input_shape
        device = embedding_output.device

        if attention_mask is None:
            attention_mask = torch.ones(
                ((batch_size, seq_length + past_key_values_length)),
                device=device)

        # We can provide a self-attention mask of dimensions
        # [batch_size, from_seq_length, to_seq_length] ourselves in which case
        # we just need to make it broadcastable to all heads.
        if is_decoder:
            extended_attention_mask = self.get_extended_attention_mask(
                attention_mask,
                input_ids.shape,
                device,
                is_decoder,
                has_query=(query_embeds is not None),
            )
        else:
            extended_attention_mask = self.get_extended_attention_mask(
                attention_mask, input_shape, device, is_decoder)

        # If a 2D or 3D attention mask is provided for the cross-attention we
        # need to make it broadcastable to
        # [batch_size, num_heads, seq_length, seq_length]
        if encoder_hidden_states is not None:
            if type(encoder_hidden_states) == list:
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[
                    0].size()
            else:
                (
                    encoder_batch_size,
                    encoder_sequence_length,
                    _,
                ) = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size,
                                    encoder_sequence_length)

            if type(encoder_attention_mask) == list:
                encoder_extended_attention_mask = [
                    self.invert_attention_mask(mask)
                    for mask in encoder_attention_mask
                ]
            elif encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(
                    encoder_hidden_shape, device=device)
                encoder_extended_attention_mask = self.invert_attention_mask(
                    encoder_attention_mask)
            else:
                encoder_extended_attention_mask = self.invert_attention_mask(
                    encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape
        # [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask,
                                       self.config.num_hidden_layers)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            query_length=query_length,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = (
            self.pooler(sequence_output) if self.pooler is not None else None)

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )
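
    # Call sketch for BertModel.forward in the image-grounded setting (shapes
    # are illustrative; 32 query tokens and 257 image patch tokens are typical
    # BLIP-2-style numbers, not values defined in this file): query_embeds of
    # shape (B, 32, H) together with encoder_hidden_states of shape
    # (B, 257, encoder_width) yield a last_hidden_state of shape (B, 32, H);
    # adding text input_ids of length L grows the output length to 32 + L.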


class BertLMHeadModel(BertPreTrainedModel):

    _keys_to_ignore_on_load_unexpected = [r'pooler']
    _keys_to_ignore_on_load_missing = [
        r'position_ids', r'predictions.decoder.bias'
    ]

    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config)

        self.init_weights()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        head_mask=None,
        query_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        labels=None,
        past_key_values=None,
        use_cache=True,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        return_logits=False,
        is_decoder=True,
        reduction='mean',
    ):
        r"""
        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring). Tokens with indices set to ``-100``
            are ignored (masked); the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4
            tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).

        Returns:

        Example::

            >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
            >>> import torch

            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
            >>> config = BertConfig.from_pretrained("bert-base-cased")
            >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)

            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
            >>> outputs = model(**inputs)

            >>> prediction_logits = outputs.logits
        """
        return_dict = (
            return_dict
            if return_dict is not None else self.config.use_return_dict)
        if labels is not None:
            use_cache = False
        if past_key_values is not None:
            query_embeds = None

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            query_embeds=query_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            is_decoder=is_decoder,
        )

        sequence_output = outputs[0]
        if query_embeds is not None:
            sequence_output = outputs[0][:, query_embeds.shape[1]:, :]

        prediction_scores = self.cls(sequence_output)

        if return_logits:
            return prediction_scores[:, :-1, :].contiguous()

        lm_loss = None
        if labels is not None:
            # we are doing next-token prediction;
            # shift prediction scores and input ids by one
            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
            labels = labels[:, 1:].contiguous()
            loss_fct = CrossEntropyLoss(
                reduction=reduction, label_smoothing=0.1)
            lm_loss = loss_fct(
                shifted_prediction_scores.view(-1, self.config.vocab_size),
                labels.view(-1),
            )
            if reduction == 'none':
                lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)

        if not return_dict:
            output = (prediction_scores, ) + outputs[2:]
            return ((lm_loss, ) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )
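
    # Loss alignment sketch (illustrative, not part of the original file): for
    # labels [t1, t2, t3, t4], the logits at positions 0..2
    # (prediction_scores[:, :-1]) are compared against labels [t2, t3, t4]
    # (labels[:, 1:]), so each position is trained to predict its next token,
    # with label smoothing of 0.1. With reduction='none' the per-token losses
    # are summed per sample instead of being averaged across the batch.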

    def prepare_inputs_for_generation(self,
                                      input_ids,
                                      query_embeds,
                                      past=None,
                                      attention_mask=None,
                                      **model_kwargs):
        # if the model is used as a decoder in an encoder-decoder model, the
        # decoder attention mask is created on the fly
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_ids.shape)
        query_mask = input_ids.new_ones(query_embeds.shape[:-1])
        attention_mask = torch.cat([query_mask, attention_mask], dim=-1)

        # cut decoder_input_ids if past is used
        if past is not None:
            input_ids = input_ids[:, -1:]

        return {
            'input_ids': input_ids,
            'query_embeds': query_embeds,
            'attention_mask': attention_mask,
            'past_key_values': past,
            'encoder_hidden_states':
            model_kwargs.get('encoder_hidden_states', None),
            'encoder_attention_mask':
            model_kwargs.get('encoder_attention_mask', None),
            'is_decoder': True,
        }

    def _reorder_cache(self, past, beam_idx):
        reordered_past = ()
        for layer_past in past:
            reordered_past += (tuple(
                past_state.index_select(0, beam_idx)
                for past_state in layer_past), )
        return reordered_past


@MODELS.register_module()
class Qformer(BertLMHeadModel):
    """Query Transformer (Q-Former) used in BLIP-2."""

    def __init__(self, model_style: str, vision_model_width: int,
                 add_cross_attention: bool, cross_attention_freq: int,
                 num_query_token: int) -> None:
        config = BertConfig.from_pretrained(model_style)
        config.add_cross_attention = add_cross_attention
        config.encoder_width = vision_model_width
        config.cross_attention_freq = cross_attention_freq
        config.query_length = num_query_token
        super().__init__(config)
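

# Usage sketch (a minimal, hedged example; the concrete values below are
# assumptions in the spirit of BLIP-2-style configs, not defaults defined in
# this file):
#
#     qformer = Qformer(
#         model_style='bert-base-uncased',  # any BertConfig name or local path
#         vision_model_width=1408,          # hidden size of the paired vision encoder
#         add_cross_attention=True,
#         cross_attention_freq=2,           # cross-attention on every 2nd layer
#         num_query_token=32,
#     )
#
# The resulting module is a BertLMHeadModel whose config additionally carries
# `encoder_width`, `cross_attention_freq` and `query_length`; learned query
# tokens of shape (B, 32, hidden_size) are then passed to `forward` via
# `query_embeds`, with image features supplied via `encoder_hidden_states`.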