# coding=utf-8
# Copyright 2022 Google AI and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch OWL-ViT model."""

from dataclasses import dataclass
from functools import lru_cache
from typing import Any, Dict, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import Tensor, nn

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_vision_available,
    logging,
    replace_return_docstrings,
)
from .configuration_owlvit import OwlViTConfig, OwlViTTextConfig, OwlViTVisionConfig


if is_vision_available():
    from transformers.image_transforms import center_to_corners_format


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "google/owlvit-base-patch32"

# See all OwlViT models at https://huggingface.co/models?filter=owlvit


# Copied from transformers.models.clip.modeling_clip.contrastive_loss with clip->owlvit
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->owlvit
def owlvit_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0
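

# Example (illustrative sketch): `owlvit_loss` symmetrizes the contrastive objective over a
# square matrix of text-image logits, averaging the cross-entropy computed along each axis.
# Matching pairs are assumed to lie on the diagonal:
#
#   >>> similarity = torch.randn(4, 4)  # hypothetical logits for 4 texts x 4 images
#   >>> loss = owlvit_loss(similarity)  # mean of text->image and image->text cross-entropy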


@dataclass
class OwlViTOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size * num_max_text_queries, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`OwlViTTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`OwlViTVisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`OwlViTTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`OwlViTVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: torch.FloatTensor = None
    logits_per_text: torch.FloatTensor = None
    text_embeds: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


# Copied from transformers.models.detr.modeling_detr._upcast
def _upcast(t: Tensor) -> Tensor:
    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
    if t.is_floating_point():
        return t if t.dtype in (torch.float32, torch.float64) else t.float()
    else:
        return t if t.dtype in (torch.int32, torch.int64) else t.int()


# Copied from transformers.models.detr.modeling_detr.box_area
def box_area(boxes: Tensor) -> Tensor:
    """
    Computes the area of a set of bounding boxes, which are specified by their (x1, y1, x2, y2) coordinates.

    Args:
        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
            Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
            < x2` and `0 <= y1 < y2`.

    Returns:
        `torch.FloatTensor`: a tensor containing the area for each box.
    """
    boxes = _upcast(boxes)
    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])


# Copied from transformers.models.detr.modeling_detr.box_iou
def box_iou(boxes1, boxes2):
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    width_height = (right_bottom - left_top).clamp(min=0)  # [N,M,2]
    inter = width_height[:, :, 0] * width_height[:, :, 1]  # [N,M]

    union = area1[:, None] + area2 - inter

    iou = inter / union
    return iou, union


# Copied from transformers.models.detr.modeling_detr.generalized_box_iou
def generalized_box_iou(boxes1, boxes2):
    """
    Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.

    Returns:
        `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
    """
    # Degenerate boxes give inf / nan results, so do an early check
    if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
        raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
    if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
        raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
    iou, union = box_iou(boxes1, boxes2)

    top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])

    width_height = (bottom_right - top_left).clamp(min=0)  # [N,M,2]
    area = width_height[:, :, 0] * width_height[:, :, 1]

    return iou - (area - union) / area
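

# Worked example (illustrative): for boxes1 = [[0, 0, 2, 2]] and boxes2 = [[1, 1, 3, 3]],
# the intersection is 1.0 and the union is 4 + 4 - 1 = 7, so IoU = 1/7 ~= 0.143. The smallest
# enclosing box is [0, 0, 3, 3] with area 9, so GIoU = 1/7 - (9 - 7)/9 ~= -0.079:
#
#   >>> b1 = torch.tensor([[0.0, 0.0, 2.0, 2.0]])
#   >>> b2 = torch.tensor([[1.0, 1.0, 3.0, 3.0]])
#   >>> box_iou(b1, b2)[0]           # tensor([[0.1429]])
#   >>> generalized_box_iou(b1, b2)  # tensor([[-0.0794]])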


@dataclass
class OwlViTObjectDetectionOutput(ModelOutput):
    """
    Output type of [`OwlViTForObjectDetection`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
            Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
            scale-invariant IoU loss.
        loss_dict (`Dict`, *optional*):
            A dictionary containing the individual losses. Useful for logging.
        logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
            Classification logits (including no-object) for all queries.
        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
            Normalized box coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
            possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to retrieve the
            unnormalized bounding boxes.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, num_max_text_queries, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`OwlViTTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim)`):
            Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes
            image embeddings for each patch.
        class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
            Class embeddings of all image patches. OWL-ViT represents images as a set of image patches where the total
            number of patches is (image_size / patch_size)**2.
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`OwlViTTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`OwlViTVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    loss_dict: Optional[Dict] = None
    logits: torch.FloatTensor = None
    pred_boxes: torch.FloatTensor = None
    text_embeds: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    class_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


@dataclass
class OwlViTImageGuidedObjectDetectionOutput(ModelOutput):
    """
    Output type of [`OwlViTForObjectDetection.image_guided_detection`].

    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
            Classification logits (including no-object) for all queries.
        target_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
            Normalized box coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual target image in the batch
            (disregarding possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to
            retrieve the unnormalized bounding boxes.
        query_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
            Normalized box coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual query image in the batch
            (disregarding possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to
            retrieve the unnormalized bounding boxes.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim)`):
            Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes
            image embeddings for each patch.
        query_image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim)`):
            Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes
            image embeddings for each patch.
        class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
            Class embeddings of all image patches. OWL-ViT represents images as a set of image patches where the total
            number of patches is (image_size / patch_size)**2.
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`OwlViTTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`OwlViTVisionModel`].
    """

    logits: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    query_image_embeds: torch.FloatTensor = None
    target_pred_boxes: torch.FloatTensor = None
    query_pred_boxes: torch.FloatTensor = None
    class_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class OwlViTVisionEmbeddings(nn.Module):
    def __init__(self, config: OwlViTVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.class_embedding = nn.Parameter(torch.randn(config.hidden_size))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=config.patch_size,
            stride=config.patch_size,
            bias=False,
        )

        self.num_patches = (config.image_size // config.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch_size = pixel_values.shape[0]
        patch_embeds = self.patch_embedding(pixel_values)  # shape = [batch_size, embed_dim, grid_size, grid_size]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        embeddings = embeddings + self.position_embedding(self.position_ids)

        return embeddings
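

# Shape walkthrough (illustrative, assuming the "google/owlvit-base-patch32" defaults of
# image_size=768, patch_size=32, hidden_size=768): the conv produces a 24x24 grid of patch
# embeddings, flattened to 576 tokens and prepended with one [CLS] token:
#
#   >>> pixel_values = torch.randn(1, 3, 768, 768)
#   >>> OwlViTVisionEmbeddings(OwlViTVisionConfig())(pixel_values).shape
#   torch.Size([1, 577, 768])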


class OwlViTTextEmbeddings(nn.Module):
    def __init__(self, config: OwlViTTextConfig):
        super().__init__()
        self.token_embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


class OwlViTAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, embed_dim = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scale
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # apply the causal_attention_mask first
        if causal_attention_mask is not None:
            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
                    f" {causal_attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if output_attentions:
            # this operation is a bit awkward, but it's required to
            # make sure that attn_weights keeps its gradient.
            # In order to do so, attn_weights have to be reshaped
            # twice and have to be reused in the following
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        # For int8 compatibility, sometimes the `attn_probs` are in `fp32`
        attn_probs = attn_probs.to(value_states.dtype)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped
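

# Note (illustrative): when no padding or causal masks and no attention-weight outputs are
# needed, the block above computes standard scaled dot-product attention; a minimal sketch of
# the same math with PyTorch 2.x's fused kernel:
#
#   >>> import torch.nn.functional as F
#   >>> q = k = v = torch.randn(2, 8, 16, 64)  # [batch, num_heads, seq_len, head_dim]
#   >>> out = F.scaled_dot_product_attention(q, k, v)  # softmax(q @ k^T / sqrt(head_dim)) @ v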


# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->OwlViT
class OwlViTMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoderLayer with AltCLIP->OwlViT
class OwlViTEncoderLayer(nn.Module):
    def __init__(self, config: OwlViTConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = OwlViTAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = OwlViTMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class OwlViTPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = OwlViTConfig
    base_model_prefix = "owlvit"
    supports_gradient_checkpointing = True
    _no_split_modules = ["OwlViTEncoderLayer"]

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, OwlViTTextEmbeddings):
            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, OwlViTVisionEmbeddings):
            factor = self.config.initializer_factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, OwlViTAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, OwlViTMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, OwlViTModel):
            nn.init.normal_(
                module.text_projection.weight,
                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
            )
            nn.init.normal_(
                module.visual_projection.weight,
                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
            )

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


OWLVIT_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
    and behavior.

    Parameters:
        config ([`OwlViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

OWLVIT_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, num_max_text_queries, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

OWLVIT_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

OWLVIT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

OWLVIT_OBJECT_DETECTION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`, *optional*):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids).
        attention_mask (`torch.Tensor` of shape `(batch_size, num_max_text_queries, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the last hidden state. See `text_model_last_hidden_state` and
            `vision_model_last_hidden_state` under returned tensors for more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

OWLVIT_IMAGE_GUIDED_OBJECT_DETECTION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.
        query_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values of query image(s) to be detected. Pass in one query image per target image.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


class OwlViTEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`OwlViTEncoderLayer`].

    Args:
        config: OwlViTConfig
    """

    def __init__(self, config: OwlViTConfig):
        super().__init__()
        # Keep a reference to the config: `forward` falls back to it when the output flags are None
        self.config = config
        self.layers = nn.ModuleList([OwlViTEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class OwlViTTextTransformer(nn.Module):
    def __init__(self, config: OwlViTTextConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = OwlViTTextEmbeddings(config)
        self.encoder = OwlViTEncoder(config)
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    @add_start_docstrings_to_model_forward(OWLVIT_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=OwlViTTextConfig)
    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])
        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        # num_samples, seq_len = input_shape where num_samples = batch_size * num_max_text_queries
        # OWLVIT's text model uses causal mask, prepare it here.
        # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
        causal_attention_mask = _create_4d_causal_attention_mask(
            input_shape, hidden_states.dtype, device=hidden_states.device
        )
        # expand attention_mask
        if attention_mask is not None:
            # [num_samples, seq_len] -> [num_samples, 1, tgt_seq_len, src_seq_len]
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        # take features from the end of tokens embedding (end of token is the highest number in each sequence)
        # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
        pooled_output = last_hidden_state[
            torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
            input_ids.to(torch.int).argmax(dim=-1).to(last_hidden_state.device),
        ]

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
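

# Note (illustrative): the pooling above mirrors CLIP's end-of-text pooling. With the standard
# CLIP BPE vocabulary the end-of-text token has the highest id (49407), so `argmax` over the
# token ids finds its position, assuming padding ids are smaller:
#
#   >>> input_ids = torch.tensor([[49406, 320, 1125, 49407, 0, 0]])  # hypothetical "<start> a photo <end> <pad> <pad>"
#   >>> input_ids.to(torch.int).argmax(dim=-1)  # tensor([3]): index of the <end> token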


class OwlViTTextModel(OwlViTPreTrainedModel):
    config_class = OwlViTTextConfig

    def __init__(self, config: OwlViTTextConfig):
        super().__init__(config)
        self.text_model = OwlViTTextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    @add_start_docstrings_to_model_forward(OWLVIT_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=OwlViTTextConfig)
    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        Examples:
        ```python
        >>> from transformers import AutoProcessor, OwlViTTextModel

        >>> model = OwlViTTextModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> inputs = processor(
        ...     text=[["a photo of a cat", "a photo of a dog"], ["a photo of an astronaut"]], return_tensors="pt"
        ... )
        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        # Get embeddings for all text queries in all batch samples
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class OwlViTVisionTransformer(nn.Module):
    def __init__(self, config: OwlViTVisionConfig):
        super().__init__()
        self.config = config

        self.embeddings = OwlViTVisionEmbeddings(config)
        self.pre_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.encoder = OwlViTEncoder(config)
        self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    @add_start_docstrings_to_model_forward(OWLVIT_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=OwlViTVisionConfig)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Cast the input to the expected `dtype`
        expected_input_dtype = self.embeddings.patch_embedding.weight.dtype
        pixel_values = pixel_values.to(expected_input_dtype)

        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layernorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]

        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class OwlViTVisionModel(OwlViTPreTrainedModel):
    config_class = OwlViTVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: OwlViTVisionConfig):
        super().__init__(config)
        self.vision_model = OwlViTVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @add_start_docstrings_to_model_forward(OWLVIT_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=OwlViTVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, OwlViTVisionModel

        >>> model = OwlViTVisionModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


@add_start_docstrings(OWLVIT_START_DOCSTRING)
class OwlViTModel(OwlViTPreTrainedModel):
    config_class = OwlViTConfig

    def __init__(self, config: OwlViTConfig):
        super().__init__(config)

        if not isinstance(config.text_config, OwlViTTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type OwlViTTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, OwlViTVisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type OwlViTVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = OwlViTTextTransformer(text_config)
        self.vision_model = OwlViTVisionTransformer(vision_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
        self.logit_scale = nn.Parameter(torch.tensor(config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(OWLVIT_TEXT_INPUTS_DOCSTRING)
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`OwlViTTextModel`].

        Examples:
        ```python
        >>> from transformers import AutoProcessor, OwlViTModel

        >>> model = OwlViTModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> inputs = processor(
        ...     text=[["a photo of a cat", "a photo of a dog"], ["a photo of an astronaut"]], return_tensors="pt"
        ... )
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        # Use OWL-ViT model's config for some fields (if specified) instead of those of vision & text components.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Get embeddings for all text queries in all batch samples
        text_output = self.text_model(input_ids=input_ids, attention_mask=attention_mask, return_dict=return_dict)
        pooled_output = text_output[1]
        text_features = self.text_projection(pooled_output)

        return text_features

    @add_start_docstrings_to_model_forward(OWLVIT_VISION_INPUTS_DOCSTRING)
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`OwlViTVisionModel`].

        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, OwlViTModel

        >>> model = OwlViTModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")
        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # Use OWL-ViT model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = vision_outputs[1]
        image_features = self.visual_projection(pooled_output)

        return image_features

    @add_start_docstrings_to_model_forward(OWLVIT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=OwlViTOutput, config_class=OwlViTConfig)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_base_image_embeds: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, OwlViTOutput]:
        r"""
        Returns:

        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, OwlViTModel

        >>> model = OwlViTModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(text=[["a photo of a cat", "a photo of a dog"]], images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        # Use OWL-ViT model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Get embeddings for all text queries in all batch samples
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)
        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)

        # normalized features
        image_embeds = image_embeds / torch.linalg.norm(image_embeds, ord=2, dim=-1, keepdim=True)
        text_embeds_norm = text_embeds / torch.linalg.norm(text_embeds, ord=2, dim=-1, keepdim=True)

        # cosine similarity as logits and set it on the correct device
        logit_scale = self.logit_scale.exp().to(image_embeds.device)

        logits_per_text = torch.matmul(text_embeds_norm, image_embeds.t()) * logit_scale
        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            loss = owlvit_loss(logits_per_text)

        text_embeds = text_embeds_norm

        if not return_dict:
            output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
            return ((loss,) + output) if loss is not None else output

        return OwlViTOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


class OwlViTBoxPredictionHead(nn.Module):
    def __init__(self, config: OwlViTConfig, out_dim: int = 4):
        super().__init__()

        width = config.vision_config.hidden_size
        self.dense0 = nn.Linear(width, width)
        self.dense1 = nn.Linear(width, width)
        self.gelu = nn.GELU()
        self.dense2 = nn.Linear(width, out_dim)

    def forward(self, image_features: torch.Tensor) -> torch.FloatTensor:
        output = self.dense0(image_features)
        output = self.gelu(output)
        output = self.dense1(output)
        output = self.gelu(output)
        output = self.dense2(output)
        return output


class OwlViTClassPredictionHead(nn.Module):
    def __init__(self, config: OwlViTConfig):
        super().__init__()

        out_dim = config.text_config.hidden_size
        self.query_dim = config.vision_config.hidden_size

        self.dense0 = nn.Linear(self.query_dim, out_dim)
        self.logit_shift = nn.Linear(self.query_dim, 1)
        self.logit_scale = nn.Linear(self.query_dim, 1)
        self.elu = nn.ELU()

    def forward(
        self,
        image_embeds: torch.FloatTensor,
        query_embeds: Optional[torch.FloatTensor],
        query_mask: Optional[torch.Tensor],
    ) -> Tuple[torch.FloatTensor]:
        image_class_embeds = self.dense0(image_embeds)
        if query_embeds is None:
            device = image_class_embeds.device
            batch_size, num_patches = image_class_embeds.shape[:2]
            pred_logits = torch.zeros((batch_size, num_patches, self.query_dim)).to(device)
            return (pred_logits, image_class_embeds)

        # Normalize image and text features
        image_class_embeds = image_class_embeds / (torch.linalg.norm(image_class_embeds, dim=-1, keepdim=True) + 1e-6)
        query_embeds = query_embeds / (torch.linalg.norm(query_embeds, dim=-1, keepdim=True) + 1e-6)

        # Get class predictions
        pred_logits = torch.einsum("...pd,...qd->...pq", image_class_embeds, query_embeds)

        # Apply a learnable shift and scale to logits
        logit_shift = self.logit_shift(image_embeds)
        logit_scale = self.logit_scale(image_embeds)
        logit_scale = self.elu(logit_scale) + 1
        pred_logits = (pred_logits + logit_shift) * logit_scale

        if query_mask is not None:
            if query_mask.ndim > 1:
                query_mask = torch.unsqueeze(query_mask, dim=-2)

            pred_logits = torch.where(query_mask == 0, torch.finfo(pred_logits.dtype).min, pred_logits)
            pred_logits = pred_logits.to(torch.float32)

        return (pred_logits, image_class_embeds)
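

# Shape walkthrough (illustrative, using the base-patch32 dimensions where the vision tower
# width is 768 and the text tower width is 512): `image_embeds` [batch, 576, 768] is projected
# to per-patch class embeddings [batch, 576, 512]; with `query_embeds` [batch, num_queries, 512],
# the einsum yields `pred_logits` of shape [batch, 576, num_queries], one logit per
# (patch, query) pair, before the learned per-patch shift and scale are applied.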


class OwlViTForObjectDetection(OwlViTPreTrainedModel):
    config_class = OwlViTConfig

    def __init__(self, config: OwlViTConfig):
        super().__init__(config)

        self.owlvit = OwlViTModel(config)
        self.class_head = OwlViTClassPredictionHead(config)
        self.box_head = OwlViTBoxPredictionHead(config)

        self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps)
        self.sigmoid = nn.Sigmoid()

        self.sqrt_num_patches = config.vision_config.image_size // config.vision_config.patch_size
        self.box_bias = self.compute_box_bias(self.sqrt_num_patches)

    @staticmethod
    def normalize_grid_corner_coordinates(num_patches: int) -> torch.Tensor:
        # Create grid coordinates using torch
        x_coordinates = torch.arange(1, num_patches + 1, dtype=torch.float32)
        y_coordinates = torch.arange(1, num_patches + 1, dtype=torch.float32)
        xx, yy = torch.meshgrid(x_coordinates, y_coordinates, indexing="xy")

        # Stack the coordinates and divide by num_patches
        box_coordinates = torch.stack((xx, yy), dim=-1)
        box_coordinates /= num_patches

        # Flatten (h, w, 2) -> (h*w, 2)
        box_coordinates = box_coordinates.view(-1, 2)

        return box_coordinates

    @lru_cache(maxsize=2)
    def compute_box_bias(self, num_patches: int, feature_map: Optional[torch.FloatTensor] = None) -> torch.Tensor:
        if feature_map is not None:
            raise ValueError("feature_map has been deprecated as an input. Please pass in num_patches instead")
        # The box center is biased to its position on the feature grid
        box_coordinates = self.normalize_grid_corner_coordinates(num_patches)
        box_coordinates = torch.clip(box_coordinates, 0.0, 1.0)

        # Unnormalize xy
        box_coord_bias = torch.log(box_coordinates + 1e-4) - torch.log1p(-box_coordinates + 1e-4)

        # The box size is biased to the patch size
        box_size = torch.full_like(box_coord_bias, 1.0 / num_patches)
        box_size_bias = torch.log(box_size + 1e-4) - torch.log1p(-box_size + 1e-4)

        # Compute box bias
        box_bias = torch.cat([box_coord_bias, box_size_bias], dim=-1)
        return box_bias
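
    # Worked example (illustrative): the bias is the inverse sigmoid (logit) of each patch's
    # normalized grid position, so an untrained box head (raw output near 0) already predicts a
    # box centered on its own patch with width/height of one patch. For a 24x24 grid, the first
    # patch center sits at 1/24 ~= 0.0417, giving a center bias of roughly
    # log(0.0417) - log(1 - 0.0417) ~= -3.13, and sigmoid(0 + (-3.13)) ~= 0.0417 recovers it.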

    def box_predictor(
        self,
        image_feats: torch.FloatTensor,
        feature_map: torch.FloatTensor,
    ) -> torch.FloatTensor:
        """
        Args:
            image_feats:
                Features extracted from the image, returned by the `image_text_embedder` method.
            feature_map:
                A spatial re-arrangement of image_features, also returned by the `image_text_embedder` method.

        Returns:
            pred_boxes:
                List of predicted boxes (cxcywh normalized to 0, 1) nested within a dictionary.
        """
        # Bounding box detection head [batch_size, num_boxes, 4].
        pred_boxes = self.box_head(image_feats)

        # Compute the location of each token on the grid and use it to compute a bias for the bbox prediction
        box_bias = self.box_bias.to(feature_map.device)
        pred_boxes += box_bias
        pred_boxes = self.sigmoid(pred_boxes)
        return pred_boxes

    def class_predictor(
        self,
        image_feats: torch.FloatTensor,
        query_embeds: Optional[torch.FloatTensor] = None,
        query_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            image_feats:
                Features extracted from the `image_text_embedder`.
            query_embeds:
                Text query embeddings.
            query_mask:
                Must be provided with `query_embeds`. A mask indicating which query embeddings are valid.
        """
        (pred_logits, image_class_embeds) = self.class_head(image_feats, query_embeds, query_mask)

        return (pred_logits, image_class_embeds)

    def image_text_embedder(
        self,
        input_ids: torch.Tensor,
        pixel_values: torch.FloatTensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Tuple[torch.FloatTensor]:
        # Encode text and image
        outputs = self.owlvit(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        # Get image embeddings
        last_hidden_state = outputs.vision_model_output[0]
        image_embeds = self.owlvit.vision_model.post_layernorm(last_hidden_state)

        # Resize class token
        class_token_out = torch.broadcast_to(image_embeds[:, :1, :], image_embeds[:, :-1].shape)

        # Merge image embedding with class tokens
        image_embeds = image_embeds[:, 1:, :] * class_token_out
        image_embeds = self.layer_norm(image_embeds)

        # Resize to [batch_size, num_patches, num_patches, hidden_size]
        new_size = (
            image_embeds.shape[0],
            self.sqrt_num_patches,
            self.sqrt_num_patches,
            image_embeds.shape[-1],
        )
        image_embeds = image_embeds.reshape(new_size)
        text_embeds = outputs[-4]

        return (text_embeds, image_embeds, outputs)
def image_embedder( | |
self, | |
pixel_values: torch.FloatTensor, | |
output_attentions: Optional[bool] = None, | |
output_hidden_states: Optional[bool] = None, | |
) -> Tuple[torch.FloatTensor]: | |
# Get OwlViTModel vision embeddings (same as CLIP) | |
vision_outputs = self.owlvit.vision_model(pixel_values=pixel_values, return_dict=True) | |
# Apply post_layernorm to last_hidden_state, return non-projected output | |
last_hidden_state = vision_outputs[0] | |
image_embeds = self.owlvit.vision_model.post_layernorm(last_hidden_state) | |
        # Broadcast the class token embedding to every patch position
class_token_out = torch.broadcast_to(image_embeds[:, :1, :], image_embeds[:, :-1].shape) | |
        # Merge patch embeddings with the broadcast class token (elementwise product)
image_embeds = image_embeds[:, 1:, :] * class_token_out | |
image_embeds = self.layer_norm(image_embeds) | |
        # Reshape to [batch_size, num_patches, num_patches, hidden_size]
new_size = ( | |
image_embeds.shape[0], | |
self.sqrt_num_patches, | |
self.sqrt_num_patches, | |
image_embeds.shape[-1], | |
) | |
image_embeds = image_embeds.reshape(new_size) | |
return (image_embeds, vision_outputs) | |
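
    # Usage sketch (hypothetical tensors, same base-patch32 assumptions):
    #   pixel_values = torch.randn(1, 3, 768, 768)
    #   feature_map, vision_outputs = self.image_embedder(pixel_values=pixel_values)
    #   feature_map.shape  # -> torch.Size([1, 24, 24, 768])
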
def embed_image_query( | |
self, query_image_features: torch.FloatTensor, query_feature_map: torch.FloatTensor | |
) -> torch.FloatTensor: | |
_, class_embeds = self.class_predictor(query_image_features) | |
pred_boxes = self.box_predictor(query_image_features, query_feature_map) | |
pred_boxes_as_corners = center_to_corners_format(pred_boxes) | |
# Loop over query images | |
best_class_embeds = [] | |
best_box_indices = [] | |
pred_boxes_device = pred_boxes_as_corners.device | |
for i in range(query_image_features.shape[0]): | |
each_query_box = torch.tensor([[0, 0, 1, 1]], device=pred_boxes_device) | |
each_query_pred_boxes = pred_boxes_as_corners[i] | |
ious, _ = box_iou(each_query_box, each_query_pred_boxes) | |
# If there are no overlapping boxes, fall back to generalized IoU | |
if torch.all(ious[0] == 0.0): | |
ious = generalized_box_iou(each_query_box, each_query_pred_boxes) | |
# Use an adaptive threshold to include all boxes within 80% of the best IoU | |
iou_threshold = torch.max(ious) * 0.8 | |
selected_inds = (ious[0] >= iou_threshold).nonzero() | |
if selected_inds.numel(): | |
selected_embeddings = class_embeds[i][selected_inds.squeeze(1)] | |
                mean_embeds = torch.mean(class_embeds[i], dim=0)
mean_sim = torch.einsum("d,id->i", mean_embeds, selected_embeddings) | |
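                # Keep the candidate whose embedding is least similar to the
                # mean of this image's class embeddings (argmin over dot products)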
best_box_ind = selected_inds[torch.argmin(mean_sim)] | |
best_class_embeds.append(class_embeds[i][best_box_ind]) | |
best_box_indices.append(best_box_ind) | |
if best_class_embeds: | |
query_embeds = torch.stack(best_class_embeds) | |
box_indices = torch.stack(best_box_indices) | |
else: | |
query_embeds, box_indices = None, None | |
return query_embeds, box_indices, pred_boxes | |
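
    # Selection sketch: each query image's predicted boxes are scored by IoU
    # against the full-image box [0, 0, 1, 1]; e.g. if the best IoU is 0.9,
    # every box with IoU >= 0.72 (80% of the best) is a candidate, and the
    # candidate least similar to the mean class embedding becomes the query.
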
def image_guided_detection( | |
self, | |
pixel_values: torch.FloatTensor, | |
query_pixel_values: Optional[torch.FloatTensor] = None, | |
output_attentions: Optional[bool] = None, | |
output_hidden_states: Optional[bool] = None, | |
return_dict: Optional[bool] = None, | |
) -> OwlViTImageGuidedObjectDetectionOutput: | |
r""" | |
        Returns:

        Examples:
```python | |
>>> import requests | |
>>> from PIL import Image | |
>>> import torch | |
>>> from transformers import AutoProcessor, OwlViTForObjectDetection | |
>>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch16") | |
>>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch16") | |
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" | |
>>> image = Image.open(requests.get(url, stream=True).raw) | |
>>> query_url = "http://images.cocodataset.org/val2017/000000001675.jpg" | |
>>> query_image = Image.open(requests.get(query_url, stream=True).raw) | |
>>> inputs = processor(images=image, query_images=query_image, return_tensors="pt") | |
>>> with torch.no_grad(): | |
... outputs = model.image_guided_detection(**inputs) | |
>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] | |
>>> target_sizes = torch.Tensor([image.size[::-1]]) | |
>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) | |
>>> results = processor.post_process_image_guided_detection( | |
... outputs=outputs, threshold=0.6, nms_threshold=0.3, target_sizes=target_sizes | |
... ) | |
>>> i = 0 # Retrieve predictions for the first image | |
>>> boxes, scores = results[i]["boxes"], results[i]["scores"] | |
>>> for box, score in zip(boxes, scores): | |
... box = [round(i, 2) for i in box.tolist()] | |
... print(f"Detected similar object with confidence {round(score.item(), 3)} at location {box}") | |
Detected similar object with confidence 0.856 at location [10.94, 50.4, 315.8, 471.39] | |
Detected similar object with confidence 1.0 at location [334.84, 25.33, 636.16, 374.71] | |
```""" | |
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions | |
output_hidden_states = ( | |
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states | |
) | |
return_dict = return_dict if return_dict is not None else self.config.return_dict | |
# Compute feature maps for the input and query images | |
query_feature_map = self.image_embedder(pixel_values=query_pixel_values)[0] | |
feature_map, vision_outputs = self.image_embedder( | |
pixel_values=pixel_values, | |
output_attentions=output_attentions, | |
output_hidden_states=output_hidden_states, | |
) | |
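        # Flatten both [batch_size, P, P, dim] feature maps into
        # [batch_size, P*P, dim] sequences of patch tokens for the heads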
batch_size, num_patches, num_patches, hidden_dim = feature_map.shape | |
image_feats = torch.reshape(feature_map, (batch_size, num_patches * num_patches, hidden_dim)) | |
batch_size, num_patches, num_patches, hidden_dim = query_feature_map.shape | |
query_image_feats = torch.reshape(query_feature_map, (batch_size, num_patches * num_patches, hidden_dim)) | |
# Get top class embedding and best box index for each query image in batch | |
query_embeds, best_box_indices, query_pred_boxes = self.embed_image_query(query_image_feats, query_feature_map) | |
# Predict object classes [batch_size, num_patches, num_queries+1] | |
(pred_logits, class_embeds) = self.class_predictor(image_feats=image_feats, query_embeds=query_embeds) | |
# Predict object boxes | |
target_pred_boxes = self.box_predictor(image_feats, feature_map) | |
if not return_dict: | |
output = ( | |
feature_map, | |
query_feature_map, | |
target_pred_boxes, | |
query_pred_boxes, | |
pred_logits, | |
class_embeds, | |
vision_outputs.to_tuple(), | |
) | |
output = tuple(x for x in output if x is not None) | |
return output | |
return OwlViTImageGuidedObjectDetectionOutput( | |
image_embeds=feature_map, | |
query_image_embeds=query_feature_map, | |
target_pred_boxes=target_pred_boxes, | |
query_pred_boxes=query_pred_boxes, | |
logits=pred_logits, | |
class_embeds=class_embeds, | |
text_model_output=None, | |
vision_model_output=vision_outputs, | |
        )

def forward( | |
self, | |
input_ids: torch.Tensor, | |
pixel_values: torch.FloatTensor, | |
attention_mask: Optional[torch.Tensor] = None, | |
output_attentions: Optional[bool] = None, | |
output_hidden_states: Optional[bool] = None, | |
return_dict: Optional[bool] = None, | |
) -> OwlViTObjectDetectionOutput: | |
r""" | |
        Returns:

        Examples:
```python | |
>>> import requests | |
>>> from PIL import Image | |
>>> import torch | |
>>> from transformers import AutoProcessor, OwlViTForObjectDetection | |
>>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32") | |
>>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32") | |
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" | |
>>> image = Image.open(requests.get(url, stream=True).raw) | |
>>> texts = [["a photo of a cat", "a photo of a dog"]] | |
>>> inputs = processor(text=texts, images=image, return_tensors="pt") | |
>>> outputs = model(**inputs) | |
>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] | |
>>> target_sizes = torch.Tensor([image.size[::-1]]) | |
>>> # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores | |
>>> results = processor.post_process_object_detection( | |
... outputs=outputs, threshold=0.1, target_sizes=target_sizes | |
... ) | |
>>> i = 0 # Retrieve predictions for the first image for the corresponding text queries | |
>>> text = texts[i] | |
>>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"] | |
>>> for box, score, label in zip(boxes, scores, labels): | |
... box = [round(i, 2) for i in box.tolist()] | |
... print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") | |
Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29] | |
Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17] | |
```""" | |
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions | |
output_hidden_states = ( | |
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states | |
) | |
return_dict = return_dict if return_dict is not None else self.config.return_dict | |
# Embed images and text queries | |
query_embeds, feature_map, outputs = self.image_text_embedder( | |
input_ids=input_ids, | |
pixel_values=pixel_values, | |
attention_mask=attention_mask, | |
output_attentions=output_attentions, | |
output_hidden_states=output_hidden_states, | |
) | |
# Text and vision model outputs | |
text_outputs = outputs.text_model_output | |
vision_outputs = outputs.vision_model_output | |
batch_size, num_patches, num_patches, hidden_dim = feature_map.shape | |
image_feats = torch.reshape(feature_map, (batch_size, num_patches * num_patches, hidden_dim)) | |
# Reshape from [batch_size * max_text_queries, hidden_dim] -> [batch_size, max_text_queries, hidden_dim] | |
max_text_queries = input_ids.shape[0] // batch_size | |
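        # e.g. input_ids of shape [batch_size * 3, seq_len] gives max_text_queries = 3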
query_embeds = query_embeds.reshape(batch_size, max_text_queries, query_embeds.shape[-1]) | |
        # A query whose first token id is 0 is padding; query_mask flags valid queries [batch_size, max_text_queries]
input_ids = input_ids.reshape(batch_size, max_text_queries, input_ids.shape[-1]) | |
query_mask = input_ids[..., 0] > 0 | |
# Predict object classes [batch_size, num_patches, num_queries+1] | |
(pred_logits, class_embeds) = self.class_predictor(image_feats, query_embeds, query_mask) | |
# Predict object boxes | |
pred_boxes = self.box_predictor(image_feats, feature_map) | |
if not return_dict: | |
output = ( | |
pred_logits, | |
pred_boxes, | |
query_embeds, | |
feature_map, | |
class_embeds, | |
text_outputs.to_tuple(), | |
vision_outputs.to_tuple(), | |
) | |
output = tuple(x for x in output if x is not None) | |
return output | |
return OwlViTObjectDetectionOutput( | |
image_embeds=feature_map, | |
text_embeds=query_embeds, | |
pred_boxes=pred_boxes, | |
logits=pred_logits, | |
class_embeds=class_embeds, | |
text_model_output=text_outputs, | |
vision_model_output=vision_outputs, | |
) | |