# coding=utf-8
# Copyright 2024 Intel Labs and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch ZoeDepth model."""

import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...file_utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
)
from ...modeling_outputs import DepthEstimatorOutput
from ...modeling_utils import PreTrainedModel
from ...utils import ModelOutput, logging
from ...utils.backbone_utils import load_backbone
from .configuration_zoedepth import ZoeDepthConfig


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "ZoeDepthConfig"

@dataclass
class ZoeDepthDepthEstimatorOutput(ModelOutput):
    """
    Extension of `DepthEstimatorOutput` to include domain logits (ZoeDepth specific).

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Depth estimation loss.
        predicted_depth (`torch.FloatTensor` of shape `(batch_size, height, width)`):
            Predicted depth for each pixel.
        domain_logits (`torch.FloatTensor` of shape `(batch_size, num_domains)`):
            Logits for each domain (e.g. NYU and KITTI) in case multiple metric heads are used.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    predicted_depth: torch.FloatTensor = None
    domain_logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
class ZoeDepthReassembleStage(nn.Module): | |
""" | |
This class reassembles the hidden states of the backbone into image-like feature representations at various | |
resolutions. | |
This happens in 3 stages: | |
1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to | |
`config.readout_type`. | |
2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`. | |
3. Resizing the spatial dimensions (height, width). | |
Args: | |
config (`[ZoeDepthConfig]`): | |
Model configuration class defining the model architecture. | |
""" | |
def __init__(self, config): | |
super().__init__() | |
self.readout_type = config.readout_type | |
self.layers = nn.ModuleList() | |
for neck_hidden_size, factor in zip(config.neck_hidden_sizes, config.reassemble_factors): | |
self.layers.append(ZoeDepthReassembleLayer(config, channels=neck_hidden_size, factor=factor)) | |
if config.readout_type == "project": | |
self.readout_projects = nn.ModuleList() | |
hidden_size = config.backbone_hidden_size | |
for _ in config.neck_hidden_sizes: | |
self.readout_projects.append( | |
nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act]) | |
) | |
def forward(self, hidden_states: List[torch.Tensor], patch_height, patch_width) -> List[torch.Tensor]: | |
""" | |
Args: | |
hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`): | |
List of hidden states from the backbone. | |
""" | |
batch_size = hidden_states[0].shape[0] | |
# stack along batch dimension | |
# shape (batch_size*num_stages, sequence_length + 1, hidden_size) | |
hidden_states = torch.cat(hidden_states, dim=0) | |
cls_token, hidden_states = hidden_states[:, 0], hidden_states[:, 1:] | |
# reshape hidden_states to (batch_size*num_stages, num_channels, height, width) | |
total_batch_size, sequence_length, num_channels = hidden_states.shape | |
hidden_states = hidden_states.reshape(total_batch_size, patch_height, patch_width, num_channels) | |
hidden_states = hidden_states.permute(0, 3, 1, 2).contiguous() | |
if self.readout_type == "project": | |
# reshape to (batch_size*num_stages, height*width, num_channels) | |
hidden_states = hidden_states.flatten(2).permute((0, 2, 1)) | |
readout = cls_token.unsqueeze(dim=1).expand_as(hidden_states) | |
# concatenate the readout token to the hidden states | |
# to get (batch_size*num_stages, height*width, 2*num_channels) | |
hidden_states = torch.cat((hidden_states, readout), -1) | |
elif self.readout_type == "add": | |
hidden_states = hidden_states + cls_token.unsqueeze(-1) | |
out = [] | |
for stage_idx, hidden_state in enumerate(hidden_states.split(batch_size, dim=0)): | |
if self.readout_type == "project": | |
hidden_state = self.readout_projects[stage_idx](hidden_state) | |
# reshape back to (batch_size, num_channels, height, width) | |
hidden_state = hidden_state.permute(0, 2, 1).reshape(batch_size, -1, patch_height, patch_width) | |
hidden_state = self.layers[stage_idx](hidden_state) | |
out.append(hidden_state) | |
return out | |
class ZoeDepthReassembleLayer(nn.Module): | |
def __init__(self, config, channels, factor): | |
super().__init__() | |
# projection | |
hidden_size = config.backbone_hidden_size | |
self.projection = nn.Conv2d(in_channels=hidden_size, out_channels=channels, kernel_size=1) | |
# up/down sampling depending on factor | |
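        # For example, factor=4 upsamples the feature map 4x via a transposed convolution,
        # factor=1 keeps the resolution, and factor=0.5 downsamples 2x via a stride-2 convolution.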
if factor > 1: | |
self.resize = nn.ConvTranspose2d(channels, channels, kernel_size=factor, stride=factor, padding=0) | |
elif factor == 1: | |
self.resize = nn.Identity() | |
elif factor < 1: | |
# so should downsample | |
self.resize = nn.Conv2d(channels, channels, kernel_size=3, stride=int(1 / factor), padding=1) | |
# Copied from transformers.models.dpt.modeling_dpt.DPTReassembleLayer.forward with DPT->ZoeDepth | |
def forward(self, hidden_state): | |
hidden_state = self.projection(hidden_state) | |
hidden_state = self.resize(hidden_state) | |
return hidden_state | |
# Copied from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->ZoeDepth | |
class ZoeDepthFeatureFusionStage(nn.Module): | |
def __init__(self, config): | |
super().__init__() | |
self.layers = nn.ModuleList() | |
for _ in range(len(config.neck_hidden_sizes)): | |
self.layers.append(ZoeDepthFeatureFusionLayer(config)) | |
def forward(self, hidden_states): | |
# reversing the hidden_states, we start from the last | |
hidden_states = hidden_states[::-1] | |
fused_hidden_states = [] | |
# first layer only uses the last hidden_state | |
fused_hidden_state = self.layers[0](hidden_states[0]) | |
fused_hidden_states.append(fused_hidden_state) | |
# looping from the last layer to the second | |
for hidden_state, layer in zip(hidden_states[1:], self.layers[1:]): | |
fused_hidden_state = layer(fused_hidden_state, hidden_state) | |
fused_hidden_states.append(fused_hidden_state) | |
return fused_hidden_states | |
# Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer with DPT->ZoeDepth | |
class ZoeDepthPreActResidualLayer(nn.Module): | |
""" | |
ResidualConvUnit, pre-activate residual unit. | |
Args: | |
config (`[ZoeDepthConfig]`): | |
Model configuration class defining the model architecture. | |
""" | |
# Ignore copy | |
def __init__(self, config): | |
super().__init__() | |
self.use_batch_norm = config.use_batch_norm_in_fusion_residual | |
use_bias_in_fusion_residual = ( | |
config.use_bias_in_fusion_residual | |
if config.use_bias_in_fusion_residual is not None | |
else not self.use_batch_norm | |
) | |
self.activation1 = nn.ReLU() | |
self.convolution1 = nn.Conv2d( | |
config.fusion_hidden_size, | |
config.fusion_hidden_size, | |
kernel_size=3, | |
stride=1, | |
padding=1, | |
bias=use_bias_in_fusion_residual, | |
) | |
self.activation2 = nn.ReLU() | |
self.convolution2 = nn.Conv2d( | |
config.fusion_hidden_size, | |
config.fusion_hidden_size, | |
kernel_size=3, | |
stride=1, | |
padding=1, | |
bias=use_bias_in_fusion_residual, | |
) | |
if self.use_batch_norm: | |
self.batch_norm1 = nn.BatchNorm2d(config.fusion_hidden_size, eps=config.batch_norm_eps) | |
self.batch_norm2 = nn.BatchNorm2d(config.fusion_hidden_size, eps=config.batch_norm_eps) | |
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: | |
residual = hidden_state | |
hidden_state = self.activation1(hidden_state) | |
hidden_state = self.convolution1(hidden_state) | |
if self.use_batch_norm: | |
hidden_state = self.batch_norm1(hidden_state) | |
hidden_state = self.activation2(hidden_state) | |
hidden_state = self.convolution2(hidden_state) | |
if self.use_batch_norm: | |
hidden_state = self.batch_norm2(hidden_state) | |
return hidden_state + residual | |
# Copied from transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer with DPT->ZoeDepth | |
class ZoeDepthFeatureFusionLayer(nn.Module): | |
"""Feature fusion layer, merges feature maps from different stages. | |
Args: | |
config (`[ZoeDepthConfig]`): | |
Model configuration class defining the model architecture. | |
align_corners (`bool`, *optional*, defaults to `True`): | |
The align_corner setting for bilinear upsample. | |
""" | |
def __init__(self, config, align_corners=True): | |
super().__init__() | |
self.align_corners = align_corners | |
self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True) | |
self.residual_layer1 = ZoeDepthPreActResidualLayer(config) | |
self.residual_layer2 = ZoeDepthPreActResidualLayer(config) | |
def forward(self, hidden_state, residual=None): | |
if residual is not None: | |
if hidden_state.shape != residual.shape: | |
residual = nn.functional.interpolate( | |
residual, size=(hidden_state.shape[2], hidden_state.shape[3]), mode="bilinear", align_corners=False | |
) | |
hidden_state = hidden_state + self.residual_layer1(residual) | |
hidden_state = self.residual_layer2(hidden_state) | |
hidden_state = nn.functional.interpolate( | |
hidden_state, scale_factor=2, mode="bilinear", align_corners=self.align_corners | |
) | |
hidden_state = self.projection(hidden_state) | |
return hidden_state | |
class ZoeDepthNeck(nn.Module): | |
""" | |
ZoeDepthNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as | |
input and produces another list of tensors as output. For ZoeDepth, it includes 2 stages: | |
* ZoeDepthReassembleStage | |
* ZoeDepthFeatureFusionStage. | |
Args: | |
        config (`ZoeDepthConfig`):
            Model configuration class defining the model architecture.
""" | |
# Copied from transformers.models.dpt.modeling_dpt.DPTNeck.__init__ with DPT->ZoeDepth | |
def __init__(self, config): | |
super().__init__() | |
self.config = config | |
# postprocessing: only required in case of a non-hierarchical backbone (e.g. ViT, BEiT) | |
if config.backbone_config is not None and config.backbone_config.model_type in ["swinv2"]: | |
self.reassemble_stage = None | |
else: | |
self.reassemble_stage = ZoeDepthReassembleStage(config) | |
self.convs = nn.ModuleList() | |
for channel in config.neck_hidden_sizes: | |
self.convs.append(nn.Conv2d(channel, config.fusion_hidden_size, kernel_size=3, padding=1, bias=False)) | |
# fusion | |
self.fusion_stage = ZoeDepthFeatureFusionStage(config) | |
def forward(self, hidden_states: List[torch.Tensor], patch_height, patch_width) -> List[torch.Tensor]: | |
""" | |
Args: | |
hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`): | |
List of hidden states from the backbone. | |
""" | |
if not isinstance(hidden_states, (tuple, list)): | |
raise TypeError("hidden_states should be a tuple or list of tensors") | |
if len(hidden_states) != len(self.config.neck_hidden_sizes): | |
raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.") | |
# postprocess hidden states | |
if self.reassemble_stage is not None: | |
hidden_states = self.reassemble_stage(hidden_states, patch_height, patch_width) | |
features = [self.convs[i](feature) for i, feature in enumerate(hidden_states)] | |
# fusion blocks | |
output = self.fusion_stage(features) | |
return output, features[-1] | |
class ZoeDepthRelativeDepthEstimationHead(nn.Module): | |
""" | |
    Relative depth estimation head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
    the predictions to the input resolution after the first convolutional layer (details can be found in the supplementary
    material of the DPT paper).
""" | |
def __init__(self, config): | |
super().__init__() | |
self.head_in_index = config.head_in_index | |
self.projection = None | |
if config.add_projection: | |
self.projection = nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
features = config.fusion_hidden_size | |
self.conv1 = nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1) | |
self.upsample = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True) | |
self.conv2 = nn.Conv2d(features // 2, config.num_relative_features, kernel_size=3, stride=1, padding=1) | |
self.conv3 = nn.Conv2d(config.num_relative_features, 1, kernel_size=1, stride=1, padding=0) | |
def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: | |
# use last features | |
hidden_states = hidden_states[self.head_in_index] | |
if self.projection is not None: | |
hidden_states = self.projection(hidden_states) | |
hidden_states = nn.ReLU()(hidden_states) | |
hidden_states = self.conv1(hidden_states) | |
hidden_states = self.upsample(hidden_states) | |
hidden_states = self.conv2(hidden_states) | |
hidden_states = nn.ReLU()(hidden_states) | |
# we need the features here (after second conv + ReLu) | |
features = hidden_states | |
hidden_states = self.conv3(hidden_states) | |
hidden_states = nn.ReLU()(hidden_states) | |
predicted_depth = hidden_states.squeeze(dim=1) | |
return predicted_depth, features | |
def log_binom(n, k, eps=1e-7): | |
"""log(nCk) using stirling approximation""" | |
n = n + eps | |
k = k + eps | |
return n * torch.log(n) - k * torch.log(k) - (n - k) * torch.log(n - k + eps) | |
class LogBinomialSoftmax(nn.Module): | |
def __init__(self, n_classes=256, act=torch.softmax): | |
"""Compute log binomial distribution for n_classes | |
Args: | |
n_classes (`int`, *optional*, defaults to 256): | |
Number of output classes. | |
act (`torch.nn.Module`, *optional*, defaults to `torch.softmax`): | |
Activation function to apply to the output. | |
""" | |
super().__init__() | |
self.k = n_classes | |
self.act = act | |
self.register_buffer("k_idx", torch.arange(0, n_classes).view(1, -1, 1, 1), persistent=False) | |
self.register_buffer("k_minus_1", torch.Tensor([self.k - 1]).view(1, -1, 1, 1), persistent=False) | |
def forward(self, probabilities, temperature=1.0, eps=1e-4): | |
"""Compute the log binomial distribution for probabilities. | |
Args: | |
probabilities (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`): | |
Tensor containing probabilities of each class. | |
temperature (`float` or `torch.Tensor` of shape `(batch_size, num_channels, height, width)`, *optional*, defaults to 1): | |
Temperature of distribution. | |
eps (`float`, *optional*, defaults to 1e-4): | |
Small number for numerical stability. | |
Returns: | |
`torch.Tensor` of shape `(batch_size, num_channels, height, width)`: | |
Log binomial distribution logbinomial(p;t). | |
""" | |
if probabilities.ndim == 3: | |
probabilities = probabilities.unsqueeze(1) # make it (batch_size, num_channels, height, width) | |
one_minus_probabilities = torch.clamp(1 - probabilities, eps, 1) | |
probabilities = torch.clamp(probabilities, eps, 1) | |
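        # Up to the Stirling approximation of the binomial coefficient and the temperature scaling,
        # y[:, k, ...] below is the log-pmf of a Binomial(K-1, p) distribution evaluated at k:
        # log C(K-1, k) + k*log(p) + (K-1-k)*log(1-p). The softmax over the class dimension then
        # yields a normalized, unimodal distribution over the K bins whose mode is controlled by p.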
y = ( | |
log_binom(self.k_minus_1, self.k_idx) | |
+ self.k_idx * torch.log(probabilities) | |
+ (self.k_minus_1 - self.k_idx) * torch.log(one_minus_probabilities) | |
) | |
return self.act(y / temperature, dim=1) | |
class ZoeDepthConditionalLogBinomialSoftmax(nn.Module): | |
def __init__( | |
self, | |
config, | |
in_features, | |
condition_dim, | |
n_classes=256, | |
bottleneck_factor=2, | |
): | |
"""Per-pixel MLP followed by a Conditional Log Binomial softmax. | |
Args: | |
in_features (`int`): | |
Number of input channels in the main feature. | |
condition_dim (`int`): | |
Number of input channels in the condition feature. | |
n_classes (`int`, *optional*, defaults to 256): | |
Number of classes. | |
bottleneck_factor (`int`, *optional*, defaults to 2): | |
Hidden dim factor. | |
""" | |
super().__init__() | |
bottleneck = (in_features + condition_dim) // bottleneck_factor | |
self.mlp = nn.Sequential( | |
nn.Conv2d(in_features + condition_dim, bottleneck, kernel_size=1, stride=1, padding=0), | |
nn.GELU(), | |
# 2 for probabilities linear norm, 2 for temperature linear norm | |
nn.Conv2d(bottleneck, 2 + 2, kernel_size=1, stride=1, padding=0), | |
nn.Softplus(), | |
) | |
self.p_eps = 1e-4 | |
self.max_temp = config.max_temp | |
self.min_temp = config.min_temp | |
self.log_binomial_transform = LogBinomialSoftmax(n_classes, act=torch.softmax) | |
def forward(self, main_feature, condition_feature): | |
""" | |
Args: | |
main_feature (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`): | |
Main feature. | |
condition_feature (torch.Tensor of shape `(batch_size, num_channels, height, width)`): | |
Condition feature. | |
Returns: | |
`torch.Tensor`: | |
Output log binomial distribution | |
""" | |
probabilities_and_temperature = self.mlp(torch.concat((main_feature, condition_feature), dim=1)) | |
probabilities, temperature = ( | |
probabilities_and_temperature[:, :2, ...], | |
probabilities_and_temperature[:, 2:, ...], | |
) | |
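        # Each pair of channels is linearly normalized as a / (a + b), mapping the non-negative
        # Softplus outputs into (0, 1): the first pair becomes the binomial probability p, the
        # second pair a relative temperature that is then rescaled to [min_temp, max_temp].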
probabilities = probabilities + self.p_eps | |
probabilities = probabilities[:, 0, ...] / (probabilities[:, 0, ...] + probabilities[:, 1, ...]) | |
temperature = temperature + self.p_eps | |
temperature = temperature[:, 0, ...] / (temperature[:, 0, ...] + temperature[:, 1, ...]) | |
temperature = temperature.unsqueeze(1) | |
temperature = (self.max_temp - self.min_temp) * temperature + self.min_temp | |
return self.log_binomial_transform(probabilities, temperature) | |
class ZoeDepthSeedBinRegressor(nn.Module): | |
def __init__(self, config, n_bins=16, mlp_dim=256, min_depth=1e-3, max_depth=10): | |
"""Bin center regressor network. | |
Can be "normed" or "unnormed". If "normed", bin centers are bounded on the (min_depth, max_depth) interval. | |
Args: | |
            config (`ZoeDepthConfig`):
Model configuration. | |
n_bins (`int`, *optional*, defaults to 16): | |
Number of bin centers. | |
mlp_dim (`int`, *optional*, defaults to 256): | |
Hidden dimension. | |
min_depth (`float`, *optional*, defaults to 1e-3): | |
Min depth value. | |
max_depth (`float`, *optional*, defaults to 10): | |
Max depth value. | |
""" | |
super().__init__() | |
self.in_features = config.bottleneck_features | |
self.bin_centers_type = config.bin_centers_type | |
self.min_depth = min_depth | |
self.max_depth = max_depth | |
self.conv1 = nn.Conv2d(self.in_features, mlp_dim, 1, 1, 0) | |
self.act1 = nn.ReLU(inplace=True) | |
self.conv2 = nn.Conv2d(mlp_dim, n_bins, 1, 1, 0) | |
self.act2 = nn.ReLU(inplace=True) if self.bin_centers_type == "normed" else nn.Softplus() | |
def forward(self, x): | |
""" | |
Returns tensor of bin_width vectors (centers). One vector b for every pixel | |
""" | |
x = self.conv1(x) | |
x = self.act1(x) | |
x = self.conv2(x) | |
bin_centers = self.act2(x) | |
if self.bin_centers_type == "normed": | |
bin_centers = bin_centers + 1e-3 | |
bin_widths_normed = bin_centers / bin_centers.sum(dim=1, keepdim=True) | |
# shape (batch_size, num_channels, height, width) | |
bin_widths = (self.max_depth - self.min_depth) * bin_widths_normed | |
# pad has the form (left, right, top, bottom, front, back) | |
bin_widths = nn.functional.pad(bin_widths, (0, 0, 0, 0, 1, 0), mode="constant", value=self.min_depth) | |
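            # prepend `min_depth` along the bin dimension so that, after the cumulative sum below,
            # the first bin edge equals `min_depth`; adjacent edges are then averaged into bin centers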
# shape (batch_size, num_channels, height, width) | |
bin_edges = torch.cumsum(bin_widths, dim=1) | |
bin_centers = 0.5 * (bin_edges[:, :-1, ...] + bin_edges[:, 1:, ...]) | |
return bin_widths_normed, bin_centers | |
else: | |
return bin_centers, bin_centers | |
def inv_attractor(dx, alpha: float = 300, gamma: int = 2): | |
"""Inverse attractor: dc = dx / (1 + alpha*dx^gamma), where dx = a - c, a = attractor point, c = bin center, dc = shift in bin center | |
This is the default one according to the accompanying paper. | |
Args: | |
dx (`torch.Tensor`): | |
The difference tensor dx = Ai - Cj, where Ai is the attractor point and Cj is the bin center. | |
alpha (`float`, *optional*, defaults to 300): | |
Proportional Attractor strength. Determines the absolute strength. Lower alpha = greater attraction. | |
gamma (`int`, *optional*, defaults to 2): | |
Exponential Attractor strength. Determines the "region of influence" and indirectly number of bin centers affected. | |
Lower gamma = farther reach. | |
Returns: | |
torch.Tensor: Delta shifts - dc; New bin centers = Old bin centers + dc | |
""" | |
return dx.div(1 + alpha * dx.pow(gamma)) | |
class ZoeDepthAttractorLayer(nn.Module): | |
def __init__( | |
self, | |
config, | |
n_bins, | |
n_attractors=16, | |
min_depth=1e-3, | |
max_depth=10, | |
memory_efficient=False, | |
): | |
""" | |
Attractor layer for bin centers. Bin centers are bounded on the interval (min_depth, max_depth) | |
""" | |
super().__init__() | |
self.alpha = config.attractor_alpha | |
        self.gamma = config.attractor_gamma
self.kind = config.attractor_kind | |
self.n_attractors = n_attractors | |
self.n_bins = n_bins | |
self.min_depth = min_depth | |
self.max_depth = max_depth | |
self.memory_efficient = memory_efficient | |
# MLP to predict attractor points | |
in_features = mlp_dim = config.bin_embedding_dim | |
self.conv1 = nn.Conv2d(in_features, mlp_dim, 1, 1, 0) | |
self.act1 = nn.ReLU(inplace=True) | |
self.conv2 = nn.Conv2d(mlp_dim, n_attractors * 2, 1, 1, 0) # x2 for linear norm | |
self.act2 = nn.ReLU(inplace=True) | |
def forward(self, x, prev_bin, prev_bin_embedding=None, interpolate=True): | |
""" | |
The forward pass of the attractor layer. This layer predicts the new bin centers based on the previous bin centers | |
and the attractor points (the latter are predicted by the MLP). | |
Args: | |
x (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`): | |
Feature block. | |
prev_bin (`torch.Tensor` of shape `(batch_size, prev_number_of_bins, height, width)`): | |
Previous bin centers normed. | |
prev_bin_embedding (`torch.Tensor`, *optional*): | |
Optional previous bin embeddings. | |
interpolate (`bool`, *optional*, defaults to `True`): | |
Whether to interpolate the previous bin embeddings to the size of the input features. | |
Returns: | |
            `Tuple[torch.Tensor, torch.Tensor]`:
New bin centers normed and scaled. | |
""" | |
if prev_bin_embedding is not None: | |
if interpolate: | |
prev_bin_embedding = nn.functional.interpolate( | |
prev_bin_embedding, x.shape[-2:], mode="bilinear", align_corners=True | |
) | |
x = x + prev_bin_embedding | |
x = self.conv1(x) | |
x = self.act1(x) | |
x = self.conv2(x) | |
attractors = self.act2(x) | |
attractors = attractors + 1e-3 | |
batch_size, _, height, width = attractors.shape | |
attractors = attractors.view(batch_size, self.n_attractors, 2, height, width) | |
# batch_size, num_attractors, 2, height, width | |
# note: original repo had a bug here: https://github.com/isl-org/ZoeDepth/blame/edb6daf45458569e24f50250ef1ed08c015f17a7/zoedepth/models/layers/attractor.py#L105C9-L106C50 | |
# we include the bug to maintain compatibility with the weights | |
        attractors_normed = attractors[:, :, 0, ...]  # shape (batch_size, num_attractors, height, width)
bin_centers = nn.functional.interpolate(prev_bin, (height, width), mode="bilinear", align_corners=True) | |
# note: only attractor_type = "exp" is supported here, since no checkpoints were released with other attractor types | |
if not self.memory_efficient: | |
func = {"mean": torch.mean, "sum": torch.sum}[self.kind] | |
# shape (batch_size, num_bins, height, width) | |
delta_c = func(inv_attractor(attractors_normed.unsqueeze(2) - bin_centers.unsqueeze(1)), dim=1) | |
else: | |
delta_c = torch.zeros_like(bin_centers, device=bin_centers.device) | |
for i in range(self.n_attractors): | |
# shape (batch_size, num_bins, height, width) | |
delta_c += inv_attractor(attractors_normed[:, i, ...].unsqueeze(1) - bin_centers) | |
if self.kind == "mean": | |
delta_c = delta_c / self.n_attractors | |
bin_new_centers = bin_centers + delta_c | |
bin_centers = (self.max_depth - self.min_depth) * bin_new_centers + self.min_depth | |
bin_centers, _ = torch.sort(bin_centers, dim=1) | |
bin_centers = torch.clip(bin_centers, self.min_depth, self.max_depth) | |
return bin_new_centers, bin_centers | |
class ZoeDepthAttractorLayerUnnormed(nn.Module): | |
def __init__( | |
self, | |
config, | |
n_bins, | |
n_attractors=16, | |
min_depth=1e-3, | |
max_depth=10, | |
memory_efficient=True, | |
): | |
""" | |
Attractor layer for bin centers. Bin centers are unbounded | |
""" | |
super().__init__() | |
self.n_attractors = n_attractors | |
self.n_bins = n_bins | |
self.min_depth = min_depth | |
self.max_depth = max_depth | |
self.alpha = config.attractor_alpha | |
        self.gamma = config.attractor_gamma
self.kind = config.attractor_kind | |
self.memory_efficient = memory_efficient | |
in_features = mlp_dim = config.bin_embedding_dim | |
self.conv1 = nn.Conv2d(in_features, mlp_dim, 1, 1, 0) | |
self.act1 = nn.ReLU(inplace=True) | |
self.conv2 = nn.Conv2d(mlp_dim, n_attractors, 1, 1, 0) | |
self.act2 = nn.Softplus() | |
def forward(self, x, prev_bin, prev_bin_embedding=None, interpolate=True): | |
""" | |
The forward pass of the attractor layer. This layer predicts the new bin centers based on the previous bin centers | |
and the attractor points (the latter are predicted by the MLP). | |
Args: | |
            x (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
                Feature block.
            prev_bin (`torch.Tensor` of shape `(batch_size, prev_num_bins, height, width)`):
Previous bin centers normed. | |
prev_bin_embedding (`torch.Tensor`, *optional*): | |
Optional previous bin embeddings. | |
interpolate (`bool`, *optional*, defaults to `True`): | |
Whether to interpolate the previous bin embeddings to the size of the input features. | |
Returns: | |
            `Tuple[torch.Tensor, torch.Tensor]`:
New bin centers unbounded. Two outputs just to keep the API consistent with the normed version. | |
""" | |
if prev_bin_embedding is not None: | |
if interpolate: | |
prev_bin_embedding = nn.functional.interpolate( | |
prev_bin_embedding, x.shape[-2:], mode="bilinear", align_corners=True | |
) | |
x = x + prev_bin_embedding | |
x = self.conv1(x) | |
x = self.act1(x) | |
x = self.conv2(x) | |
attractors = self.act2(x) | |
height, width = attractors.shape[-2:] | |
bin_centers = nn.functional.interpolate(prev_bin, (height, width), mode="bilinear", align_corners=True) | |
if not self.memory_efficient: | |
func = {"mean": torch.mean, "sum": torch.sum}[self.kind] | |
# shape batch_size, num_bins, height, width | |
delta_c = func(inv_attractor(attractors.unsqueeze(2) - bin_centers.unsqueeze(1)), dim=1) | |
else: | |
delta_c = torch.zeros_like(bin_centers, device=bin_centers.device) | |
for i in range(self.n_attractors): | |
# shape batch_size, num_bins, height, width | |
delta_c += inv_attractor(attractors[:, i, ...].unsqueeze(1) - bin_centers) | |
if self.kind == "mean": | |
delta_c = delta_c / self.n_attractors | |
bin_new_centers = bin_centers + delta_c | |
bin_centers = bin_new_centers | |
return bin_new_centers, bin_centers | |
class ZoeDepthProjector(nn.Module): | |
def __init__(self, in_features, out_features, mlp_dim=128): | |
"""Projector MLP. | |
Args: | |
in_features (`int`): | |
Number of input channels. | |
out_features (`int`): | |
Number of output channels. | |
mlp_dim (`int`, *optional*, defaults to 128): | |
Hidden dimension. | |
""" | |
super().__init__() | |
self.conv1 = nn.Conv2d(in_features, mlp_dim, 1, 1, 0) | |
self.act = nn.ReLU(inplace=True) | |
self.conv2 = nn.Conv2d(mlp_dim, out_features, 1, 1, 0) | |
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: | |
hidden_state = self.conv1(hidden_state) | |
hidden_state = self.act(hidden_state) | |
hidden_state = self.conv2(hidden_state) | |
return hidden_state | |
# Copied from transformers.models.grounding_dino.modeling_grounding_dino.GroundingDinoMultiheadAttention with GroundingDino->ZoeDepth | |
class ZoeDepthMultiheadAttention(nn.Module): | |
"""Equivalent implementation of nn.MultiheadAttention with `batch_first=True`.""" | |
# Ignore copy | |
def __init__(self, hidden_size, num_attention_heads, dropout): | |
super().__init__() | |
if hidden_size % num_attention_heads != 0: | |
raise ValueError( | |
f"The hidden size ({hidden_size}) is not a multiple of the number of attention " | |
f"heads ({num_attention_heads})" | |
) | |
self.num_attention_heads = num_attention_heads | |
self.attention_head_size = int(hidden_size / num_attention_heads) | |
self.all_head_size = self.num_attention_heads * self.attention_head_size | |
self.query = nn.Linear(hidden_size, self.all_head_size) | |
self.key = nn.Linear(hidden_size, self.all_head_size) | |
self.value = nn.Linear(hidden_size, self.all_head_size) | |
self.out_proj = nn.Linear(hidden_size, hidden_size) | |
self.dropout = nn.Dropout(dropout) | |
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: | |
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) | |
x = x.view(new_x_shape) | |
return x.permute(0, 2, 1, 3) | |
def forward( | |
self, | |
queries: torch.Tensor, | |
keys: torch.Tensor, | |
values: torch.Tensor, | |
attention_mask: Optional[torch.FloatTensor] = None, | |
output_attentions: Optional[bool] = False, | |
) -> Tuple[torch.Tensor]: | |
query_layer = self.transpose_for_scores(self.query(queries)) | |
key_layer = self.transpose_for_scores(self.key(keys)) | |
value_layer = self.transpose_for_scores(self.value(values)) | |
# Take the dot product between "query" and "key" to get the raw attention scores. | |
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) | |
attention_scores = attention_scores / math.sqrt(self.attention_head_size) | |
if attention_mask is not None: | |
            # Apply the attention mask (precomputed for all layers in the model's forward() function)
attention_scores = attention_scores + attention_mask | |
# Normalize the attention scores to probabilities. | |
attention_probs = nn.functional.softmax(attention_scores, dim=-1) | |
# This is actually dropping out entire tokens to attend to, which might | |
# seem a bit unusual, but is taken from the original Transformer paper. | |
attention_probs = self.dropout(attention_probs) | |
context_layer = torch.matmul(attention_probs, value_layer) | |
context_layer = context_layer.permute(0, 2, 1, 3).contiguous() | |
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) | |
context_layer = context_layer.view(new_context_layer_shape) | |
context_layer = self.out_proj(context_layer) | |
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) | |
return outputs | |
class ZoeDepthTransformerEncoderLayer(nn.Module): | |
def __init__(self, config, dropout=0.1, activation="relu"): | |
super().__init__() | |
hidden_size = config.patch_transformer_hidden_size | |
intermediate_size = config.patch_transformer_intermediate_size | |
num_attention_heads = config.patch_transformer_num_attention_heads | |
self.self_attn = ZoeDepthMultiheadAttention(hidden_size, num_attention_heads, dropout=dropout) | |
self.linear1 = nn.Linear(hidden_size, intermediate_size) | |
self.dropout = nn.Dropout(dropout) | |
self.linear2 = nn.Linear(intermediate_size, hidden_size) | |
self.norm1 = nn.LayerNorm(hidden_size) | |
self.norm2 = nn.LayerNorm(hidden_size) | |
self.dropout1 = nn.Dropout(dropout) | |
self.dropout2 = nn.Dropout(dropout) | |
self.activation = ACT2FN[activation] | |
def forward( | |
self, | |
src, | |
src_mask: Optional[torch.Tensor] = None, | |
): | |
queries = keys = src | |
src2 = self.self_attn(queries=queries, keys=keys, values=src, attention_mask=src_mask)[0] | |
src = src + self.dropout1(src2) | |
src = self.norm1(src) | |
src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) | |
src = src + self.dropout2(src2) | |
src = self.norm2(src) | |
return src | |
class ZoeDepthPatchTransformerEncoder(nn.Module): | |
def __init__(self, config): | |
"""ViT-like transformer block | |
Args: | |
config (`ZoeDepthConfig`): | |
Model configuration class defining the model architecture. | |
""" | |
super().__init__() | |
in_channels = config.bottleneck_features | |
self.transformer_encoder = nn.ModuleList( | |
[ZoeDepthTransformerEncoderLayer(config) for _ in range(config.num_patch_transformer_layers)] | |
) | |
self.embedding_convPxP = nn.Conv2d( | |
in_channels, config.patch_transformer_hidden_size, kernel_size=1, stride=1, padding=0 | |
) | |
def positional_encoding_1d(self, batch_size, sequence_length, embedding_dim, device="cpu", dtype=torch.float32): | |
"""Generate positional encodings | |
Args: | |
sequence_length (int): Sequence length | |
embedding_dim (int): Embedding dimension | |
Returns: | |
torch.Tensor: Positional encodings. | |
""" | |
position = torch.arange(0, sequence_length, dtype=dtype, device=device).unsqueeze(1) | |
index = torch.arange(0, embedding_dim, 2, dtype=dtype, device=device).unsqueeze(0) | |
div_term = torch.exp(index * (-torch.log(torch.tensor(10000.0, device=device)) / embedding_dim)) | |
pos_encoding = position * div_term | |
pos_encoding = torch.cat([torch.sin(pos_encoding), torch.cos(pos_encoding)], dim=1) | |
pos_encoding = pos_encoding.unsqueeze(dim=0).repeat(batch_size, 1, 1) | |
return pos_encoding | |
def forward(self, x): | |
"""Forward pass | |
Args: | |
x (torch.Tensor - NCHW): Input feature tensor | |
Returns: | |
torch.Tensor - Transformer output embeddings of shape (batch_size, sequence_length, embedding_dim) | |
""" | |
embeddings = self.embedding_convPxP(x).flatten(2) # shape (batch_size, num_channels, sequence_length) | |
# add an extra special CLS token at the start for global accumulation | |
embeddings = nn.functional.pad(embeddings, (1, 0)) | |
embeddings = embeddings.permute(0, 2, 1) | |
batch_size, sequence_length, embedding_dim = embeddings.shape | |
embeddings = embeddings + self.positional_encoding_1d( | |
batch_size, sequence_length, embedding_dim, device=embeddings.device, dtype=embeddings.dtype | |
) | |
        for layer in self.transformer_encoder:
            embeddings = layer(embeddings)
return embeddings | |
class ZoeDepthMLPClassifier(nn.Module): | |
def __init__(self, in_features, out_features) -> None: | |
super().__init__() | |
hidden_features = in_features | |
self.linear1 = nn.Linear(in_features, hidden_features) | |
self.activation = nn.ReLU() | |
self.linear2 = nn.Linear(hidden_features, out_features) | |
def forward(self, hidden_state): | |
hidden_state = self.linear1(hidden_state) | |
hidden_state = self.activation(hidden_state) | |
domain_logits = self.linear2(hidden_state) | |
return domain_logits | |
class ZoeDepthMultipleMetricDepthEstimationHeads(nn.Module): | |
""" | |
    Multiple metric depth estimation heads. An MLP classifier is used to route between 2 different heads.
""" | |
def __init__(self, config): | |
super().__init__() | |
bin_embedding_dim = config.bin_embedding_dim | |
n_attractors = config.num_attractors | |
self.bin_configurations = config.bin_configurations | |
self.bin_centers_type = config.bin_centers_type | |
# Bottleneck convolution | |
bottleneck_features = config.bottleneck_features | |
self.conv2 = nn.Conv2d(bottleneck_features, bottleneck_features, kernel_size=1, stride=1, padding=0) | |
# Transformer classifier on the bottleneck | |
self.patch_transformer = ZoeDepthPatchTransformerEncoder(config) | |
# MLP classifier | |
self.mlp_classifier = ZoeDepthMLPClassifier(in_features=128, out_features=2) | |
# Regressor and attractor | |
if self.bin_centers_type == "normed": | |
Attractor = ZoeDepthAttractorLayer | |
elif self.bin_centers_type == "softplus": | |
Attractor = ZoeDepthAttractorLayerUnnormed | |
# We have bins for each bin configuration | |
# Create a map (ModuleDict) of 'name' -> seed_bin_regressor | |
self.seed_bin_regressors = nn.ModuleDict( | |
{ | |
conf["name"]: ZoeDepthSeedBinRegressor( | |
config, | |
n_bins=conf["n_bins"], | |
mlp_dim=bin_embedding_dim // 2, | |
min_depth=conf["min_depth"], | |
max_depth=conf["max_depth"], | |
) | |
for conf in config.bin_configurations | |
} | |
) | |
self.seed_projector = ZoeDepthProjector( | |
in_features=bottleneck_features, out_features=bin_embedding_dim, mlp_dim=bin_embedding_dim // 2 | |
) | |
self.projectors = nn.ModuleList( | |
[ | |
ZoeDepthProjector( | |
in_features=config.fusion_hidden_size, | |
out_features=bin_embedding_dim, | |
mlp_dim=bin_embedding_dim // 2, | |
) | |
for _ in range(4) | |
] | |
) | |
# Create a map (ModuleDict) of 'name' -> attractors (ModuleList) | |
self.attractors = nn.ModuleDict( | |
{ | |
configuration["name"]: nn.ModuleList( | |
[ | |
Attractor( | |
config, | |
n_bins=n_attractors[i], | |
min_depth=configuration["min_depth"], | |
max_depth=configuration["max_depth"], | |
) | |
for i in range(len(n_attractors)) | |
] | |
) | |
for configuration in config.bin_configurations | |
} | |
) | |
last_in = config.num_relative_features | |
# conditional log binomial for each bin configuration | |
self.conditional_log_binomial = nn.ModuleDict( | |
{ | |
configuration["name"]: ZoeDepthConditionalLogBinomialSoftmax( | |
config, | |
last_in, | |
bin_embedding_dim, | |
configuration["n_bins"], | |
bottleneck_factor=4, | |
) | |
for configuration in config.bin_configurations | |
} | |
) | |
def forward(self, outconv_activation, bottleneck, feature_blocks, relative_depth): | |
x = self.conv2(bottleneck) | |
# Predict which path to take | |
# Embedding is of shape (batch_size, hidden_size) | |
embedding = self.patch_transformer(x)[:, 0, :] | |
# MLP classifier to get logits of shape (batch_size, 2) | |
domain_logits = self.mlp_classifier(embedding) | |
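        # The per-image logits are summed over the batch before the softmax, i.e. the whole batch casts
        # a single vote and is routed through one metric head (one bin configuration), rather than
        # routing each image independently.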
domain_vote = torch.softmax(domain_logits.sum(dim=0, keepdim=True), dim=-1) | |
# Get the path | |
names = [configuration["name"] for configuration in self.bin_configurations] | |
bin_configurations_name = names[torch.argmax(domain_vote, dim=-1).squeeze().item()] | |
try: | |
conf = [config for config in self.bin_configurations if config["name"] == bin_configurations_name][0] | |
except IndexError: | |
raise ValueError(f"bin_configurations_name {bin_configurations_name} not found in bin_configurationss") | |
min_depth = conf["min_depth"] | |
max_depth = conf["max_depth"] | |
seed_bin_regressor = self.seed_bin_regressors[bin_configurations_name] | |
_, seed_bin_centers = seed_bin_regressor(x) | |
if self.bin_centers_type in ["normed", "hybrid2"]: | |
prev_bin = (seed_bin_centers - min_depth) / (max_depth - min_depth) | |
else: | |
prev_bin = seed_bin_centers | |
prev_bin_embedding = self.seed_projector(x) | |
attractors = self.attractors[bin_configurations_name] | |
for projector, attractor, feature in zip(self.projectors, attractors, feature_blocks): | |
bin_embedding = projector(feature) | |
bin, bin_centers = attractor(bin_embedding, prev_bin, prev_bin_embedding, interpolate=True) | |
prev_bin = bin | |
prev_bin_embedding = bin_embedding | |
last = outconv_activation | |
bin_centers = nn.functional.interpolate(bin_centers, last.shape[-2:], mode="bilinear", align_corners=True) | |
bin_embedding = nn.functional.interpolate(bin_embedding, last.shape[-2:], mode="bilinear", align_corners=True) | |
conditional_log_binomial = self.conditional_log_binomial[bin_configurations_name] | |
x = conditional_log_binomial(last, bin_embedding) | |
# Now depth value is Sum px * cx , where cx are bin_centers from the last bin tensor | |
out = torch.sum(x * bin_centers, dim=1, keepdim=True) | |
return out, domain_logits | |
class ZoeDepthMetricDepthEstimationHead(nn.Module): | |
def __init__(self, config): | |
super().__init__() | |
bin_configuration = config.bin_configurations[0] | |
n_bins = bin_configuration["n_bins"] | |
min_depth = bin_configuration["min_depth"] | |
max_depth = bin_configuration["max_depth"] | |
bin_embedding_dim = config.bin_embedding_dim | |
n_attractors = config.num_attractors | |
bin_centers_type = config.bin_centers_type | |
self.min_depth = min_depth | |
self.max_depth = max_depth | |
self.bin_centers_type = bin_centers_type | |
# Bottleneck convolution | |
bottleneck_features = config.bottleneck_features | |
self.conv2 = nn.Conv2d(bottleneck_features, bottleneck_features, kernel_size=1, stride=1, padding=0) | |
# Regressor and attractor | |
if self.bin_centers_type == "normed": | |
Attractor = ZoeDepthAttractorLayer | |
elif self.bin_centers_type == "softplus": | |
Attractor = ZoeDepthAttractorLayerUnnormed | |
self.seed_bin_regressor = ZoeDepthSeedBinRegressor( | |
config, n_bins=n_bins, min_depth=min_depth, max_depth=max_depth | |
) | |
self.seed_projector = ZoeDepthProjector(in_features=bottleneck_features, out_features=bin_embedding_dim) | |
self.projectors = nn.ModuleList( | |
[ | |
ZoeDepthProjector(in_features=config.fusion_hidden_size, out_features=bin_embedding_dim) | |
for _ in range(4) | |
] | |
) | |
self.attractors = nn.ModuleList( | |
[ | |
Attractor( | |
config, | |
n_bins=n_bins, | |
n_attractors=n_attractors[i], | |
min_depth=min_depth, | |
max_depth=max_depth, | |
) | |
for i in range(4) | |
] | |
) | |
last_in = config.num_relative_features + 1 # +1 for relative depth | |
# use log binomial instead of softmax | |
self.conditional_log_binomial = ZoeDepthConditionalLogBinomialSoftmax( | |
config, | |
last_in, | |
bin_embedding_dim, | |
n_classes=n_bins, | |
) | |
def forward(self, outconv_activation, bottleneck, feature_blocks, relative_depth): | |
x = self.conv2(bottleneck) | |
_, seed_bin_centers = self.seed_bin_regressor(x) | |
if self.bin_centers_type in ["normed", "hybrid2"]: | |
prev_bin = (seed_bin_centers - self.min_depth) / (self.max_depth - self.min_depth) | |
else: | |
prev_bin = seed_bin_centers | |
prev_bin_embedding = self.seed_projector(x) | |
# unroll this loop for better performance | |
for projector, attractor, feature in zip(self.projectors, self.attractors, feature_blocks): | |
bin_embedding = projector(feature) | |
bin, bin_centers = attractor(bin_embedding, prev_bin, prev_bin_embedding, interpolate=True) | |
prev_bin = bin.clone() | |
prev_bin_embedding = bin_embedding.clone() | |
last = outconv_activation | |
        # concatenate relative depth with last. First interpolate relative depth to last size
relative_conditioning = relative_depth.unsqueeze(1) | |
relative_conditioning = nn.functional.interpolate( | |
relative_conditioning, size=last.shape[2:], mode="bilinear", align_corners=True | |
) | |
last = torch.cat([last, relative_conditioning], dim=1) | |
bin_embedding = nn.functional.interpolate(bin_embedding, last.shape[-2:], mode="bilinear", align_corners=True) | |
x = self.conditional_log_binomial(last, bin_embedding) | |
# Now depth value is Sum px * cx , where cx are bin_centers from the last bin tensor | |
bin_centers = nn.functional.interpolate(bin_centers, x.shape[-2:], mode="bilinear", align_corners=True) | |
out = torch.sum(x * bin_centers, dim=1, keepdim=True) | |
return out, None | |
# Copied from transformers.models.dpt.modeling_dpt.DPTPreTrainedModel with DPT->ZoeDepth,dpt->zoedepth | |
class ZoeDepthPreTrainedModel(PreTrainedModel): | |
""" | |
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained | |
models. | |
""" | |
config_class = ZoeDepthConfig | |
base_model_prefix = "zoedepth" | |
main_input_name = "pixel_values" | |
supports_gradient_checkpointing = True | |
def _init_weights(self, module): | |
"""Initialize the weights""" | |
if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): | |
# Slightly different from the TF version which uses truncated_normal for initialization | |
# cf https://github.com/pytorch/pytorch/pull/5617 | |
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) | |
if module.bias is not None: | |
module.bias.data.zero_() | |
elif isinstance(module, nn.LayerNorm): | |
module.bias.data.zero_() | |
module.weight.data.fill_(1.0) | |
ZOEDEPTH_START_DOCSTRING = r""" | |
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it | |
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and | |
behavior. | |
Parameters: | |
        config ([`ZoeDepthConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the | |
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. | |
""" | |
ZOEDEPTH_INPUTS_DOCSTRING = r""" | |
Args: | |
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): | |
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ZoeDepthImageProcessor.__call__`]
for details. | |
output_attentions (`bool`, *optional*): | |
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned | |
tensors for more detail. | |
output_hidden_states (`bool`, *optional*): | |
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for | |
more detail. | |
return_dict (`bool`, *optional*): | |
Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. | |
""" | |
@add_start_docstrings(
    """
    ZoeDepth model with one or multiple metric depth estimation head(s) on top.
    """,
    ZOEDEPTH_START_DOCSTRING,
)
class ZoeDepthForDepthEstimation(ZoeDepthPreTrainedModel):
def __init__(self, config): | |
super().__init__(config) | |
self.backbone = load_backbone(config) | |
if hasattr(self.backbone.config, "hidden_size") and hasattr(self.backbone.config, "patch_size"): | |
config.backbone_hidden_size = self.backbone.config.hidden_size | |
self.patch_size = self.backbone.config.patch_size | |
else: | |
raise ValueError( | |
"ZoeDepth assumes the backbone's config to have `hidden_size` and `patch_size` attributes" | |
) | |
self.neck = ZoeDepthNeck(config) | |
self.relative_head = ZoeDepthRelativeDepthEstimationHead(config) | |
self.metric_head = ( | |
ZoeDepthMultipleMetricDepthEstimationHeads(config) | |
if len(config.bin_configurations) > 1 | |
else ZoeDepthMetricDepthEstimationHead(config) | |
) | |
# Initialize weights and apply final processing | |
self.post_init() | |
    @add_start_docstrings_to_model_forward(ZOEDEPTH_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=ZoeDepthDepthEstimatorOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
self, | |
pixel_values: torch.FloatTensor, | |
labels: Optional[torch.LongTensor] = None, | |
output_attentions: Optional[bool] = None, | |
output_hidden_states: Optional[bool] = None, | |
return_dict: Optional[bool] = None, | |
) -> Union[Tuple[torch.Tensor], DepthEstimatorOutput]: | |
r""" | |
labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): | |
Ground truth depth estimation maps for computing the loss. | |
Returns: | |
Examples: | |
```python | |
>>> from transformers import AutoImageProcessor, ZoeDepthForDepthEstimation | |
>>> import torch | |
>>> import numpy as np | |
>>> from PIL import Image | |
>>> import requests | |
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" | |
>>> image = Image.open(requests.get(url, stream=True).raw) | |
>>> image_processor = AutoImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti") | |
>>> model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti") | |
>>> # prepare image for the model | |
>>> inputs = image_processor(images=image, return_tensors="pt") | |
>>> with torch.no_grad(): | |
... outputs = model(**inputs) | |
... predicted_depth = outputs.predicted_depth | |
>>> # interpolate to original size | |
>>> prediction = torch.nn.functional.interpolate( | |
... predicted_depth.unsqueeze(1), | |
... size=image.size[::-1], | |
... mode="bicubic", | |
... align_corners=False, | |
... ) | |
>>> # visualize the prediction | |
>>> output = prediction.squeeze().cpu().numpy() | |
>>> formatted = (output * 255 / np.max(output)).astype("uint8") | |
>>> depth = Image.fromarray(formatted) | |
```""" | |
loss = None | |
if labels is not None: | |
raise NotImplementedError("Training is not implemented yet") | |
return_dict = return_dict if return_dict is not None else self.config.use_return_dict | |
output_hidden_states = ( | |
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states | |
) | |
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions | |
outputs = self.backbone.forward_with_filtered_kwargs( | |
pixel_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions | |
) | |
hidden_states = outputs.feature_maps | |
_, _, height, width = pixel_values.shape | |
patch_size = self.patch_size | |
patch_height = height // patch_size | |
patch_width = width // patch_size | |
hidden_states, features = self.neck(hidden_states, patch_height, patch_width) | |
out = [features] + hidden_states | |
relative_depth, features = self.relative_head(hidden_states) | |
out = [features] + out | |
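        # At this point out[0] holds the features of the relative depth head, out[1] the bottleneck
        # feature coming out of the neck, and out[2:] the fused feature maps, matching the
        # (outconv_activation, bottleneck, feature_blocks) arguments of the metric head below.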
metric_depth, domain_logits = self.metric_head( | |
outconv_activation=out[0], bottleneck=out[1], feature_blocks=out[2:], relative_depth=relative_depth | |
) | |
metric_depth = metric_depth.squeeze(dim=1) | |
if not return_dict: | |
if domain_logits is not None: | |
output = (metric_depth, domain_logits) + outputs[1:] | |
else: | |
output = (metric_depth,) + outputs[1:] | |
return ((loss,) + output) if loss is not None else output | |
return ZoeDepthDepthEstimatorOutput( | |
loss=loss, | |
predicted_depth=metric_depth, | |
domain_logits=domain_logits, | |
hidden_states=outputs.hidden_states, | |
attentions=outputs.attentions, | |
) | |