# MIT License

# Copyright (c) 2022 Intelligent Systems Lab Org

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# File author: Zhenyu Li

import torch
import torch.nn as nn
import torch.nn.functional as F

from zoedepth.models.layers.swin_layers import G2LFusion
from zoedepth.models.layers.transformer import TransformerEncoder, TransformerEncoderLayer
from torchvision.ops import roi_align as torch_roi_align


class DoubleConvWOBN(nn.Module):
    """(convolution => ReLU) * 2, without batch normalization"""

    def __init__(self, in_channels, out_channels, mid_channels=None):
        super().__init__()
        if not mid_channels:
            mid_channels = out_channels
        self.double_conv = nn.Sequential(
            nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=True),
            nn.ReLU(inplace=True))

    def forward(self, x):
        return self.double_conv(x)


class DoubleConv(nn.Module):
    """(convolution => [BN] => ReLU) * 2"""

    def __init__(self, in_channels, out_channels, mid_channels=None):
        super().__init__()
        if not mid_channels:
            mid_channels = out_channels
        self.double_conv = nn.Sequential(
            nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True)
        )

    def forward(self, x):
        return self.double_conv(x)


class Down(nn.Module):
    """Downscaling with maxpool then double conv"""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.maxpool_conv = nn.Sequential(
            nn.MaxPool2d(2),
            DoubleConv(in_channels, out_channels)
        )

    def forward(self, x):
        return self.maxpool_conv(x)


class Upv1(nn.Module):
    """Upscaling then double conv"""

    def __init__(self, in_channels, out_channels, mid_channels=None):
        super().__init__()
        self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
        if mid_channels is not None:
            self.conv = DoubleConvWOBN(in_channels, out_channels, mid_channels)
        else:
            self.conv = DoubleConvWOBN(in_channels, out_channels, in_channels)

    def forward(self, x1, x2):
        # Upsample the low-resolution feature, then fuse it with the skip feature.
        x1 = self.up(x1)
        x = torch.cat([x2, x1], dim=1)
        return self.conv(x)
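
# Illustrative shape check for Upv1 (this helper is not part of the original
# file): the upsampled decoder feature and the skip feature are concatenated
# on the channel axis, so in_channels must equal their channel sum.
def _upv1_shape_demo():
    up = Upv1(256 + 256, 256, 384)     # 256 decoder channels + 256 skip channels
    x1 = torch.randn(1, 256, 12, 16)   # low-resolution decoder feature
    x2 = torch.randn(1, 256, 24, 32)   # 2x-resolution skip feature
    return up(x1, x2)                  # -> shape (1, 256, 24, 32)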

class UNetv1(nn.Module):
    def __init__(self, n_channels, g2l, pos_embed=False, use_area_prior=True):
        super(UNetv1, self).__init__()
        self.n_channels = n_channels

        # Encoder: one stem conv followed by five 2x downsampling stages.
        self.inc = DoubleConv(n_channels, 32)
        self.down1 = Down(32, 256)
        self.down2 = Down(256, 256)
        self.down3 = Down(256, 256)
        self.down4 = Down(256, 256)
        self.down5 = Down(256, 256)

        # Decoder: each stage consumes the previous decoder feature, a guide
        # feature, and the matching encoder skip connection.
        self.up1 = Upv1(256+256+256, 256, 384)
        self.up2 = Upv1(256+256+256, 256, 384)
        self.up3 = Upv1(256+256+256, 256, 384)
        self.up4 = Upv1(256+256+256, 256, 384)
        self.up5 = Upv1(256+32+256, 32, 272)

        self.g2l = g2l

        if self.g2l:
            # Global-to-local fusion at every scale, from 12x16 up to 384x512.
            self.g2l_att = nn.ModuleList()
            win = 12
            in_channels = [32, 256, 256, 256, 256, 256]
            crf_dims = [32, 256, 256, 256, 256, 256]

            self.g2l5 = G2LFusion(input_dim=in_channels[5], embed_dim=crf_dims[5], window_size=win, num_heads=32, depth=4, num_patches=12*16)
            self.g2l4 = G2LFusion(input_dim=in_channels[4], embed_dim=crf_dims[4], window_size=win, num_heads=32, depth=4, num_patches=24*32)
            self.g2l3 = G2LFusion(input_dim=in_channels[3], embed_dim=crf_dims[3], window_size=win, num_heads=16, depth=3, num_patches=48*64)
            self.g2l2 = G2LFusion(input_dim=in_channels[2], embed_dim=crf_dims[2], window_size=win, num_heads=16, depth=3, num_patches=96*128)
            self.g2l1 = G2LFusion(input_dim=in_channels[1], embed_dim=crf_dims[1], window_size=win, num_heads=8, depth=2, num_patches=192*256)
            self.g2l0 = G2LFusion(input_dim=in_channels[0], embed_dim=crf_dims[0], window_size=win, num_heads=8, depth=2, num_patches=384*512)

            self.conv5 = DoubleConvWOBN(in_channels[4] * 2, in_channels[4], in_channels[4])
            self.conv4 = DoubleConvWOBN(in_channels[4] * 2, in_channels[4], in_channels[4])
            self.conv3 = DoubleConvWOBN(in_channels[3] * 2, in_channels[3], in_channels[3])
            self.conv2 = DoubleConvWOBN(in_channels[2] * 2, in_channels[2], in_channels[2])
            self.conv1 = DoubleConvWOBN(in_channels[1] * 2, in_channels[1], in_channels[1])
            self.conv0 = DoubleConvWOBN(in_channels[0] * 2, in_channels[0], in_channels[0])

    def forward(self, input_tensor, guide_plus, guide_cat,
                crop_area_resize=None, bbox=None, fine_feat_crop=None,
                coarse_feat_whole=None, coarse_feat_whole_hack=None,
                coarse_feat_crop=None):

        # apply unscaled feat to swin
        if coarse_feat_whole_hack is not None:
            coarse_feat_whole = coarse_feat_whole_hack

        if crop_area_resize is None:
            not_use_prior = True
        else:
            not_use_prior = False

        # Encoder
        x1 = self.inc(input_tensor)
        x2 = self.down1(x1)
        x3 = self.down2(x2)
        x4 = self.down3(x3)
        x5 = self.down4(x4)
        x6 = self.down5(x5)

        # Decoder: at each scale, optionally fuse the whole-image (global)
        # feature via G2LFusion, crop it to the current patch with roi_align,
        # then upsample together with the guide feature and skip connection.
        if self.g2l:
            g2l_feat5 = self.g2l5(coarse_feat_whole[0], crop_area_resize[0])
            g2l_feat5 = torch_roi_align(g2l_feat5, bbox, (12, 16), 12/384, aligned=True)
            x6 = self.conv5(torch.cat([x6, g2l_feat5], dim=1))
        x5 = self.up1(torch.cat([x6, guide_cat[0]], dim=1), x5)

        if self.g2l:
            g2l_feat4 = self.g2l4(coarse_feat_whole[1], crop_area_resize[1])
            g2l_feat4 = torch_roi_align(g2l_feat4, bbox, (24, 32), 24/384, aligned=True)
            x5 = self.conv4(torch.cat([x5, g2l_feat4], dim=1))
        x4 = self.up2(torch.cat([x5, guide_cat[1]], dim=1), x4)

        if self.g2l:
            g2l_feat3 = self.g2l3(coarse_feat_whole[2], crop_area_resize[2])
            g2l_feat3 = torch_roi_align(g2l_feat3, bbox, (48, 64), 48/384, aligned=True)
            x4 = self.conv3(torch.cat([x4, g2l_feat3], dim=1))
        x3 = self.up3(torch.cat([x4, guide_cat[2]], dim=1), x3)

        if self.g2l:
            g2l_feat2 = self.g2l2(coarse_feat_whole[3], crop_area_resize[3])
            g2l_feat2 = torch_roi_align(g2l_feat2, bbox, (96, 128), 96/384, aligned=True)
            x3 = self.conv2(torch.cat([x3, g2l_feat2], dim=1))
        x2 = self.up4(torch.cat([x3, guide_cat[3]], dim=1), x2)

        if self.g2l:
            g2l_feat1 = self.g2l1(coarse_feat_whole[4], crop_area_resize[4])
            g2l_feat1 = torch_roi_align(g2l_feat1, bbox, (192, 256), 192/384, aligned=True)
            x2 = self.conv1(torch.cat([x2, g2l_feat1], dim=1))
        x1 = self.up5(torch.cat([x2, guide_cat[4]], dim=1), x1)
        if self.g2l:
            g2l_feat0 = self.g2l0(coarse_feat_whole[5], crop_area_resize[5])
            g2l_feat0 = torch_roi_align(g2l_feat0, bbox, (384, 512), 384/384, aligned=True)
            x1 = self.conv0(torch.cat([x1, g2l_feat0], dim=1))

        output = [x1, x2, x3, x4, x5, x6]
        return output
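
# A minimal smoke test (illustrative sketch, not part of the original file).
# It assumes a 384x512 patch, matching the num_patches settings above, and
# runs with g2l=False so that no whole-image features or RoI boxes are needed;
# the guide_cat channel counts (256 at every scale) are inferred from the
# Upv1 in_channels above.
if __name__ == "__main__":
    net = UNetv1(n_channels=3, g2l=False)
    x = torch.randn(1, 3, 384, 512)
    guide_cat = [
        torch.randn(1, 256, 12, 16),    # fused with x6 before up1
        torch.randn(1, 256, 24, 32),    # before up2
        torch.randn(1, 256, 48, 64),    # before up3
        torch.randn(1, 256, 96, 128),   # before up4
        torch.randn(1, 256, 192, 256),  # before up5
    ]
    outs = net(x, guide_plus=None, guide_cat=guide_cat)
    for o in outs:
        print(o.shape)  # x1 ... x6, from 384x512 down to 12x16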