Spaces:

KyanChen
/

RSPrompter

Runtime error

File size: 35,888 Bytes

1c3eb47

import copy
import math
from typing import Type, Tuple

import einops
import torch
import torch.nn as nn
from einops import rearrange
from mmcv.cnn import ConvModule
from mmcv.cnn.bricks.transformer import build_transformer_layer
from torch import Tensor

from mmdet.models import SinePositionalEncoding
from mmpl.registry import MODELS
import torch.nn.functional as F


@MODELS.register_module()
class SAMTransformerPromptGenNeck(nn.Module):
    def __init__(
            self,
            prompt_shape=(100, 6),
            in_channels=[1280]*16,
            out_channels=256,
            positional_encoding=dict(num_feats=128, normalize=True),
            n_classes=2,
            kernel_size=3,
            stride=1,
            norm_cfg=None,
            act_cfg=dict(type='ReLU')
    ):
        super(SAMTransformerPromptGenNeck, self).__init__()
        self.in_channels = in_channels
        self.kernel_size = kernel_size
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.out_put_channels = out_channels
        self.n_classes = n_classes
        self.stride = stride

        self.prompt_shape = prompt_shape
        self.num_queries = prompt_shape[0]
        self.per_query_point = prompt_shape[1]

        if isinstance(in_channels, list):
            self.pre_layers = nn.ModuleList()
            inner_channel = 32
            for idx, channel in enumerate(in_channels):
                self.pre_layers.append(
                    nn.Sequential(
                        ConvModule(
                            channel,
                            inner_channel,
                            kernel_size=1,
                            norm_cfg=self.norm_cfg,
                            act_cfg=self.act_cfg
                        ),
                        ConvModule(
                            inner_channel,
                            inner_channel*2,
                            kernel_size=kernel_size,
                            padding=kernel_size // 2,
                            stride=self.stride,
                            norm_cfg=self.norm_cfg,
                            act_cfg=self.act_cfg
                        ),
                        ConvModule(
                            inner_channel*2,
                            inner_channel,
                            kernel_size=kernel_size,
                            padding=kernel_size // 2,
                            norm_cfg=self.norm_cfg,
                            act_cfg=self.act_cfg
                        ),
                    )
                )
            self.pre_layers.append(
                nn.Sequential(
                    ConvModule(
                        inner_channel * len(in_channels),
                        out_channels,
                        kernel_size=1,
                        norm_cfg=self.norm_cfg,
                        act_cfg=self.act_cfg
                    ),
                    ConvModule(
                        out_channels,
                        out_channels,
                        kernel_size=kernel_size,
                        padding=kernel_size // 2,
                        norm_cfg=self.norm_cfg,
                        act_cfg=self.act_cfg
                    ),
                )
            )

        self.generator_pe = SinePositionalEncoding(**positional_encoding)
        self.transformer = self.build_transformer()
        self.query_feat = nn.Embedding(self.num_queries, out_channels)
        self.query_emb = nn.Embedding(self.num_queries, out_channels)

        self.output_upscaling = nn.Sequential(
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.GELU(),
            nn.UpsamplingBilinear2d(scale_factor=2),
            nn.Conv2d(out_channels, out_channels // 4, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels // 4),
            nn.GELU(),
            nn.UpsamplingBilinear2d(scale_factor=2),
            nn.Conv2d(out_channels // 4, out_channels // 8, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels // 8),
            nn.GELU(),
            nn.UpsamplingBilinear2d(scale_factor=2),
            nn.Conv2d(out_channels // 8, out_channels // 8, kernel_size=3, padding=1),
        )

        self.cls_head = nn.Sequential(
            nn.Linear(out_channels, out_channels//2),
            nn.ReLU(),
            nn.Linear(out_channels//2, n_classes)
        )

        # self.point_emb = nn.Sequential(
        #     nn.Linear(out_channels, out_channels),
        #     nn.ReLU(),
        #     nn.Linear(out_channels, out_channels),
        #     nn.ReLU(),
        #     nn.Linear(out_channels, self.per_query_point * out_channels)
        # )
        self.output_hypernetworks_mlps = MLP(out_channels, out_channels, out_channels // 8, 3)


    def build_transformer(
            self, num_encoder_layers=2, num_decoder_layers=3, embed_dims=256, num_heads=8,
            mlp_ratio=2, dropout_rate=0.0, act_cfg=dict(type="gelu")):
        """Build transformer decoder."""
        # transformer = nn.Transformer(
        #     d_model=embed_dims, nhead=num_heads, num_encoder_layers=num_encoder_layers,
        #     num_decoder_layers=num_decoder_layers, dim_feedforward=mlp_ratio * embed_dims,
        #     dropout=dropout_rate, activation=act_cfg['type'], batch_first=True, norm_first=True,
        # )
        transformer = Transformer(depth=2)
        return transformer

    def init_weights(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, inputs, prompt_encoder, mask_decoder):

        img_embs, inner_states = inputs
        if hasattr(self, 'pre_layers'):
            inner_states = inner_states[-len(self.in_channels):]
            inner_states = [einops.rearrange(x, 'b h w c -> b c h w') for x in inner_states]
            inner_states = [layer(x) for layer, x in zip(self.pre_layers[:-1], inner_states)]
            img_feats = self.pre_layers[-1](torch.cat(inner_states, dim=1))
        bs, c, h, w = img_feats.shape
        mask_pe = torch.zeros((bs, h, w), device=img_feats.device)
        img_feats_pe = self.generator_pe(mask_pe)
        query_feat = self.query_feat.weight.unsqueeze(0).expand(bs, -1, -1)  # Bx256x256
        query_emb = self.query_emb.weight.unsqueeze(0).expand(bs, -1, -1)
        img_feats, query_feats = self.transformer(
            image_embedding=img_feats,
            image_pe=img_feats_pe,
            point_embedding=query_feat,
            point_pe=query_emb)
        cls_logits = self.cls_head(query_feats)
        # point_embs = self.point_emb(query_feats)
        # point_embs = rearrange(point_embs, 'b n (t c) -> b n t c', t=self.per_query_point)  # Bx100x6x256

        src = img_feats.transpose(1, 2).view(bs, c, h, w)
        upscaled_embedding = self.output_upscaling(src)
        hyper_in = self.output_hypernetworks_mlps(query_feats)
        b, c, h, w = upscaled_embedding.shape
        l1_masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w)

        # dense_masks = einops.rearrange(l1_masks, 'b (n t) h w -> (b n) t h w', t=1)
        # sparse, dense = prompt_encoder(points=None, boxes=None, masks=dense_masks)
        # dense = einops.rearrange(dense, '(b n) t h w -> b n t h w', n=self.num_queries)

        # l2_masks = []
        # iou_preds = []
        # for curr_embedding, sparse_embeddings, dense_embeddings in zip(img_embs, point_embs, dense):
        #     low_res_masks, iou_predictions = mask_decoder(
        #         image_embeddings=curr_embedding.unsqueeze(0),
        #         image_pe=prompt_encoder.get_dense_pe(),
        #         sparse_prompt_embeddings=sparse_embeddings,
        #         dense_prompt_embeddings=dense_embeddings
        #     )
        #     l2_masks.append(low_res_masks[:, 0])
        #     iou_preds.append(iou_predictions[:, 0])
        # l2_masks = torch.stack(l2_masks, dim=0)
        # iou_preds = torch.stack(iou_preds, dim=0)

        l2_masks = None
        iou_preds = None

        return cls_logits, l1_masks, l2_masks, iou_preds


@MODELS.register_module()
class SAMPromptConvNeck(nn.Module):
    def __init__(
            self,
            prompt_shape=(100, 5),
            img_feat_channels=1280,
            out_put_channels=256,
            num_img_feat_level=16,
            n_cls=2,
    ):
        super(SAMPromptConvNeck, self).__init__()
        self.prompt_shape = prompt_shape
        self.num_queries = prompt_shape[0]
        self.per_query_point = prompt_shape[1]
        self.point_size = int(math.sqrt(prompt_shape[0]))

        self.img_feat_channels = img_feat_channels
        self.out_put_channels = out_put_channels
        self.num_img_feat_level = num_img_feat_level
        self.n_cls = n_cls

        # decoder_embed_dims = img_feat_channels // 32
        decoder_embed_dims = 32
        self.decoder_input_projs = nn.ModuleList()
        # from low resolution to high resolution
        for _ in range(num_img_feat_level):
            self.decoder_input_projs.append(
                nn.Sequential(
                    nn.Conv2d(img_feat_channels, decoder_embed_dims, kernel_size=1),
                    # nn.BatchNorm2d(decoder_embed_dims),
                    nn.ReLU(),
                    nn.Conv2d(decoder_embed_dims, decoder_embed_dims, kernel_size=3, padding=1),
                    # nn.BatchNorm2d(decoder_embed_dims),
                    nn.ReLU(),
                ))
        self.level_embed = nn.Embedding(self.num_img_feat_level, decoder_embed_dims)
        self.gather_img_feats = nn.Sequential(
            nn.Conv2d(num_img_feat_level * decoder_embed_dims, out_put_channels, kernel_size=1),
            # nn.BatchNorm2d(out_put_channels),
            nn.ReLU(),
            nn.Conv2d(out_put_channels, out_put_channels, 3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(out_put_channels, out_put_channels*2, 3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(out_put_channels * 2, out_put_channels * 2, 3, padding=1),
        )

        self.img_feats_pe = nn.Parameter(torch.zeros(1, out_put_channels*2, self.point_size, self.point_size))

        self.cls_head = nn.Sequential(
            nn.Conv2d(out_put_channels * 2, out_put_channels, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(out_put_channels, n_cls, 1)
        )

        self.point_emb = nn.Sequential(
            nn.Conv2d(out_put_channels * 2, out_put_channels, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(out_put_channels, out_put_channels, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(out_put_channels, self.per_query_point * out_put_channels, 1)
        )

    def forward(self, inputs):
        inner_states = [x.permute(0, 3, 1, 2) for x in inputs]  # from low2high, all 4 layers
        bs = inner_states[0].shape[0]
        # inputs: list([B, C, H, W])
        num_layers = len(inputs)
        # import ipdb; ipdb.set_trace()
        # select the feature maps from the selected layers
        layer_start_id = num_layers - self.num_img_feat_level
        decoder_inputs = []
        for i in range(self.num_img_feat_level):
            decoder_input = self.decoder_input_projs[i](inner_states[i + layer_start_id])  # Bx256x64x64
            level_embed = self.level_embed.weight[i].unsqueeze(0).unsqueeze(-1).unsqueeze(-1).expand(bs, -1, -1, -1)
            decoder_input = decoder_input + level_embed
            decoder_inputs.append(decoder_input)
        decoder_inputs = torch.cat(decoder_inputs, dim=1)  # Bx256x64x64
        decoder_inputs = self.gather_img_feats(decoder_inputs)
        # import pdb;
        # pdb.set_trace()
        decoder_inputs = torch.nn.functional.interpolate(decoder_inputs, size=(self.point_size, self.point_size), mode='bilinear', align_corners=True)
        img_pe = self.img_feats_pe.expand(bs, -1, -1, -1)  # Bx256x64x64
        decoder_inputs = decoder_inputs + img_pe

        cls_logits = self.cls_head(decoder_inputs)  # b c h w
        cls_logits = rearrange(cls_logits, 'b c h w -> b (h w) c')
        point_embs = self.point_emb(decoder_inputs)  # b c h w
        point_embs = rearrange(point_embs, 'b (t c) h w -> b (h w) t c', t=self.per_query_point)  # Bx100x6x256

        return point_embs, cls_logits




class MLPBlock(nn.Module):
    def __init__(
        self,
        embedding_dim: int,
        mlp_dim: int,
        act: Type[nn.Module] = nn.GELU,
    ) -> None:
        super().__init__()
        self.lin1 = nn.Linear(embedding_dim, mlp_dim)
        self.lin2 = nn.Linear(mlp_dim, embedding_dim)
        self.act = act()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.lin2(self.act(self.lin1(x)))


class Transformer(nn.Module):
    def __init__(
        self,
        depth: int = 2,
        embedding_dim: int = 256,
        num_heads: int = 8,
        mlp_dim: int = 1024,
        activation: Type[nn.Module] = nn.GELU,
        attention_downsample_rate: int = 2,
    ) -> None:
        super().__init__()
        self.depth = depth
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.mlp_dim = mlp_dim
        self.layers = nn.ModuleList()

        for i in range(depth):
            self.layers.append(
                AttentionBlock(
                    embedding_dim=embedding_dim,
                    num_heads=num_heads,
                    mlp_dim=mlp_dim,
                    activation=activation,
                    attention_downsample_rate=attention_downsample_rate
                )
            )

        self.final_attn_token_to_image = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )
        self.norm_final_attn = nn.LayerNorm(embedding_dim)

    def forward(
        self,
        image_embedding: Tensor,
        image_pe: Tensor,
        point_embedding: Tensor,
        point_pe: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """
        Args:
          image_embedding (torch.Tensor): image to attend to. Should be shape
            B x embedding_dim x h x w for any h and w.
          image_pe (torch.Tensor): the positional encoding to add to the image. Must
            have the same shape as image_embedding.
          point_embedding (torch.Tensor): the embedding to add to the query points.
            Must have shape B x N_points x embedding_dim for any N_points.

        Returns:
          torch.Tensor: the processed point_embedding
          torch.Tensor: the processed image_embedding
        """
        # BxCxHxW -> BxHWxC == B x N_image_tokens x C
        bs, c, h, w = image_embedding.shape
        image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
        image_pe = image_pe.flatten(2).permute(0, 2, 1)

        # Apply transformer blocks and final layernorm
        for layer in self.layers:
            queries, keys = layer(
                queries=image_embedding,
                query_pe=image_pe,
                keys=point_embedding,
                key_pe=point_pe,
            )

        # Apply the final attention layer from the points to the image
        q = queries + image_pe
        k = keys + point_embedding

        attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys)
        queries = queries + attn_out
        queries = self.norm_final_attn(queries)

        return queries, keys


class AttentionBlock(nn.Module):
    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        mlp_dim: int = 2048,
        activation: Type[nn.Module] = nn.ReLU,
        attention_downsample_rate: int = 2,
        skip_first_layer_pe: bool = False,
    ) -> None:
        """
        A transformer block with four layers: (1) self-attention of sparse
        inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp
        block on sparse inputs, and (4) cross attention of dense inputs to sparse
        inputs.

        Arguments:
          embedding_dim (int): the channel dimension of the embeddings
          num_heads (int): the number of heads in the attention layers
          mlp_dim (int): the hidden dimension of the mlp block
          activation (nn.Module): the activation of the mlp block
          skip_first_layer_pe (bool): skip the PE on the first layer
        """
        super().__init__()
        self.self_attn = Attention(embedding_dim, num_heads)
        self.norm1 = nn.LayerNorm(embedding_dim)

        self.cross_attn_token_to_image = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )
        self.norm2 = nn.LayerNorm(embedding_dim)

        self.mlp = MLPBlock(embedding_dim, mlp_dim, activation)
        self.norm3 = nn.LayerNorm(embedding_dim)

        self.norm4 = nn.LayerNorm(embedding_dim)
        self.cross_attn_image_to_token = Attention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
        )

        self.skip_first_layer_pe = skip_first_layer_pe

    def forward(
            self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor
    ) -> Tuple[Tensor, Tensor]:
        # Self attention block
        if self.skip_first_layer_pe:
            queries = self.self_attn(q=queries, k=queries, v=queries)
        else:
            q = queries + query_pe
            attn_out = self.self_attn(q=q, k=q, v=queries)
            queries = queries + attn_out
        queries = self.norm1(queries)

        # Cross attention block, tokens attending to image embedding
        q = queries + query_pe
        k = keys + key_pe
        attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys)
        queries = queries + attn_out
        queries = self.norm2(queries)

        # MLP block
        mlp_out = self.mlp(queries)
        queries = queries + mlp_out
        queries = self.norm3(queries)

        # Cross attention block, image embedding attending to tokens
        q = queries + query_pe
        k = keys + key_pe
        attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries)
        keys = keys + attn_out
        keys = self.norm4(keys)

        return queries, keys


class Attention(nn.Module):
    """
    An attention layer that allows for downscaling the size of the embedding
    after projection to queries, keys, and values.
    """

    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        downsample_rate: int = 1,
    ) -> None:
        super().__init__()
        self.embedding_dim = embedding_dim
        self.internal_dim = embedding_dim // downsample_rate
        self.num_heads = num_heads
        assert self.internal_dim % num_heads == 0, "num_heads must divide embedding_dim."

        self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.k_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.v_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.out_proj = nn.Linear(self.internal_dim, embedding_dim)

    def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
        b, n, c = x.shape
        x = x.reshape(b, n, num_heads, c // num_heads)
        return x.transpose(1, 2)  # B x N_heads x N_tokens x C_per_head

    def _recombine_heads(self, x: Tensor) -> Tensor:
        b, n_heads, n_tokens, c_per_head = x.shape
        x = x.transpose(1, 2)
        return x.reshape(b, n_tokens, n_heads * c_per_head)  # B x N_tokens x C

    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
        # Input projections
        q = self.q_proj(q)
        k = self.k_proj(k)
        v = self.v_proj(v)

        # Separate into heads
        q = self._separate_heads(q, self.num_heads)
        k = self._separate_heads(k, self.num_heads)
        v = self._separate_heads(v, self.num_heads)

        # Attention
        _, _, _, c_per_head = q.shape
        attn = q @ k.permute(0, 1, 3, 2)  # B x N_heads x N_tokens x N_tokens
        attn = attn / math.sqrt(c_per_head)
        attn = torch.softmax(attn, dim=-1)

        # Get output
        out = attn @ v
        out = self._recombine_heads(out)
        out = self.out_proj(out)

        return out



class LayerNorm2d(nn.Module):
    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.eps)
        x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x


class MLP(nn.Module):
    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        output_dim: int,
        num_layers: int,
        sigmoid_output: bool = False,
    ) -> None:
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(
            nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])
        )
        self.sigmoid_output = sigmoid_output

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        if self.sigmoid_output:
            x = F.sigmoid(x)
        return x


@MODELS.register_module()
class SAMTransformerEDPromptGenNeck(nn.Module):
    def __init__(
            self,
            prompt_shape=(100, 5),
            in_channels=[1280]*16,
            inner_channels=128,
            selected_channels: list=None,
            num_encoders=2,
            num_decoders=2,
            out_channels=256,
            positional_encoding=dict(num_feats=128, normalize=True),
            kernel_size=3,
            stride=1,
            norm_cfg=dict(type='BN', requires_grad=True),
            act_cfg=dict(type='ReLU', inplace=True),
            init_cfg=None,
            **kwargs
    ):
        super().__init__()
        self.in_channels = in_channels
        self.kernel_size = kernel_size
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.out_channels = out_channels
        self.stride = stride
        self.selected_channels = selected_channels

        self.prompt_shape = prompt_shape
        self.num_queries = prompt_shape[0]
        self.per_query_point = prompt_shape[1]

        self.down_sample_layers = nn.ModuleList()
        for idx in self.selected_channels:
            self.down_sample_layers.append(
                nn.Sequential(
                    ConvModule(
                        in_channels[idx],
                        inner_channels,
                        kernel_size=1,
                        norm_cfg=self.norm_cfg,
                        act_cfg=self.act_cfg
                    ),
                    ConvModule(
                        inner_channels,
                        inner_channels,
                        kernel_size=3,
                        padding=1,
                        stride=2,
                        norm_cfg=self.norm_cfg,
                        act_cfg=self.act_cfg
                    ),
                )
            )
        self.fusion_layers = nn.ModuleList()
        for idx in self.selected_channels:
            self.fusion_layers.append(
                ConvModule(
                    inner_channels,
                    inner_channels,
                    kernel_size=3,
                    padding=1,
                    norm_cfg=self.norm_cfg,
                    act_cfg=self.act_cfg
                )
            )
        self.up_layers = nn.ModuleList()
        self.up_layers.append(
            nn.Sequential(
                ConvModule(
                    inner_channels,
                    inner_channels,
                    kernel_size=3,
                    padding=1,
                    norm_cfg=self.norm_cfg,
                    act_cfg=self.act_cfg
                ),
                ConvModule(
                    inner_channels,
                    inner_channels,
                    kernel_size=3,
                    padding=1,
                    norm_cfg=self.norm_cfg,
                    act_cfg=self.act_cfg
                )
            )
        )
        self.up_layers.append(
            ConvModule(
                inner_channels,
                out_channels,
                kernel_size=1,
                norm_cfg=self.norm_cfg,
                act_cfg=None
            )
        )

        self.generator_pe = SinePositionalEncoding(**positional_encoding)

        self.en_layers = nn.ModuleList()
        self.de_layers = nn.ModuleList()
        self.build_transformer(num_encoders=num_encoders, num_decoders=num_decoders)

        self.embed_dims = self.en_layers[0].embed_dims
        self.pre_norm = self.en_layers[0].pre_norm

        self.query_feat = nn.Embedding(self.num_queries, out_channels)
        self.query_embed = nn.Embedding(self.num_queries, out_channels)

        # self.output_upscaling = nn.Sequential(
        #     nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
        #     nn.BatchNorm2d(out_channels),
        #     nn.GELU(),
        #     nn.UpsamplingBilinear2d(scale_factor=2),
        #     nn.Conv2d(out_channels, out_channels // 4, kernel_size=3, padding=1),
        #     nn.BatchNorm2d(out_channels // 4),
        #     nn.GELU(),
        #     nn.UpsamplingBilinear2d(scale_factor=2),
        #     nn.Conv2d(out_channels // 4, out_channels // 8, kernel_size=3, padding=1),
        #     nn.BatchNorm2d(out_channels // 8),
        #     nn.GELU(),
        #     nn.UpsamplingBilinear2d(scale_factor=2),
        #     nn.Conv2d(out_channels // 8, out_channels // 8, kernel_size=3, padding=1),
        # )
        # self.output_hypernetworks_mlps = MLP(out_channels, out_channels, out_channels // 8, 3)

        self.init_weights()

    def build_transformer(self, num_encoders=2, num_decoders=2, embed_dims=256, num_heads=8, mlp_ratio=4):
        transformer_encoder_layer = dict(
            type='BaseTransformerLayer',
            attn_cfgs=[
                dict(
                    type='MultiheadAttention',
                    embed_dims=embed_dims,
                    num_heads=num_heads,
                    attn_drop=0.1,
                    proj_drop=0.1,
                    dropout_layer=dict(type='Dropout', drop_prob=0.1)
                ),
            ],
            ffn_cfgs=dict(
                type='FFN',
                embed_dims=embed_dims,
                feedforward_channels=embed_dims * mlp_ratio,
                num_fcs=2,
                act_cfg=dict(type='GELU'),
                ffn_drop=0.1,
                add_identity=True),
            operation_order=('norm', 'self_attn', 'norm', 'ffn'),
            norm_cfg=dict(type='LN'),
            batch_first=True
        )
        transformer_decoder_layer = dict(
            type='BaseTransformerLayer',
            attn_cfgs=dict(
                    type='MultiheadAttention',
                    embed_dims=embed_dims,
                    num_heads=num_heads,
                    attn_drop=0.1,
                    proj_drop=0.1,
                    dropout_layer=dict(type='Dropout', drop_prob=0.1)
                ),
            ffn_cfgs=dict(
                type='FFN',
                embed_dims=embed_dims,
                feedforward_channels=embed_dims * mlp_ratio,
                num_fcs=2,
                act_cfg=dict(type='GELU'),
                ffn_drop=0.1,
                add_identity=True),
            operation_order=('norm', 'self_attn', 'norm', 'cross_attn', 'norm', 'ffn'),
            norm_cfg=dict(type='LN'),
            batch_first=True
        )

        transformer_en_layers = [
            copy.deepcopy(transformer_encoder_layer) for _ in range(num_encoders)
        ]
        transformer_de_layers = [
            copy.deepcopy(transformer_decoder_layer) for _ in range(num_decoders)
        ]
        for i in range(num_encoders):
            self.en_layers.append(build_transformer_layer(transformer_en_layers[i]))
        for i in range(num_decoders):
            self.de_layers.append(build_transformer_layer(transformer_de_layers[i]))

    def init_weights(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, inputs):
        _, inner_states = inputs
        inner_states = [einops.rearrange(inner_states[idx], 'b h w c -> b c h w') for idx in self.selected_channels]
        inner_states = [layer(x) for layer, x in zip(self.down_sample_layers, inner_states)]

        x = None
        for inner_state, layer in zip(inner_states, self.fusion_layers):
            if x is not None:
                inner_state = x + inner_state
            x = inner_state + layer(inner_state)
        x = self.up_layers[0](x) + x
        img_feats = self.up_layers[1](x)

        bs, c, h, w = img_feats.shape

        mask_pe = torch.zeros((bs, h, w), device=img_feats.device, dtype=torch.bool)
        img_feats_pe = self.generator_pe(mask_pe)

        query_feat = self.query_feat.weight.unsqueeze(0).repeat(
            (bs, 1, 1))
        query_embed = self.query_embed.weight.unsqueeze(0).repeat(
            (bs, 1, 1))

        encoder_inputs = rearrange(img_feats, 'b c h w -> b (h w) c')
        img_feats_pe = img_feats_pe.flatten(2).permute(0, 2, 1)

        # shape (batch_size, num_total_queries, c)
        memory = encoder_inputs
        for layer in self.en_layers:
            memory = layer(
                query=memory,
                query_pos=img_feats_pe
            )
        # (batch_size, num_total_queries, c)

        query_feat_list = []
        for layer in self.de_layers:
            query_feat = layer(
                query=query_feat,
                key=memory,
                value=memory,
                query_pos=query_embed,
                key_pos=img_feats_pe
            )
            query_feat_list.append(query_feat)

        img_feat = rearrange(memory, 'b (h w) c -> b c h w', h=h, w=w)
        return query_feat, query_feat_list, img_feat


@MODELS.register_module()
class SAMAggregatorNeck(nn.Module):
    def __init__(
            self,
            in_channels=[1280]*16,
            inner_channels=128,
            selected_channels: list=None,
            out_channels=256,
            kernel_size=3,
            stride=1,
            norm_cfg=dict(type='BN', requires_grad=True),
            act_cfg=dict(type='ReLU', inplace=True),
            up_sample_scale=4,
            init_cfg=None,
            **kwargs
    ):
        super().__init__()
        self.in_channels = in_channels
        self.kernel_size = kernel_size
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.out_channels = out_channels
        self.stride = stride
        self.selected_channels = selected_channels
        self.up_sample_scale = up_sample_scale

        self.down_sample_layers = nn.ModuleList()
        for idx in self.selected_channels:
            self.down_sample_layers.append(
                nn.Sequential(
                    ConvModule(
                        in_channels[idx],
                        inner_channels,
                        kernel_size=1,
                        norm_cfg=self.norm_cfg,
                        act_cfg=self.act_cfg
                    ),
                    ConvModule(
                        inner_channels,
                        inner_channels,
                        kernel_size=3,
                        padding=1,
                        stride=2,
                        norm_cfg=self.norm_cfg,
                        act_cfg=self.act_cfg
                    ),
                )
            )
        self.fusion_layers = nn.ModuleList()
        for idx in self.selected_channels:
            self.fusion_layers.append(
                ConvModule(
                    inner_channels,
                    inner_channels,
                    kernel_size=3,
                    padding=1,
                    norm_cfg=self.norm_cfg,
                    act_cfg=self.act_cfg
                )
            )
        self.up_layers = nn.ModuleList()
        self.up_layers.append(
            nn.Sequential(
                ConvModule(
                    inner_channels,
                    inner_channels,
                    kernel_size=3,
                    padding=1,
                    norm_cfg=self.norm_cfg,
                    act_cfg=self.act_cfg
                ),
                ConvModule(
                    inner_channels,
                    inner_channels,
                    kernel_size=3,
                    padding=1,
                    norm_cfg=self.norm_cfg,
                    act_cfg=self.act_cfg
                )
            )
        )
        self.up_layers.append(
            ConvModule(
                inner_channels,
                out_channels,
                kernel_size=1,
                norm_cfg=self.norm_cfg,
                act_cfg=None
            )
        )

        self.up_sample_layers = nn.ModuleList()
        assert up_sample_scale == 4
        self.up_sample_layers.append(
            nn.Sequential(
                nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
                ConvModule(
                    out_channels,
                    out_channels,
                    kernel_size=3,
                    padding=1,
                    norm_cfg=self.norm_cfg,
                    act_cfg=self.act_cfg
                ),
                ConvModule(
                    out_channels,
                    out_channels,
                    kernel_size=3,
                    padding=1,
                    norm_cfg=self.norm_cfg,
                    act_cfg=self.act_cfg
                )
            )
        )

        self.up_sample_layers.append(
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False)
        )

        self.up_sample_layers.append(
            nn.Sequential(
                nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
                ConvModule(
                    out_channels,
                    out_channels,
                    kernel_size=3,
                    padding=1,
                    norm_cfg=self.norm_cfg,
                    act_cfg=self.act_cfg
                ),
                ConvModule(
                    out_channels,
                    out_channels,
                    kernel_size=3,
                    padding=1,
                    norm_cfg=self.norm_cfg,
                    act_cfg=self.act_cfg
                )
            )
        )

        self.up_sample_layers.append(
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False)
        )

    def forward(self, inputs):
        _, inner_states = inputs
        inner_states = [einops.rearrange(inner_states[idx], 'b h w c -> b c h w') for idx in self.selected_channels]
        inner_states = [layer(x) for layer, x in zip(self.down_sample_layers, inner_states)]

        x = None
        for inner_state, layer in zip(inner_states, self.fusion_layers):
            if x is not None:
                inner_state = x + inner_state
            x = inner_state + layer(inner_state)
        x = self.up_layers[0](x) + x
        img_feats_0 = self.up_layers[1](x)

        img_feats_1 = self.up_sample_layers[0](img_feats_0) + self.up_sample_layers[1](img_feats_0)

        img_feats_2 = self.up_sample_layers[2](img_feats_1) + self.up_sample_layers[3](img_feats_1)

        return img_feats_2, img_feats_1, img_feats_0