# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

"""Vision Transformer (ViT) in PyTorch.

A PyTorch implementation of Vision Transformers as described in:

'An Image Is Worth 16 x 16 Words: Transformers for Image Recognition at Scale'
    - https://arxiv.org/abs/2010.11929

`How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers`
    - https://arxiv.org/abs/2106.10270

The official jax code is released and available at https://github.com/google-research/vision_transformer

DeiT model defs and weights from https://github.com/facebookresearch/deit,
paper `DeiT: Data-efficient Image Transformers` - https://arxiv.org/abs/2012.12877

Acknowledgments:
* The paper authors for releasing code and weights, thanks!
* I fixed my class token impl based on Phil Wang's https://github.com/lucidrains/vit-pytorch ...
  check it out for some einops/einsum fun
* Simple transformer style inspired by Andrej Karpathy's https://github.com/karpathy/minGPT
* Bert reference code checks against Huggingface Transformers and Tensorflow Bert

Hacked together by / Copyright 2021 Ross Wightman
"""

import logging
import math
from functools import partial
from itertools import repeat
from typing import Callable, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as cp
from mmcv.runner import BaseModule, load_checkpoint
from mmseg.ops import resize
from mmseg.utils import get_root_logger
from torch import Tensor

from .drop_path import DropPath


def to_2tuple(x):
    return tuple(repeat(x, 2))


class Mlp(nn.Module):
    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
        self.drop = nn.Dropout(drop)

    def forward(self, x: Tensor) -> Tensor:
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class SwiGLUFFN(nn.Module):
    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Optional[Callable[..., nn.Module]] = None,  # unused, kept for interface parity with Mlp
        drop: float = 0.0,  # unused, kept for interface parity with Mlp
    ) -> None:
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        # SwiGLU uses two input projections, so shrink the hidden dim by 2/3 to keep the
        # parameter count comparable to a plain MLP, then round up to a multiple of 8.
        swiglu_hidden_features = int(2 * hidden_features / 3)
        align_as = 8
        swiglu_hidden_features = (swiglu_hidden_features + align_as - 1) // align_as * align_as
        self.w1 = nn.Linear(in_features, swiglu_hidden_features)
        self.w2 = nn.Linear(in_features, swiglu_hidden_features)
        self.w3 = nn.Linear(swiglu_hidden_features, out_features)

    def forward(self, x: Tensor) -> Tensor:
        x1 = self.w1(x)
        x2 = self.w2(x)
        hidden = F.silu(x1) * x2
        return self.w3(hidden)
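
# Illustrative sketch (not part of the original module): a functional reference for the
# SwiGLU feed-forward above, assuming the same silu(w1(x)) * w2(x) gating followed by w3.
# The helper name `_swiglu_reference` is hypothetical and is not used by any class below;
# it only documents the computation that SwiGLUFFN.forward performs.
def _swiglu_reference(x: Tensor, w1: nn.Linear, w2: nn.Linear, w3: nn.Linear) -> Tensor:
    """Hypothetical reference mirroring SwiGLUFFN.forward: w3(silu(w1(x)) * w2(x))."""
    return w3(F.silu(w1(x)) * w2(x))
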
class PatchEmbed(nn.Module):
    """2D Image to Patch Embedding."""

    def __init__(
        self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True, bias=True
    ):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        self.img_size = img_size
        self.patch_size = patch_size
        self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]
        self.flatten = flatten

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x):
        x = self.proj(x)
        _, _, H, W = x.shape
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
        x = self.norm(x)
        return x, H, W


class Attention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, H, W):
        # H and W are unused here; they are accepted so all attention variants share one interface.
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class MemEffAttention(nn.Module):
    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: Tensor, H, W) -> Tensor:
        # Imported lazily so the module does not hard-require xformers unless this path is used.
        from xformers.ops import memory_efficient_attention, unbind

        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)

        q, k, v = unbind(qkv, 2)

        x = memory_efficient_attention(q, k, v)
        x = x.reshape([B, N, C])

        x = self.proj(x)
        x = self.proj_drop(x)
        return x


def window_partition(x, window_size):
    """
    Args:
        x: (B, H, W, C)
        window_size (int): window size

    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows


def window_reverse(windows, window_size, H, W):
    """
    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    """
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x
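
# Illustrative sketch (not part of the original module): window_partition and
# window_reverse are inverses whenever H and W are multiples of window_size. The
# hypothetical helper below only documents that round trip; nothing else calls it.
def _check_window_round_trip(B: int = 2, H: int = 28, W: int = 28, C: int = 8, window_size: int = 14) -> bool:
    """Hypothetical sanity check: reverse(partition(x)) reproduces x for divisible H, W."""
    x = torch.randn(B, H, W, C)
    windows = window_partition(x, window_size)  # (num_windows * B, window_size, window_size, C)
    x_back = window_reverse(windows, window_size, H, W)
    return torch.allclose(x, x_back)
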
class WindowedAttention(nn.Module):
    def __init__(
        self, dim, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0, window_size=14, pad_mode="constant"
    ):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.window_size = window_size
        self.pad_mode = pad_mode

    def forward(self, x, H, W):
        B, N, C = x.shape
        N_ = self.window_size * self.window_size
        H_ = math.ceil(H / self.window_size) * self.window_size
        W_ = math.ceil(W / self.window_size) * self.window_size

        qkv = self.qkv(x)  # [B, N, C]
        qkv = qkv.transpose(1, 2).reshape(B, C * 3, H, W)  # [B, C, H, W]
        qkv = F.pad(qkv, [0, W_ - W, 0, H_ - H], mode=self.pad_mode)

        qkv = F.unfold(
            qkv, kernel_size=(self.window_size, self.window_size), stride=(self.window_size, self.window_size)
        )
        B, C_kw_kw, L = qkv.shape  # L - the num of windows
        qkv = qkv.reshape(B, C * 3, N_, L).permute(0, 3, 2, 1)  # [B, L, N_, 3*C]
        qkv = qkv.reshape(B, L, N_, 3, self.num_heads, C // self.num_heads).permute(3, 0, 1, 4, 2, 5)
        q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)
        # q, k, v: [B, L, num_head, N_, C/num_head]
        attn = (q @ k.transpose(-2, -1)) * self.scale  # [B, L, num_head, N_, N_]
        # if self.mask:
        #     attn = attn * mask
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)  # [B, L, num_head, N_, N_]

        # attn @ v = [B, L, num_head, N_, C/num_head]
        x = (attn @ v).permute(0, 2, 4, 3, 1).reshape(B, C_kw_kw // 3, L)

        x = F.fold(
            x,
            output_size=(H_, W_),
            kernel_size=(self.window_size, self.window_size),
            stride=(self.window_size, self.window_size),
        )  # [B, C, H_, W_]
        x = x[:, :, :H, :W].reshape(B, C, N).transpose(-1, -2)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


# class WindowedAttention(nn.Module):
#     def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0., window_size=14, pad_mode="constant"):
#         super().__init__()
#         self.num_heads = num_heads
#         head_dim = dim // num_heads
#         self.scale = head_dim ** -0.5
#
#         self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
#         self.attn_drop = nn.Dropout(attn_drop)
#         self.proj = nn.Linear(dim, dim)
#         self.proj_drop = nn.Dropout(proj_drop)
#         self.window_size = window_size
#         self.pad_mode = pad_mode
#
#     def forward(self, x, H, W):
#         B, N, C = x.shape
#
#         N_ = self.window_size * self.window_size
#         H_ = math.ceil(H / self.window_size) * self.window_size
#         W_ = math.ceil(W / self.window_size) * self.window_size
#         x = x.view(B, H, W, C)
#         x = F.pad(x, [0, 0, 0, W_ - W, 0, H_ - H], mode=self.pad_mode)
#
#         x = window_partition(x, window_size=self.window_size)  # nW*B, window_size, window_size, C
#         x = x.view(-1, N_, C)
#
#         qkv = self.qkv(x).view(-1, N_, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
#         q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)
#         attn = (q @ k.transpose(-2, -1)) * self.scale  # [B, L, num_head, N_, N_]
#         attn = attn.softmax(dim=-1)
#         attn = self.attn_drop(attn)  # [B, L, num_head, N_, N_]
#         x = (attn @ v).transpose(1, 2).reshape(-1, self.window_size, self.window_size, C)
#
#         x = window_reverse(x, self.window_size, H_, W_)
#         x = x[:, :H, :W, :].reshape(B, N, C).contiguous()
#         x = self.proj(x)
#         x = self.proj_drop(x)
#         return x
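
# Illustrative sketch (not part of the original module): WindowedAttention pads the
# feature map up to multiples of window_size and then attends within each
# window_size x window_size tile via F.unfold / F.fold. The hypothetical helper below
# only documents the padded size and window count implied by that scheme.
def _window_grid(H: int, W: int, window_size: int = 14) -> tuple:
    """Hypothetical helper: (padded H, padded W, number of non-overlapping windows)."""
    H_pad = math.ceil(H / window_size) * window_size
    W_pad = math.ceil(W / window_size) * window_size
    return H_pad, W_pad, (H_pad // window_size) * (W_pad // window_size)
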
class Block(nn.Module):
    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.0,
        qkv_bias=False,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
        windowed=False,
        window_size=14,
        pad_mode="constant",
        layer_scale=False,
        with_cp=False,
        ffn_layer=Mlp,
        memeff=False,
    ):
        super().__init__()
        self.with_cp = with_cp
        self.norm1 = norm_layer(dim)
        if windowed:
            self.attn = WindowedAttention(
                dim,
                num_heads=num_heads,
                qkv_bias=qkv_bias,
                attn_drop=attn_drop,
                proj_drop=drop,
                window_size=window_size,
                pad_mode=pad_mode,
            )
        elif memeff:
            self.attn = MemEffAttention(
                dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop
            )
        else:
            self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = ffn_layer(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
        self.layer_scale = layer_scale
        if layer_scale:
            self.gamma1 = nn.Parameter(torch.ones(dim), requires_grad=True)
            self.gamma2 = nn.Parameter(torch.ones(dim), requires_grad=True)

    def forward(self, x, H, W):
        def _inner_forward(x):
            if self.layer_scale:
                x = x + self.drop_path(self.gamma1 * self.attn(self.norm1(x), H, W))
                x = x + self.drop_path(self.gamma2 * self.mlp(self.norm2(x)))
            else:
                x = x + self.drop_path(self.attn(self.norm1(x), H, W))
                x = x + self.drop_path(self.mlp(self.norm2(x)))
            return x

        if self.with_cp and x.requires_grad:
            x = cp.checkpoint(_inner_forward, x)
        else:
            x = _inner_forward(x)
        return x
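
# Illustrative sketch (not part of the original module): minimal usage of the pre-norm
# Block above on a 14x14 token grid, assuming embed_dim=768 and 12 heads purely for
# demonstration. The wrapper name `_block_usage_example` is hypothetical and unused elsewhere.
def _block_usage_example() -> torch.Size:
    """Hypothetical demo: run one transformer block over a (1, 196, 768) token sequence."""
    blk = Block(dim=768, num_heads=12, mlp_ratio=4.0, qkv_bias=True, layer_scale=True)
    tokens = torch.randn(1, 14 * 14, 768)  # B, N, C with N = H * W
    return blk(tokens, H=14, W=14).shape  # torch.Size([1, 196, 768])
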
class TIMMVisionTransformer(BaseModule):
    """Vision Transformer.

    A PyTorch impl of: `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
        - https://arxiv.org/abs/2010.11929

    Includes distillation token & head support for `DeiT: Data-efficient Image Transformers`
        - https://arxiv.org/abs/2012.12877
    """

    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        num_classes=1000,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias=True,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        layer_scale=True,
        embed_layer=PatchEmbed,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        act_layer=nn.GELU,
        window_attn=False,
        window_size=14,
        pretrained=None,
        with_cp=False,
        pre_norm=False,
        ffn_type="mlp",
        memeff=False,
    ):
        """
        Args:
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_chans (int): number of input channels
            num_classes (int): number of classes for classification head
            embed_dim (int): embedding dimension
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (float): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            drop_rate (float): dropout rate
            attn_drop_rate (float): attention dropout rate
            drop_path_rate (float): stochastic depth rate
            embed_layer (nn.Module): patch embedding layer
            norm_layer (nn.Module): normalization layer
            pretrained (str): path to pretrained weights
        """
        super().__init__()
        self.num_classes = num_classes
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.num_tokens = 1
        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
        act_layer = act_layer or nn.GELU
        self.norm_layer = norm_layer
        self.act_layer = act_layer
        self.pretrain_size = img_size
        self.drop_path_rate = drop_path_rate
        self.drop_rate = drop_rate
        self.patch_size = patch_size

        window_attn = [window_attn] * depth if not isinstance(window_attn, list) else window_attn
        window_size = [window_size] * depth if not isinstance(window_size, list) else window_size
        logging.info("window attention: %s", window_attn)
        logging.info("window size: %s", window_size)
        logging.info("layer scale: %s", layer_scale)

        self.patch_embed = embed_layer(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, bias=not pre_norm
        )
        num_patches = self.patch_embed.num_patches

        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_rate)

        ffn_types = {"mlp": Mlp, "swiglu": SwiGLUFFN}

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
        self.blocks = nn.Sequential(
            *[
                Block(
                    dim=embed_dim,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[i],
                    norm_layer=norm_layer,
                    act_layer=act_layer,
                    windowed=window_attn[i],
                    window_size=window_size[i],
                    layer_scale=layer_scale,
                    with_cp=with_cp,
                    ffn_layer=ffn_types[ffn_type],
                    memeff=memeff,
                )
                for i in range(depth)
            ]
        )

        self.norm = norm_layer(embed_dim)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))

        # For CLIP
        if pre_norm:
            norm_pre = norm_layer(embed_dim)
            self.norm_pre = norm_pre
        else:
            self.norm_pre = nn.Identity()
        self.init_weights(pretrained)

    def init_weights(self, pretrained=None):
        if isinstance(pretrained, str):
            logger = get_root_logger()
            load_checkpoint(self, pretrained, map_location="cpu", strict=False, logger=logger)

    def forward_features(self, x):
        x, H, W = self.patch_embed(x)
        cls_token = self.cls_token.expand(x.shape[0], -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
        x = torch.cat((cls_token, x), dim=1)
        x = self.pos_drop(x + self.pos_embed)

        # For CLIP
        x = self.norm_pre(x)

        for blk in self.blocks:
            x = blk(x, H, W)
        x = self.norm(x)
        return x

    def forward(self, x):
        x = self.forward_features(x)
        return x

    @staticmethod
    def resize_pos_embed(pos_embed, input_shape, pos_shape, mode):
        """Resize pos_embed weights.

        Resize pos_embed using bicubic interpolate method.

        Args:
            pos_embed (torch.Tensor): Position embedding weights.
            input_shape (tuple): Tuple for (downsampled input image height, downsampled input image width).
            pos_shape (tuple): The resolution of the downsampled origin training image.
            mode (str): Algorithm used for upsampling:
                ``'nearest'`` | ``'linear'`` | ``'bilinear'`` | ``'bicubic'`` | ``'trilinear'``.
                Default: ``'nearest'``.

        Return:
            torch.Tensor: The resized pos_embed of shape [B, L_new, C].
        """
        assert pos_embed.ndim == 3, "shape of pos_embed must be [B, L, C]"
        pos_h, pos_w = pos_shape
        # keep dim for easy deployment
        cls_token_weight = pos_embed[:, 0:1]
        pos_embed_weight = pos_embed[:, (-1 * pos_h * pos_w):]
        pos_embed_weight = pos_embed_weight.reshape(1, pos_h, pos_w, pos_embed.shape[2]).permute(0, 3, 1, 2)
        pos_embed_weight = resize(pos_embed_weight, size=input_shape, align_corners=False, mode=mode)
        pos_embed_weight = torch.flatten(pos_embed_weight, 2).transpose(1, 2)
        pos_embed = torch.cat((cls_token_weight, pos_embed_weight), dim=1)
        return pos_embed
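
# Illustrative sketch (not part of the original module): a minimal smoke test, assuming
# torch, mmcv, and mmseg are installed (they are already required by the imports above).
# The tiny configuration (embed_dim=192, depth=2) and the helper name `_smoke_test` are
# hypothetical, chosen only to keep the example fast; they do not correspond to any
# released checkpoint.
def _smoke_test() -> torch.Size:
    """Hypothetical demo: build a small TIMMVisionTransformer and run one forward pass."""
    model = TIMMVisionTransformer(
        img_size=224,
        patch_size=16,
        embed_dim=192,
        depth=2,
        num_heads=3,
        drop_path_rate=0.1,
        layer_scale=True,
    )
    model.eval()
    with torch.no_grad():
        out = model(torch.randn(1, 3, 224, 224))
    # Expected token sequence: 1 cls token + (224 // 16) ** 2 patch tokens = 197.
    return out.shape  # torch.Size([1, 197, 192])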