Spaces:
Sleeping
Sleeping
File size: 7,502 Bytes
3d85088 62ef5f4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 |
import torch
import torch.nn as nn
import torch.nn.functional as F
from timm.models.vision_transformer import _load_weights
from timm.models.layers import trunc_normal_
from typing import List
from src.models.vit.utils import init_weights, resize_pos_embed
from src.models.vit.blocks import Block
from src.models.vit.decoder import DecoderLinear
class PatchEmbedding(nn.Module):
def __init__(self, image_size, patch_size, embed_dim, channels):
super().__init__()
self.image_size = image_size
if image_size[0] % patch_size != 0 or image_size[1] % patch_size != 0:
raise ValueError("image dimensions must be divisible by the patch size")
self.grid_size = image_size[0] // patch_size, image_size[1] // patch_size
self.num_patches = self.grid_size[0] * self.grid_size[1]
self.patch_size = patch_size
self.proj = nn.Conv2d(channels, embed_dim, kernel_size=patch_size, stride=patch_size)
def forward(self, im):
B, C, H, W = im.shape
x = self.proj(im).flatten(2).transpose(1, 2)
return x
class VisionTransformer(nn.Module):
def __init__(
self,
image_size,
patch_size,
n_layers,
d_model,
d_ff,
n_heads,
n_cls,
dropout=0.1,
drop_path_rate=0.0,
distilled=False,
channels=3,
):
super().__init__()
self.patch_embed = PatchEmbedding(
image_size,
patch_size,
d_model,
channels,
)
self.patch_size = patch_size
self.n_layers = n_layers
self.d_model = d_model
self.d_ff = d_ff
self.n_heads = n_heads
self.dropout = nn.Dropout(dropout)
self.n_cls = n_cls
# cls and pos tokens
self.cls_token = nn.Parameter(torch.zeros(1, 1, d_model))
self.distilled = distilled
if self.distilled:
self.dist_token = nn.Parameter(torch.zeros(1, 1, d_model))
self.pos_embed = nn.Parameter(torch.randn(1, self.patch_embed.num_patches + 2, d_model))
self.head_dist = nn.Linear(d_model, n_cls)
else:
self.pos_embed = nn.Parameter(torch.randn(1, self.patch_embed.num_patches + 1, d_model))
# transformer blocks
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, n_layers)]
self.blocks = nn.ModuleList([Block(d_model, n_heads, d_ff, dropout, dpr[i]) for i in range(n_layers)])
# output head
self.norm = nn.LayerNorm(d_model)
self.head = nn.Linear(d_model, n_cls)
trunc_normal_(self.pos_embed, std=0.02)
trunc_normal_(self.cls_token, std=0.02)
if self.distilled:
trunc_normal_(self.dist_token, std=0.02)
self.pre_logits = nn.Identity()
self.apply(init_weights)
@torch.jit.ignore
def no_weight_decay(self):
return {"pos_embed", "cls_token", "dist_token"}
@torch.jit.ignore()
def load_pretrained(self, checkpoint_path, prefix=""):
_load_weights(self, checkpoint_path, prefix)
def forward(self, im, head_out_idx: List[int], n_dim_output=3, return_features=False):
B, _, H, W = im.shape
PS = self.patch_size
assert n_dim_output == 3 or n_dim_output == 4, "n_dim_output must be 3 or 4"
x = self.patch_embed(im)
cls_tokens = self.cls_token.expand(B, -1, -1)
if self.distilled:
dist_tokens = self.dist_token.expand(B, -1, -1)
x = torch.cat((cls_tokens, dist_tokens, x), dim=1)
else:
x = torch.cat((cls_tokens, x), dim=1)
pos_embed = self.pos_embed
num_extra_tokens = 1 + self.distilled
if x.shape[1] != pos_embed.shape[1]:
pos_embed = resize_pos_embed(
pos_embed,
self.patch_embed.grid_size,
(H // PS, W // PS),
num_extra_tokens,
)
x = x + pos_embed
x = self.dropout(x)
device = x.device
if n_dim_output == 3:
heads_out = torch.zeros(size=(len(head_out_idx), B, (H // PS) ** 2 + 1, self.d_model)).to(device)
else:
heads_out = torch.zeros(size=(len(head_out_idx), B, self.d_model, H // PS, H // PS)).to(device)
self.register_buffer("heads_out", heads_out)
head_idx = 0
for idx_layer, blk in enumerate(self.blocks):
x = blk(x)
if idx_layer in head_out_idx:
if n_dim_output == 3:
heads_out[head_idx] = x
else:
heads_out[head_idx] = x[:, 1:, :].reshape((-1, 24, 24, self.d_model)).permute(0, 3, 1, 2)
head_idx += 1
x = self.norm(x)
if return_features:
return heads_out
if self.distilled:
x, x_dist = x[:, 0], x[:, 1]
x = self.head(x)
x_dist = self.head_dist(x_dist)
x = (x + x_dist) / 2
else:
x = x[:, 0]
x = self.head(x)
return x
def get_attention_map(self, im, layer_id):
if layer_id >= self.n_layers or layer_id < 0:
raise ValueError(f"Provided layer_id: {layer_id} is not valid. 0 <= {layer_id} < {self.n_layers}.")
B, _, H, W = im.shape
PS = self.patch_size
x = self.patch_embed(im)
cls_tokens = self.cls_token.expand(B, -1, -1)
if self.distilled:
dist_tokens = self.dist_token.expand(B, -1, -1)
x = torch.cat((cls_tokens, dist_tokens, x), dim=1)
else:
x = torch.cat((cls_tokens, x), dim=1)
pos_embed = self.pos_embed
num_extra_tokens = 1 + self.distilled
if x.shape[1] != pos_embed.shape[1]:
pos_embed = resize_pos_embed(
pos_embed,
self.patch_embed.grid_size,
(H // PS, W // PS),
num_extra_tokens,
)
x = x + pos_embed
for i, blk in enumerate(self.blocks):
if i < layer_id:
x = blk(x)
else:
return blk(x, return_attention=True)
class FeatureTransform(nn.Module):
def __init__(self, img_size, d_encoder, nls_list=[128, 256, 512, 512], scale_factor_list=[8, 4, 2, 1]):
super(FeatureTransform, self).__init__()
self.img_size = img_size
self.decoder_0 = DecoderLinear(n_cls=nls_list[0], d_encoder=d_encoder, scale_factor=scale_factor_list[0])
self.decoder_1 = DecoderLinear(n_cls=nls_list[1], d_encoder=d_encoder, scale_factor=scale_factor_list[1])
self.decoder_2 = DecoderLinear(n_cls=nls_list[2], d_encoder=d_encoder, scale_factor=scale_factor_list[2])
self.decoder_3 = DecoderLinear(n_cls=nls_list[3], d_encoder=d_encoder, scale_factor=scale_factor_list[3])
def forward(self, x_list):
feat_3 = self.decoder_3(x_list[3][:, 1:, :], self.img_size) # (2, 512, 24, 24)
feat_2 = self.decoder_2(x_list[2][:, 1:, :], self.img_size) # (2, 512, 48, 48)
feat_1 = self.decoder_1(x_list[1][:, 1:, :], self.img_size) # (2, 256, 96, 96)
feat_0 = self.decoder_0(x_list[0][:, 1:, :], self.img_size) # (2, 128, 192, 192)
return feat_0, feat_1, feat_2, feat_3 |