cymic committed on
Commit
36f5048
·
1 Parent(s): 3ea2698

Upload 2 files

Browse files
modules/codeformer/codeformer_arch.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # this file is copied from CodeFormer repository. Please see comment in modules/codeformer_model.py
2
+
3
+ import math
4
+ import numpy as np
5
+ import torch
6
+ from torch import nn, Tensor
7
+ import torch.nn.functional as F
8
+ from typing import Optional, List
9
+
10
+ from modules.codeformer.vqgan_arch import *
11
+ from basicsr.utils import get_root_logger
12
+ from basicsr.utils.registry import ARCH_REGISTRY
13
+
14
def calc_mean_std(feat, eps=1e-5):
    """Calculate mean and std for adaptive_instance_normalization.

    Statistics are computed per sample and per channel over the flattened
    trailing (spatial) dimensions.

    Args:
        feat (Tensor): 4D tensor.
        eps (float): A small value added to the variance to avoid
            divide-by-zero. Default: 1e-5.

    Returns:
        tuple[Tensor, Tensor]: ``(mean, std)``, each of shape ``(b, c, 1, 1)``
        where ``b, c`` are the first two sizes of ``feat``.
    """
    size = feat.size()
    assert len(size) == 4, 'The input feature should be 4D tensor.'
    b, c = size[:2]
    # reshape (rather than view) so non-contiguous inputs, e.g. permuted
    # tensors, are accepted; flatten once and reuse for both statistics.
    flat = feat.reshape(b, c, -1)
    feat_std = (flat.var(dim=2) + eps).sqrt().reshape(b, c, 1, 1)
    feat_mean = flat.mean(dim=2).reshape(b, c, 1, 1)
    return feat_mean, feat_std
29
+
30
+
31
def adaptive_instance_normalization(content_feat, style_feat):
    """Adaptive instance normalization.

    Re-scales ``content_feat`` so that its per-channel mean and std match
    those of ``style_feat`` (statistics come from ``calc_mean_std``).

    Args:
        content_feat (Tensor): The reference feature.
        style_feat (Tensor): The degraded features.

    Returns:
        Tensor: The re-normalized content feature, same size as
        ``content_feat``.
    """
    target_size = content_feat.size()
    style_mean, style_std = calc_mean_std(style_feat)
    content_mean, content_std = calc_mean_std(content_feat)

    # Remove the content statistics ...
    centered = content_feat - content_mean.expand(target_size)
    whitened = centered / content_std.expand(target_size)
    # ... then apply the style statistics.
    rescaled = whitened * style_std.expand(target_size)
    return rescaled + style_mean.expand(target_size)
46
+
47
+
48
class PositionEmbeddingSine(nn.Module):
    """Sinusoidal position embedding for image-like inputs.

    This is a more standard version of the position embedding, very similar
    to the one used by the Attention is all you need paper, generalized to
    work on images.

    Args:
        num_pos_feats (int): Number of channels produced per axis; the output
            has ``2 * num_pos_feats`` channels (y part followed by x part).
        temperature (float): Base of the geometric frequency progression.
        normalize (bool): If True, positions are divided by the last
            cumulative position along each axis and multiplied by ``scale``.
        scale (float | None): Scale used when ``normalize`` is True.
            Defaults to ``2 * math.pi``.

    Raises:
        ValueError: If ``scale`` is given while ``normalize`` is False.
    """

    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        self.scale = 2 * math.pi if scale is None else scale

    @staticmethod
    def _interleave_sin_cos(angles):
        """Apply sin to even channels and cos to odd channels, interleaved."""
        sin_part = angles[:, :, :, 0::2].sin()
        cos_part = angles[:, :, :, 1::2].cos()
        return torch.stack((sin_part, cos_part), dim=4).flatten(3)

    def forward(self, x, mask=None):
        """Return the position embedding for ``x``.

        Args:
            x (Tensor): 4D input; only its sizes at dims 0, 2, 3 and its
                device are used.
            mask (Tensor | None): Boolean tensor of shape
                ``(x.size(0), x.size(2), x.size(3))``; True entries are
                excluded from the cumulative position count. Defaults to
                all False.

        Returns:
            Tensor: Embedding of shape
            ``(x.size(0), 2 * num_pos_feats, x.size(2), x.size(3))``.
        """
        if mask is None:
            mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool)
        valid = ~mask
        # 1-based running position along rows (dim 1) and columns (dim 2).
        row_pos = valid.cumsum(1, dtype=torch.float32)
        col_pos = valid.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            row_pos = row_pos / (row_pos[:, -1:, :] + eps) * self.scale
            col_pos = col_pos / (col_pos[:, :, -1:] + eps) * self.scale

        channel_idx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        # Consecutive channel pairs share one frequency.
        freq = self.temperature ** (2 * (channel_idx // 2) / self.num_pos_feats)

        emb_x = self._interleave_sin_cos(col_pos[:, :, :, None] / freq)
        emb_y = self._interleave_sin_cos(row_pos[:, :, :, None] / freq)
        # (B, H, W, 2F) -> (B, 2F, H, W), y channels first.
        return torch.cat((emb_y, emb_x), dim=3).permute(0, 3, 1, 2)
89
+
90
+ def _get_activation_fn(activation):
91
+ """Return an activation function given a string"""
92
+ if activation == "relu":
93
+ return F.relu
94
+ if activation == "gelu":
95
+ return F.gelu
96
+ if activation == "glu":
97
+ return F.glu
98
+ raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
99
+
100
+
101
class TransformerSALayer(nn.Module):
    """Self-attention transformer layer with pre-normalization.

    Each of the two sub-layers (multi-head self-attention, then a two-layer
    MLP) normalizes its input with LayerNorm and adds its output back to the
    un-normalized input.

    Args:
        embed_dim (int): Feature size of the tokens.
        nhead (int): Number of attention heads.
        dim_mlp (int): Hidden size of the MLP.
        dropout (float): Dropout probability used in attention and MLP.
        activation (str): Activation name resolved by ``_get_activation_fn``.
    """

    def __init__(self, embed_dim, nhead=8, dim_mlp=2048, dropout=0.0, activation="gelu"):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(embed_dim, nhead, dropout=dropout)

        # Feed-forward (MLP) sub-layer.
        self.linear1 = nn.Linear(embed_dim, dim_mlp)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_mlp, embed_dim)

        # One LayerNorm / residual dropout per sub-layer.
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Return ``tensor + pos``, or ``tensor`` unchanged when ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward(self, tgt,
                tgt_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        """Run the attention and MLP sub-layers on ``tgt``.

        Args:
            tgt (Tensor): Input tokens.
            tgt_mask (Tensor | None): Forwarded as ``attn_mask``.
            tgt_key_padding_mask (Tensor | None): Forwarded as
                ``key_padding_mask``.
            query_pos (Tensor | None): Position embedding added to the
                queries and keys (not to the values).

        Returns:
            Tensor: Updated tokens.
        """
        # self attention
        normed = self.norm1(tgt)
        queries = keys = self.with_pos_embed(normed, query_pos)
        attended = self.self_attn(
            queries,
            keys,
            value=normed,
            attn_mask=tgt_mask,
            key_padding_mask=tgt_key_padding_mask,
        )[0]
        tgt = tgt + self.dropout1(attended)

        # ffn
        hidden = self.linear1(self.norm2(tgt))
        hidden = self.dropout(self.activation(hidden))
        tgt = tgt + self.dropout2(self.linear2(hidden))
        return tgt
137
+
138
class Fuse_sft_block(nn.Module):
    """Fuse encoder features into decoder features with a scale/shift residual.

    The concatenated encoder and decoder features go through a ``ResBlock``;
    two small conv stacks then predict a per-element scale and shift that are
    applied to the decoder feature and added back as a weighted residual.

    NOTE(review): the scale/shift stacks take ``in_ch`` input channels but are
    fed the ``out_ch``-channel output of ``encode_enc``, so this appears to
    assume ``in_ch == out_ch`` - confirm against callers.

    Args:
        in_ch (int): Channels of each of the encoder and decoder features.
        out_ch (int): Channels of the predicted scale and shift.
    """

    def __init__(self, in_ch, out_ch):
        super().__init__()
        self.encode_enc = ResBlock(2*in_ch, out_ch)
        self.scale = self._modulation_branch(in_ch, out_ch)
        self.shift = self._modulation_branch(in_ch, out_ch)

    @staticmethod
    def _modulation_branch(in_ch, out_ch):
        """Build the conv -> LeakyReLU -> conv stack used for scale and shift."""
        return nn.Sequential(
            nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
            nn.LeakyReLU(0.2, True),
            nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1))

    def forward(self, enc_feat, dec_feat, w=1):
        """Return ``dec_feat + w * (dec_feat * scale + shift)``.

        Args:
            enc_feat (Tensor): Encoder feature, concatenated with ``dec_feat``
                along dim 1.
            dec_feat (Tensor): Decoder feature to be modulated.
            w (float): Weight of the fused residual.
        """
        fused = self.encode_enc(torch.cat([enc_feat, dec_feat], dim=1))
        gamma = self.scale(fused)
        beta = self.shift(fused)
        modulated = dec_feat * gamma + beta
        return dec_feat + w * modulated
160
+
161
+
162
@ARCH_REGISTRY.register()
class CodeFormer(VQAutoEncoder):
    """VQAutoEncoder extended with a transformer that predicts codebook indices.

    The encoder output is embedded, passed through ``n_layers``
    ``TransformerSALayer`` blocks and projected to per-token logits over the
    codebook. The top-1 code of every token is looked up in the quantizer's
    codebook and decoded by the generator, optionally fusing encoder features
    into the generator through ``Fuse_sft_block`` modules.

    Args:
        dim_embd (int): Token feature size inside the transformer.
        n_head (int): Number of attention heads per transformer layer.
        n_layers (int): Number of transformer layers.
        codebook_size (int): Number of codebook entries (size of the logits).
        latent_size (int): Number of learned position embeddings.
        connect_list (Sequence[str]): Feature sizes (keys of ``self.channels``)
            for which a fusion block is created and applied.
        fix_modules (Sequence[str] | None): Attribute names of sub-modules
            whose parameters get ``requires_grad = False``; None freezes
            nothing.
    """

    def __init__(self, dim_embd=512, n_head=8, n_layers=9,
                 codebook_size=1024, latent_size=256,
                 connect_list=('32', '64', '128', '256'),
                 fix_modules=('quantize', 'generator')):
        super(CodeFormer, self).__init__(512, 64, [1, 2, 2, 4, 4, 8], 'nearest',2, [16], codebook_size)

        if fix_modules is not None:
            for module in fix_modules:
                for param in getattr(self, module).parameters():
                    param.requires_grad = False

        # Copy into a fresh list so instances never share (or alias) the
        # default argument object.
        self.connect_list = list(connect_list)
        self.n_layers = n_layers
        self.dim_embd = dim_embd
        self.dim_mlp = dim_embd*2

        self.position_emb = nn.Parameter(torch.zeros(latent_size, self.dim_embd))
        # 256 input features: the default emb_dim of VQAutoEncoder, which the
        # super().__init__ call above does not override.
        self.feat_emb = nn.Linear(256, self.dim_embd)

        # transformer
        self.ft_layers = nn.Sequential(*[TransformerSALayer(embed_dim=dim_embd, nhead=n_head, dim_mlp=self.dim_mlp, dropout=0.0)
                                    for _ in range(self.n_layers)])

        # logits_predict head
        self.idx_pred_layer = nn.Sequential(
            nn.LayerNorm(dim_embd),
            nn.Linear(dim_embd, codebook_size, bias=False))

        # channels of the features at each feature size
        self.channels = {
            '16': 512,
            '32': 256,
            '64': 256,
            '128': 128,
            '256': 128,
            '512': 64,
        }

        # after second residual block for > 16, before attn layer for ==16
        self.fuse_encoder_block = {'512':2, '256':5, '128':8, '64':11, '32':14, '16':18}
        # after first residual block for > 16, before attn layer for ==16
        self.fuse_generator_block = {'16':6, '32': 9, '64':12, '128':15, '256':18, '512':21}

        # fuse_convs_dict
        self.fuse_convs_dict = nn.ModuleDict()
        for f_size in self.connect_list:
            in_ch = self.channels[f_size]
            self.fuse_convs_dict[f_size] = Fuse_sft_block(in_ch, in_ch)

    def _init_weights(self, module):
        """Initialize one module: normal(0, 0.02) weights for Linear/Embedding,
        zero biases, and unit weight / zero bias for LayerNorm."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, x, w=0, detach_16=True, code_only=False, adain=False):
        """Encode ``x``, predict codebook indices and decode them.

        Args:
            x (Tensor): Input batch fed to the encoder.
            w (float): Fusion weight; encoder features are fused into the
                generator only when ``w > 0``.
            detach_16 (bool): Detach the quantized feature before decoding.
            code_only (bool): Return right after the logits are computed.
            adain (bool): Apply ``adaptive_instance_normalization`` to the
                quantized feature using the encoder output statistics.

        Returns:
            tuple: ``(logits, lq_feat)`` when ``code_only`` is True, otherwise
            ``(out, logits, lq_feat)``.
        """
        # ################### Encoder #####################
        enc_feat_dict = {}
        capture_at = {self.fuse_encoder_block[f_size] for f_size in self.connect_list}
        for i, block in enumerate(self.encoder.blocks):
            x = block(x)
            if i in capture_at:
                # keyed by the feature's last-dimension size
                enc_feat_dict[str(x.shape[-1])] = x.clone()

        lq_feat = x
        # ################# Transformer ###################
        pos_emb = self.position_emb.unsqueeze(1).repeat(1,x.shape[0],1)
        # BCHW -> BC(HW) -> (HW)BC
        feat_emb = self.feat_emb(lq_feat.flatten(2).permute(2,0,1))
        query_emb = feat_emb
        # Transformer encoder
        for layer in self.ft_layers:
            query_emb = layer(query_emb, query_pos=pos_emb)

        # output logits
        logits = self.idx_pred_layer(query_emb) # (hw)bn
        logits = logits.permute(1,0,2) # (hw)bn -> b(hw)n

        if code_only: # for training stage II
            # logits doesn't need softmax before cross_entropy loss
            return logits, lq_feat

        # ################# Quantization ###################
        soft_one_hot = F.softmax(logits, dim=2)
        _, top_idx = torch.topk(soft_one_hot, 1, dim=2)
        # Reshape target follows the encoder output (B, C, H, W) instead of a
        # hard-coded [B, 16, 16, 256]; identical for the default configuration.
        batch, channels, height, width = lq_feat.shape
        quant_feat = self.quantize.get_codebook_feat(top_idx, shape=[batch, height, width, channels])

        if detach_16:
            quant_feat = quant_feat.detach() # for training stage III
        if adain:
            quant_feat = adaptive_instance_normalization(quant_feat, lq_feat)

        # ################## Generator ####################
        x = quant_feat
        fuse_at = {self.fuse_generator_block[f_size] for f_size in self.connect_list}

        for i, block in enumerate(self.generator.blocks):
            x = block(x)
            if i in fuse_at: # fuse after i-th block
                f_size = str(x.shape[-1])
                if w>0:
                    x = self.fuse_convs_dict[f_size](enc_feat_dict[f_size].detach(), x, w)
        out = x
        # logits doesn't need softmax before cross_entropy loss
        return out, logits, lq_feat
modules/codeformer/vqgan_arch.py ADDED
@@ -0,0 +1,437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # this file is copied from CodeFormer repository. Please see comment in modules/codeformer_model.py
2
+
3
+ '''
4
+ VQGAN code, adapted from the original created by the Unleashing Transformers authors:
5
+ https://github.com/samb-t/unleashing-transformers/blob/master/models/vqgan.py
6
+
7
+ '''
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ import copy
13
+ from basicsr.utils import get_root_logger
14
+ from basicsr.utils.registry import ARCH_REGISTRY
15
+
16
def normalize(in_channels):
    """Build a 32-group affine ``GroupNorm`` (eps=1e-6) over ``in_channels`` channels."""
    group_norm = torch.nn.GroupNorm(
        num_groups=32,
        num_channels=in_channels,
        eps=1e-6,
        affine=True,
    )
    return group_norm
18
+
19
+
20
@torch.jit.script
def swish(x):
    """Swish activation: the input multiplied element-wise by its sigmoid."""
    gate = torch.sigmoid(x)
    return x * gate
23
+
24
+
25
+ # Define VQVAE classes
26
class VectorQuantizer(nn.Module):
    """Nearest-neighbour vector quantizer with a learned codebook.

    Args:
        codebook_size (int): Number of embeddings in the codebook.
        emb_dim (int): Dimension of each embedding.
        beta (float): Commitment cost used in the loss term,
            beta * ||z_e(x)-sg[e]||^2.
    """

    def __init__(self, codebook_size, emb_dim, beta):
        super(VectorQuantizer, self).__init__()
        self.codebook_size = codebook_size  # number of embeddings
        self.emb_dim = emb_dim  # dimension of embedding
        self.beta = beta  # commitment cost used in loss term, beta * ||z_e(x)-sg[e]||^2
        self.embedding = nn.Embedding(self.codebook_size, self.emb_dim)
        self.embedding.weight.data.uniform_(-1.0 / self.codebook_size, 1.0 / self.codebook_size)

    def forward(self, z):
        """Quantize ``z`` to its nearest codebook entries.

        Args:
            z (Tensor): Feature of shape (batch, channel, height, width) with
                ``channel == emb_dim``.

        Returns:
            tuple: ``(z_q, loss, stats)`` where ``z_q`` has the shape of the
            input (with straight-through gradients), ``loss`` is the
            codebook + commitment loss, and ``stats`` is a dict with keys
            "perplexity", "min_encodings", "min_encoding_indices",
            "min_encoding_scores" and "mean_distance".
        """
        # reshape z -> (batch, height, width, channel) and flatten
        z = z.permute(0, 2, 3, 1).contiguous()
        z_flattened = z.view(-1, self.emb_dim)

        # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
        d = (z_flattened ** 2).sum(dim=1, keepdim=True) + (self.embedding.weight**2).sum(1) - \
            2 * torch.matmul(z_flattened, self.embedding.weight.t())

        mean_distance = torch.mean(d)
        # find closest encodings
        min_encoding_scores, min_encoding_indices = torch.topk(d, 1, dim=1, largest=False)
        # [0-1], higher score, higher confidence
        min_encoding_scores = torch.exp(-min_encoding_scores/10)

        # one-hot assignment matrix; also returned to the caller in stats
        min_encodings = torch.zeros(min_encoding_indices.shape[0], self.codebook_size).to(z)
        min_encodings.scatter_(1, min_encoding_indices, 1)

        # get quantized latent vectors
        z_q = torch.matmul(min_encodings, self.embedding.weight).view(z.shape)
        # compute loss for embedding
        loss = torch.mean((z_q.detach()-z)**2) + self.beta * torch.mean((z_q - z.detach()) ** 2)
        # preserve gradients
        z_q = z + (z_q - z).detach()

        # perplexity
        e_mean = torch.mean(min_encodings, dim=0)
        perplexity = torch.exp(-torch.sum(e_mean * torch.log(e_mean + 1e-10)))
        # reshape back to match original input shape
        z_q = z_q.permute(0, 3, 1, 2).contiguous()

        return z_q, loss, {
            "perplexity": perplexity,
            "min_encodings": min_encodings,
            "min_encoding_indices": min_encoding_indices,
            "min_encoding_scores": min_encoding_scores,
            "mean_distance": mean_distance
            }

    def get_codebook_feat(self, indices, shape):
        """Look up codebook vectors for ``indices``.

        Args:
            indices (Tensor): Integer code indices; flattened before lookup.
            shape (Sequence[int] | None): Target (batch, height, width,
                channel) shape. When given, the result is reshaped to it and
                permuted to (batch, channel, height, width); when None the
                flat (num_indices, emb_dim) matrix is returned.

        Returns:
            Tensor: The selected codebook vectors.
        """
        # Direct embedding lookup: selects the same rows as multiplying a
        # one-hot matrix with the codebook, without materializing a
        # (num_indices x codebook_size) matrix.
        z_q = self.embedding(indices.reshape(-1))

        if shape is not None:  # reshape back to match original input shape
            z_q = z_q.view(shape).permute(0, 3, 1, 2).contiguous()

        return z_q
88
+
89
+
90
class GumbelQuantizer(nn.Module):
    """Quantizer that samples codebook entries with Gumbel-softmax.

    Args:
        codebook_size (int): Number of embeddings.
        emb_dim (int): Dimension of each embedding.
        num_hiddens (int): Channels of the feature that is projected to
            codebook logits.
        straight_through (bool): Whether hard samples are used in training
            mode (hard samples are always used outside training mode).
        kl_weight (float): Weight of the KL-to-uniform-prior term.
        temp_init (float): Gumbel-softmax temperature.
    """

    def __init__(self, codebook_size, emb_dim, num_hiddens, straight_through=False, kl_weight=5e-4, temp_init=1.0):
        super().__init__()
        self.codebook_size = codebook_size  # number of embeddings
        self.emb_dim = emb_dim  # dimension of embedding
        self.straight_through = straight_through
        self.temperature = temp_init
        self.kl_weight = kl_weight
        # 1x1 conv: projects last encoder layer to quantized logits
        self.proj = nn.Conv2d(num_hiddens, codebook_size, 1)
        self.embed = nn.Embedding(codebook_size, emb_dim)

    def forward(self, z):
        """Sample a quantized feature for ``z``.

        Returns:
            tuple: ``(z_q, diff, stats)`` where ``z_q`` is the mix of
            codebook vectors, ``diff`` is the weighted KL term and ``stats``
            holds "min_encoding_indices" (argmax over the codebook axis).
        """
        if self.training:
            use_hard = self.straight_through
        else:
            use_hard = True

        logits = self.proj(z)
        samples = F.gumbel_softmax(logits, tau=self.temperature, dim=1, hard=use_hard)
        z_q = torch.einsum("b n h w, n d -> b d h w", samples, self.embed.weight)

        # + kl divergence to the prior loss
        probs = F.softmax(logits, dim=1)
        kl_per_position = torch.sum(probs * torch.log(probs * self.codebook_size + 1e-10), dim=1)
        diff = self.kl_weight * kl_per_position.mean()

        stats = {
            "min_encoding_indices": samples.argmax(dim=1)
        }
        return z_q, diff, stats
118
+
119
+
120
class Downsample(nn.Module):
    """Halve the spatial size with a stride-2 3x3 convolution.

    Args:
        in_channels (int): Number of input (and output) channels.
    """

    def __init__(self, in_channels):
        super().__init__()
        self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)

    def forward(self, x):
        """Zero-pad one column on the right and one row at the bottom, then convolve."""
        padded = torch.nn.functional.pad(x, (0, 1, 0, 1), mode="constant", value=0)
        return self.conv(padded)
130
+
131
+
132
class Upsample(nn.Module):
    """Double the spatial size by nearest-neighbour interpolation, then a 3x3 conv.

    Args:
        in_channels (int): Number of input (and output) channels.
    """

    def __init__(self, in_channels):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        """Return the convolved 2x-upsampled input."""
        enlarged = F.interpolate(x, scale_factor=2.0, mode="nearest")
        return self.conv(enlarged)
142
+
143
+
144
class ResBlock(nn.Module):
    """Residual block: two (norm -> swish -> 3x3 conv) stages plus a skip path.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int | None): Number of output channels; defaults to
            ``in_channels`` when None. When it differs from ``in_channels``
            the skip path goes through a 1x1 convolution.
    """

    def __init__(self, in_channels, out_channels=None):
        super(ResBlock, self).__init__()
        self.in_channels = in_channels
        self.out_channels = in_channels if out_channels is None else out_channels
        # Build every layer from the resolved self.out_channels so the
        # documented default (out_channels=None) actually works.
        self.norm1 = normalize(self.in_channels)
        self.conv1 = nn.Conv2d(self.in_channels, self.out_channels, kernel_size=3, stride=1, padding=1)
        self.norm2 = normalize(self.out_channels)
        self.conv2 = nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1, padding=1)
        if self.in_channels != self.out_channels:
            self.conv_out = nn.Conv2d(self.in_channels, self.out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x_in):
        """Return the residual branch output added to the (projected) input."""
        x = x_in
        x = self.norm1(x)
        x = swish(x)
        x = self.conv1(x)
        x = self.norm2(x)
        x = swish(x)
        x = self.conv2(x)
        if self.in_channels != self.out_channels:
            # match the channel count of the residual branch
            x_in = self.conv_out(x_in)

        return x + x_in
168
+
169
+
170
class AttnBlock(nn.Module):
    """Single-head self-attention over spatial positions with a residual add.

    Queries, keys, values and the output projection are all 1x1 convolutions
    that keep the channel count.

    Args:
        in_channels (int): Number of input (and output) channels.
    """

    def __init__(self, in_channels):
        super().__init__()
        self.in_channels = in_channels

        self.norm = normalize(in_channels)
        self.q = self._pointwise_conv(in_channels)
        self.k = self._pointwise_conv(in_channels)
        self.v = self._pointwise_conv(in_channels)
        self.proj_out = self._pointwise_conv(in_channels)

    @staticmethod
    def _pointwise_conv(channels):
        """Build a channel-preserving 1x1 convolution."""
        return torch.nn.Conv2d(
            channels,
            channels,
            kernel_size=1,
            stride=1,
            padding=0
        )

    def forward(self, x):
        """Return ``x`` plus the projected attention output."""
        normed = self.norm(x)
        query = self.q(normed)
        key = self.k(normed)
        value = self.v(normed)

        # compute attention
        b, c, h, w = query.shape
        n_tokens = h*w
        query = query.reshape(b, c, n_tokens).permute(0, 2, 1)  # b, hw, c
        key = key.reshape(b, c, n_tokens)  # b, c, hw
        scores = torch.bmm(query, key)  # b, hw(query), hw(key)
        scores = scores * (int(c)**(-0.5))
        weights = F.softmax(scores, dim=2)

        # attend to values
        value = value.reshape(b, c, n_tokens)
        weights = weights.permute(0, 2, 1)  # b, hw(key), hw(query)
        attended = torch.bmm(value, weights).reshape(b, c, h, w)

        return x+self.proj_out(attended)
230
+
231
+
232
class Encoder(nn.Module):
    """Convolutional encoder built as a flat list of blocks.

    The order of ``self.blocks`` is: an input conv; per resolution level,
    ``num_res_blocks`` ResBlocks (each followed by an AttnBlock when the
    current resolution is in ``attn_resolutions``) and a Downsample on every
    level except the last; a ResBlock / AttnBlock / ResBlock group; a norm
    layer; and a conv to ``emb_dim`` channels.

    Args:
        in_channels (int): Channels of the input.
        nf (int): Base channel count, multiplied by ``ch_mult`` per level.
        emb_dim (int): Channels of the output.
        ch_mult (Sequence[int]): Channel multiplier for each level.
        num_res_blocks (int): ResBlocks per level.
        resolution (int): Resolution used to track where attention applies.
        attn_resolutions (Sequence[int]): Resolutions that get attention.
    """

    def __init__(self, in_channels, nf, emb_dim, ch_mult, num_res_blocks, resolution, attn_resolutions):
        super().__init__()
        self.nf = nf
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.attn_resolutions = attn_resolutions

        curr_res = self.resolution
        # input multiplier of a level is the output multiplier of the previous one
        in_ch_mult = (1,)+tuple(ch_mult)

        # initial convolution
        layers = [nn.Conv2d(in_channels, nf, kernel_size=3, stride=1, padding=1)]

        # residual and downsampling blocks, with attention on smaller res (16x16)
        last_level = self.num_resolutions - 1
        for level in range(self.num_resolutions):
            block_in_ch = nf * in_ch_mult[level]
            block_out_ch = nf * ch_mult[level]
            for _ in range(self.num_res_blocks):
                layers.append(ResBlock(block_in_ch, block_out_ch))
                block_in_ch = block_out_ch
                if curr_res in attn_resolutions:
                    layers.append(AttnBlock(block_in_ch))

            if level != last_level:
                layers.append(Downsample(block_in_ch))
                curr_res = curr_res // 2

        # non-local attention block
        layers.extend([
            ResBlock(block_in_ch, block_in_ch),
            AttnBlock(block_in_ch),
            ResBlock(block_in_ch, block_in_ch),
        ])

        # normalise and convert to latent size
        layers.append(normalize(block_in_ch))
        layers.append(nn.Conv2d(block_in_ch, emb_dim, kernel_size=3, stride=1, padding=1))
        self.blocks = nn.ModuleList(layers)

    def forward(self, x):
        """Apply every block in order and return the result."""
        out = x
        for layer in self.blocks:
            out = layer(out)
        return out
277
+
278
+
279
class Generator(nn.Module):
    """Convolutional decoder built as a flat list of blocks.

    The order of ``self.blocks`` is: an input conv; a ResBlock / AttnBlock /
    ResBlock group; per level from the last ``ch_mult`` entry to the first,
    ``res_blocks`` ResBlocks (each followed by an AttnBlock when the current
    resolution is in ``attn_resolutions``) and an Upsample on every level
    except level 0; a norm layer; and a conv to 3 output channels.

    Args:
        nf (int): Base channel count, multiplied by ``ch_mult`` per level.
        emb_dim (int): Channels of the input.
        ch_mult (Sequence[int]): Channel multiplier for each level.
        res_blocks (int): ResBlocks per level.
        img_size (int): Full resolution; the starting resolution is
            ``img_size // 2 ** (len(ch_mult) - 1)``.
        attn_resolutions (Sequence[int]): Resolutions that get attention.
    """

    def __init__(self, nf, emb_dim, ch_mult, res_blocks, img_size, attn_resolutions):
        super().__init__()
        self.nf = nf
        self.ch_mult = ch_mult
        self.num_resolutions = len(self.ch_mult)
        self.num_res_blocks = res_blocks
        self.resolution = img_size
        self.attn_resolutions = attn_resolutions
        self.in_channels = emb_dim
        self.out_channels = 3
        block_in_ch = self.nf * self.ch_mult[-1]
        curr_res = self.resolution // 2 ** (self.num_resolutions-1)

        layers = [
            # initial conv
            nn.Conv2d(self.in_channels, block_in_ch, kernel_size=3, stride=1, padding=1),
            # non-local attention block
            ResBlock(block_in_ch, block_in_ch),
            AttnBlock(block_in_ch),
            ResBlock(block_in_ch, block_in_ch),
        ]

        # walk the levels from the last multiplier down to the first
        for level in range(self.num_resolutions - 1, -1, -1):
            block_out_ch = self.nf * self.ch_mult[level]

            for _ in range(self.num_res_blocks):
                layers.append(ResBlock(block_in_ch, block_out_ch))
                block_in_ch = block_out_ch

                if curr_res in self.attn_resolutions:
                    layers.append(AttnBlock(block_in_ch))

            if level != 0:
                layers.append(Upsample(block_in_ch))
                curr_res = curr_res * 2

        layers.append(normalize(block_in_ch))
        layers.append(nn.Conv2d(block_in_ch, self.out_channels, kernel_size=3, stride=1, padding=1))

        self.blocks = nn.ModuleList(layers)

    def forward(self, x):
        """Apply every block in order and return the result."""
        out = x
        for layer in self.blocks:
            out = layer(out)
        return out
327
+
328
+
329
@ARCH_REGISTRY.register()
class VQAutoEncoder(nn.Module):
    """Encoder -> quantizer -> generator autoencoder.

    Args:
        img_size (int): Resolution passed to the encoder and generator.
        nf (int): Base channel count.
        ch_mult (Sequence[int]): Channel multiplier per resolution level.
        quantizer (str): "nearest" builds a ``VectorQuantizer``, "gumbel"
            builds a ``GumbelQuantizer``. Any other value leaves
            ``self.quantize`` undefined.
        res_blocks (int): ResBlocks per resolution level.
        attn_resolutions (Sequence[int]): Resolutions that get attention.
        codebook_size (int): Number of codebook entries.
        emb_dim (int): Dimension of the latent / codebook vectors.
        beta (float): Commitment cost for the "nearest" quantizer.
        gumbel_straight_through (bool): Passed to ``GumbelQuantizer``.
        gumbel_kl_weight (float): Passed to ``GumbelQuantizer``.
        model_path (str | None): Optional checkpoint; its 'params_ema' entry
            is loaded if present, otherwise its 'params' entry.

    Raises:
        ValueError: If ``model_path`` is given but the checkpoint has neither
            a 'params_ema' nor a 'params' entry.
    """

    def __init__(self, img_size, nf, ch_mult, quantizer="nearest", res_blocks=2, attn_resolutions=(16,), codebook_size=1024, emb_dim=256,
                beta=0.25, gumbel_straight_through=False, gumbel_kl_weight=1e-8, model_path=None):
        super().__init__()
        logger = get_root_logger()
        self.in_channels = 3
        self.nf = nf
        self.n_blocks = res_blocks
        self.codebook_size = codebook_size
        self.embed_dim = emb_dim
        self.ch_mult = ch_mult
        self.resolution = img_size
        # fresh list per instance; the default is immutable so it cannot be
        # shared or modified across instances
        self.attn_resolutions = list(attn_resolutions)
        self.quantizer_type = quantizer
        self.encoder = Encoder(
            self.in_channels,
            self.nf,
            self.embed_dim,
            self.ch_mult,
            self.n_blocks,
            self.resolution,
            self.attn_resolutions
        )
        if self.quantizer_type == "nearest":
            self.beta = beta #0.25
            self.quantize = VectorQuantizer(self.codebook_size, self.embed_dim, self.beta)
        elif self.quantizer_type == "gumbel":
            self.gumbel_num_hiddens = emb_dim
            self.straight_through = gumbel_straight_through
            self.kl_weight = gumbel_kl_weight
            self.quantize = GumbelQuantizer(
                self.codebook_size,
                self.embed_dim,
                self.gumbel_num_hiddens,
                self.straight_through,
                self.kl_weight
            )
        self.generator = Generator(
            self.nf,
            self.embed_dim,
            self.ch_mult,
            self.n_blocks,
            self.resolution,
            self.attn_resolutions
        )

        if model_path is not None:
            # NOTE: torch.load unpickles the file; only pass trusted checkpoints.
            # The checkpoint is read once and reused for whichever key exists.
            chkpt = torch.load(model_path, map_location='cpu')
            if 'params_ema' in chkpt:
                self.load_state_dict(chkpt['params_ema'])
                logger.info(f'vqgan is loaded from: {model_path} [params_ema]')
            elif 'params' in chkpt:
                self.load_state_dict(chkpt['params'])
                logger.info(f'vqgan is loaded from: {model_path} [params]')
            else:
                raise ValueError('Wrong params!')


    def forward(self, x):
        """Encode, quantize and decode ``x``.

        Returns:
            tuple: ``(x, codebook_loss, quant_stats)`` - the generator output
            and the second and third values returned by the quantizer.
        """
        x = self.encoder(x)
        quant, codebook_loss, quant_stats = self.quantize(x)
        x = self.generator(quant)
        return x, codebook_loss, quant_stats
393
+
394
+
395
+
396
+ # patch based discriminator
397
@ARCH_REGISTRY.register()
class VQGANDiscriminator(nn.Module):
    """Patch based discriminator producing a 1-channel prediction map.

    Args:
        nc (int): Channels of the input image.
        ndf (int): Base number of filters.
        n_layers (int): Controls the number of stride-2 conv stages; the
            filter multiplier doubles per stage and is capped at 8.
        model_path (str | None): Optional checkpoint; its 'params_d' entry is
            loaded if present, otherwise its 'params' entry.

    Raises:
        ValueError: If ``model_path`` is given but the checkpoint has neither
            a 'params_d' nor a 'params' entry.
    """

    def __init__(self, nc=3, ndf=64, n_layers=4, model_path=None):
        super().__init__()

        layers = [nn.Conv2d(nc, ndf, kernel_size=4, stride=2, padding=1), nn.LeakyReLU(0.2, True)]
        ndf_mult = 1
        ndf_mult_prev = 1
        for n in range(1, n_layers):  # gradually increase the number of filters
            ndf_mult_prev = ndf_mult
            ndf_mult = min(2 ** n, 8)
            layers += [
                nn.Conv2d(ndf * ndf_mult_prev, ndf * ndf_mult, kernel_size=4, stride=2, padding=1, bias=False),
                nn.BatchNorm2d(ndf * ndf_mult),
                nn.LeakyReLU(0.2, True)
            ]

        ndf_mult_prev = ndf_mult
        ndf_mult = min(2 ** n_layers, 8)

        # one more conv stage at stride 1
        layers += [
            nn.Conv2d(ndf * ndf_mult_prev, ndf * ndf_mult, kernel_size=4, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(ndf * ndf_mult),
            nn.LeakyReLU(0.2, True)
        ]

        layers += [
            nn.Conv2d(ndf * ndf_mult, 1, kernel_size=4, stride=1, padding=1)]  # output 1 channel prediction map
        self.main = nn.Sequential(*layers)

        if model_path is not None:
            # NOTE: torch.load unpickles the file; only pass trusted checkpoints.
            # The checkpoint is read once and reused for whichever key exists.
            chkpt = torch.load(model_path, map_location='cpu')
            if 'params_d' in chkpt:
                self.load_state_dict(chkpt['params_d'])
            elif 'params' in chkpt:
                self.load_state_dict(chkpt['params'])
            else:
                raise ValueError('Wrong params!')

    def forward(self, x):
        """Return the prediction map for ``x``."""
        return self.main(x)