# ------------------------------------------------------------------------------
# OptVQ: Preventing Local Pitfalls in Vector Quantization via Optimal Transport
# Copyright (c) 2024 Borui Zhang. All Rights Reserved.
# Licensed under the MIT License [see LICENSE for details]
# ------------------------------------------------------------------------------
# Modified from [CompVis/taming-transformers](https://github.com/CompVis/taming-transformers)
# Copyright (c) 2020 Patrick Esser and Robin Rombach and Björn Ommer. All Rights Reserved.
# ------------------------------------------------------------------------------
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F

import optvq.utils.logger as L


class Identity(nn.Module):
    """Pass-through connector used when no projection is needed."""
    def forward(self, x):
        return x

class VQModel(nn.Module):
    def __init__(self,
                 encoder: nn.Module,
                 decoder: nn.Module,
                 loss: nn.Module,
                 quantize: nn.Module,
                 ckpt_path: Optional[str] = None,
                 ignore_keys=(),
                 image_key: str = "image",
                 colorize_nlabels: Optional[int] = None,
                 monitor=None,
                 use_connector: bool = True,
                 ):
        super(VQModel, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.loss = loss
        self.quantize = quantize
        self.use_connector = use_connector

        encoder_dim = self.encoder.hidden_dim
        decoder_dim = self.decoder.hidden_dim
        embed_dim = self.quantize.e_dim
        if not use_connector:
            # without connectors, the encoder/decoder widths must already
            # match the codebook dimension
            self.quant_conv = Identity()
            self.post_quant_conv = Identity()
            assert encoder_dim == embed_dim, f"{encoder_dim} != {embed_dim}"
            assert decoder_dim == embed_dim, f"{decoder_dim} != {embed_dim}"
        else:
            # 1x1 convolutions project between feature and codebook dimensions
            self.quant_conv = torch.nn.Conv2d(encoder_dim, embed_dim, 1)
            self.post_quant_conv = torch.nn.Conv2d(embed_dim, decoder_dim, 1)

        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
        self.image_key = image_key
        if colorize_nlabels is not None:
            assert isinstance(colorize_nlabels, int)
            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
        if monitor is not None:
            self.monitor = monitor
    def init_from_ckpt(self, path, ignore_keys=()):
        sd = torch.load(path, map_location="cpu")["state_dict"]
        keys = list(sd.keys())
        for k in keys:
            for ik in ignore_keys:
                if k.startswith(ik):
                    print(f"Deleting key {k} from state_dict.")
                    del sd[k]
        self.load_state_dict(sd, strict=False)
        print(f"Restored from {path}")
    def encode(self, x):
        h = self.encoder(x)
        h = self.quant_conv(h)
        quant, emb_loss, indices = self.quantize(h)
        return quant, emb_loss, indices

    def decode(self, quant):
        quant = self.post_quant_conv(quant)
        dec = self.decoder(quant)
        return dec

    def decode_code(self, code_b):
        quant_b = self.quantize.embed_code(code_b)
        dec = self.decode(quant_b)
        return dec
    def forward(self, x, mode: int = 0, global_step: Optional[int] = None):
        """
        Args:
            x (torch.Tensor): input tensor
            mode (int): 0 for the autoencoder loss, 1 for the discriminator
                loss, 2 to return the pre-quantization hidden embedding
            global_step (int): global step for the adaptive discriminator
                weight; defaults to the logger's step counter
        """
        if mode == 2:
            # return the hidden embedding after the connector, skipping
            # quantization and decoding
            h = self.encoder(x)
            h = self.quant_conv(h)
            return h
        global_step = global_step if global_step is not None else L.log.total_steps
        quant, qloss, indices = self.encode(x)
        xrec = self.decode(quant)
        # modes 0 and 1 share the same call; the loss module switches between
        # the autoencoder and discriminator objectives via `mode`
        loss, log_dict = self.loss(qloss, x, xrec, mode,
                                   last_layer=self.get_last_layer(),
                                   global_step=global_step)
        return loss, log_dict, indices
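    # Typical training usage (a sketch only; `opt_ae` and `opt_disc` are
    # hypothetical optimizers for the autoencoder and discriminator
    # parameters, not attributes of this class):
    #
    #   loss_ae, log_ae, _ = model(x, mode=0, global_step=step)
    #   loss_ae.backward(); opt_ae.step(); opt_ae.zero_grad()
    #   loss_d, log_d, _ = model(x, mode=1, global_step=step)
    #   loss_d.backward(); opt_disc.step(); opt_disc.zero_grad()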
    def get_input(self, batch, k):
        x = batch[k]
        if len(x.shape) == 3:
            x = x[..., None]
        x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format)
        return x.float()

    def get_last_layer(self):
        if hasattr(self.decoder, "conv_out"):
            return self.decoder.conv_out.weight
        elif hasattr(self.decoder, "out_fc"):
            return self.decoder.out_fc.weight
        elif hasattr(self.decoder, "inv_conv"):
            return self.decoder.inv_conv.weight
        else:
            raise NotImplementedError("Cannot find last layer in decoder")
    def log_images(self, batch, **kwargs):
        log = dict()
        x = self.get_input(batch, self.image_key)
        x = x.to(next(self.parameters()).device)
        # reconstruct explicitly: forward() returns (loss, log_dict, indices),
        # not the reconstruction itself
        quant, _, _ = self.encode(x)
        xrec = self.decode(quant)
        if x.shape[1] > 3:
            # colorize with random projection
            assert xrec.shape[1] > 3
            x = self.to_rgb(x)
            xrec = self.to_rgb(xrec)
        log["inputs"] = x
        log["reconstructions"] = xrec
        return log
    def to_rgb(self, x):
        assert self.image_key == "segmentation"
        if not hasattr(self, "colorize"):
            self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
        x = F.conv2d(x, weight=self.colorize)
        x = 2. * (x - x.min()) / (x.max() - x.min()) - 1.
        return x
    # The function below is deprecated: it follows the PyTorch Lightning
    # interface (self.log, self.log_dict, self.global_step) and is not
    # runnable on a plain nn.Module.
    def validation_step(self, batch, batch_idx):
        x = self.get_input(batch, self.image_key)
        xrec, qloss = self(x)
        aeloss, log_dict_ae = self.loss(qloss, x, xrec, 0, self.global_step,
                                        last_layer=self.get_last_layer(), split="val")
        discloss, log_dict_disc = self.loss(qloss, x, xrec, 1, self.global_step,
                                            last_layer=self.get_last_layer(), split="val")
        rec_loss = log_dict_ae["val/rec_loss"]
        self.log("val/rec_loss", rec_loss,
                 prog_bar=True, logger=True, on_step=True, on_epoch=True, sync_dist=True)
        self.log("val/aeloss", aeloss,
                 prog_bar=True, logger=True, on_step=True, on_epoch=True, sync_dist=True)
        self.log_dict(log_dict_ae)
        self.log_dict(log_dict_disc)
        return self.log_dict
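

if __name__ == "__main__":
    # Minimal smoke test: a sketch only, using stub modules in place of the
    # real OptVQ encoder, decoder, quantizer, and loss (all stubs below are
    # hypothetical, not part of this package). It exercises the attribute
    # contract VQModel assumes: encoder/decoder expose `hidden_dim`, the
    # quantizer exposes `e_dim`, and the decoder has a `conv_out` layer.
    class StubEncoder(nn.Module):
        def __init__(self, hidden_dim: int = 8):
            super().__init__()
            self.hidden_dim = hidden_dim
            self.conv = nn.Conv2d(3, hidden_dim, kernel_size=4, stride=4)

        def forward(self, x):
            return self.conv(x)

    class StubDecoder(nn.Module):
        def __init__(self, hidden_dim: int = 8):
            super().__init__()
            self.hidden_dim = hidden_dim
            self.conv_out = nn.Conv2d(hidden_dim, 3, 1)  # found by get_last_layer

        def forward(self, h):
            return self.conv_out(F.interpolate(h, scale_factor=4.0))

    class StubQuantize(nn.Module):
        def __init__(self, n_embed: int = 16, e_dim: int = 8):
            super().__init__()
            self.e_dim = e_dim
            self.embedding = nn.Embedding(n_embed, e_dim)

        def forward(self, h):
            # nearest-neighbor codebook lookup; the quantization loss is a
            # placeholder zero in this stub
            B, C, H, W = h.shape
            flat = h.permute(0, 2, 3, 1).reshape(-1, C)
            indices = torch.cdist(flat, self.embedding.weight).argmin(dim=1)
            quant = self.embedding(indices).view(B, H, W, C).permute(0, 3, 1, 2)
            return quant, h.new_zeros(()), indices

    class StubLoss(nn.Module):
        def forward(self, qloss, x, xrec, mode, last_layer=None, global_step=None):
            rec = F.mse_loss(xrec, x)
            return rec + qloss, {"rec_loss": float(rec)}

    model = VQModel(StubEncoder(), StubDecoder(), StubLoss(), StubQuantize())
    x = torch.randn(2, 3, 32, 32)
    # pass global_step explicitly so the logger's step counter is not needed
    loss, log_dict, indices = model(x, mode=0, global_step=0)
    print(loss.item(), log_dict, indices.shape)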