Harshit Agarwal

slides added

ee90412 5 months ago

10.4 kB

	'''
	This file contains the DDPM implementation, modularized for loading, prediction and training.
	'''

	from torch import nn
	import math
	import torch
	from utils import forward_diffusion_sample, sample_timestep, sample_plot_image
	import torch.nn.functional as F
	from attn_utils import SelfAttention, CBAM, Block_CBAM

	class Block(nn.Module):
	    """One U-Net stage: conv -> time-embedding injection -> conv -> resample.

	    Args:
	        in_ch: Channel count of the stage input (before any skip concatenation).
	        out_ch: Channel count produced by both 3x3 convolutions and the resampler.
	        time_emb_dim: Width of the timestep embedding passed to ``forward``.
	        up: When False the stage ends in a stride-2 ``Conv2d`` (halves even
	            spatial sizes). When True it ends in a stride-2
	            ``ConvTranspose2d`` (doubles the spatial size) and the first
	            convolution expects ``2 * in_ch`` channels, because the U-Nets in
	            this file concatenate a skip tensor onto the input first.
	    """

	    def __init__(self, in_ch, out_ch, time_emb_dim, up=False):
	        super().__init__()
	        # Projects the timestep embedding to one value per output channel.
	        self.time_mlp = nn.Linear(time_emb_dim, out_ch)

	        # Upsampling stages see the skip connection stacked on the channel axis.
	        first_conv_in = 2 * in_ch if up else in_ch
	        resampler = nn.ConvTranspose2d if up else nn.Conv2d

	        self.conv1 = nn.Conv2d(first_conv_in, out_ch, 3, padding=1)
	        self.transform = resampler(out_ch, out_ch, 4, 2, 1)
	        self.conv2 = nn.Conv2d(out_ch, out_ch, 3, padding=1)
	        self.relu = nn.ReLU()
	        self.batch_norm1 = nn.BatchNorm2d(out_ch)
	        self.batch_norm2 = nn.BatchNorm2d(out_ch)

	    def forward(self, x, t):
	        """Apply the stage to feature map ``x`` conditioned on embedding ``t``.

	        ``t`` is expected to be ``(batch, time_emb_dim)``; its projection is
	        broadcast over the two spatial axes of the feature map.
	        """
	        hidden = self.conv1(x)
	        hidden = self.batch_norm1(self.relu(hidden))

	        # (batch, out_ch) -> (batch, out_ch, 1, 1) so it adds to every pixel.
	        t_proj = self.relu(self.time_mlp(t))
	        hidden = hidden + t_proj[..., None, None]

	        hidden = self.conv2(hidden)
	        hidden = self.batch_norm2(self.relu(hidden))
	        return self.transform(hidden)

	class PositionEmbeddings(nn.Module):
	    """Sinusoidal timestep embedding.

	    Maps a 1-D tensor of timesteps of length ``N`` to an ``(N, dim)`` tensor
	    whose first ``dim // 2`` columns are sines and next ``dim // 2`` columns
	    are cosines of the timestep scaled by geometrically spaced frequencies
	    between 1 and 1/10000.

	    Args:
	        dim: Width of the embedding returned by ``forward``.
	    """

	    def __init__(self, dim):
	        super().__init__()
	        self.dim = dim

	    def forward(self, time):
	        """Return the ``(len(time), self.dim)`` embedding of ``time``.

	        ``time`` must be 1-D; the frequencies are created on ``time.device``.
	        """
	        device = time.device
	        half_dim = self.dim // 2
	        # With a single frequency (dim == 2) ``half_dim - 1`` is zero; the
	        # divisor only spaces the exponents, so clamping it to 1 is safe and
	        # avoids a ZeroDivisionError.
	        log_step = math.log(10000) / max(half_dim - 1, 1)
	        freqs = torch.exp(torch.arange(half_dim, device=device) * -log_step)
	        angles = time[:, None] * freqs[None, :]
	        embeddings = torch.cat((angles.sin(), angles.cos()), dim=-1)
	        if self.dim % 2 == 1:
	            # sin/cos pairs only fill ``dim - 1`` columns for odd ``dim``;
	            # zero-pad so the output width always equals ``dim``.
	            padding = embeddings.new_zeros(embeddings.shape[0], 1)
	            embeddings = torch.cat((embeddings, padding), dim=-1)
	        return embeddings



	class SimpleUnet(nn.Module):
	    """Plain U-Net noise predictor for DDPM.

	    The network has a 3-channel input convolution, four downsampling
	    ``Block`` stages (64 -> 1024 channels), four upsampling ``Block`` stages
	    (1024 -> 64 channels) fed with skip connections, and a 1x1 read-out
	    convolution back to 3 channels. Each stage halves or doubles the spatial
	    size, so input height and width should be multiples of 16 for the skip
	    tensors to line up.

	    Attributes:
	        device: "cuda" when available at construction time, else "cpu".
	        num_timesteps: Number of reverse-diffusion steps ``sample`` runs.
	    """

	    def __init__(self):
	        super().__init__()
	        image_channels = 3
	        down_channels = (64, 128, 256, 512, 1024)
	        up_channels = (1024, 512, 256, 128, 64)
	        self.device = "cuda" if torch.cuda.is_available() else "cpu"
	        # sample() reads this attribute; it was previously never defined on
	        # this class. 300 matches the default ``T`` of ``train`` and the
	        # sibling models in this file.
	        self.num_timesteps = 300

	        out_dim = 3
	        time_emb_dim = 32

	        ## timestep stored as positional encoding in terms of sine
	        self.time_mlp = nn.Sequential(
	            PositionEmbeddings(time_emb_dim),
	            nn.Linear(time_emb_dim, time_emb_dim),
	            nn.ReLU(),
	        )

	        self.conv0 = nn.Conv2d(image_channels, down_channels[0], 3, padding=1)
	        self.down_blocks = nn.ModuleList([
	            Block(c_in, c_out, time_emb_dim)
	            for c_in, c_out in zip(down_channels, down_channels[1:])
	        ])
	        self.up_blocks = nn.ModuleList([
	            Block(c_in, c_out, time_emb_dim, up=True)
	            for c_in, c_out in zip(up_channels, up_channels[1:])
	        ])

	        ## readout layer
	        self.output = nn.Conv2d(up_channels[-1], out_dim, 1)

	    def forward(self, x, timestep):
	        """Predict the noise contained in ``x`` at diffusion step ``timestep``.

	        Args:
	            x: Image batch with 3 channels.
	            timestep: 1-D tensor holding one timestep per batch element.

	        Returns:
	            Tensor with 3 channels and the same spatial size as ``x``.
	        """
	        t = self.time_mlp(timestep)
	        x = self.conv0(x)
	        residual_inputs = []
	        for down in self.down_blocks:
	            x = down(x, t)
	            residual_inputs.append(x)
	        for up in self.up_blocks:
	            # Skips are consumed deepest-first, mirroring the encoder.
	            residual_x = residual_inputs.pop()
	            x = torch.cat((x, residual_x), dim=1)
	            x = up(x, t)
	        return self.output(x)

	    @torch.no_grad()
	    def sample(self, noise):
	        """
	        Generate an image by denoising a given noise tensor using the reverse diffusion process.

	        Args:
	            noise (torch.Tensor): Initial noise tensor (e.g., sampled from a Gaussian distribution).

	        Returns:
	            torch.Tensor: Denoised image.
	        """
	        img = noise  # Start with the provided noise tensor
	        T = self.num_timesteps  # Total timesteps for diffusion

	        # Iterate through the timesteps in reverse order
	        for i in reversed(range(T)):
	            t = torch.full((noise.size(0),), i, device=noise.device, dtype=torch.long)  # Current timestep
	            img = sample_timestep(self, img, t)  # Perform one reverse diffusion step
	            img = torch.clamp(img, -1.0, 1.0)  # Clamp the image to ensure values stay in [-1, 1]

	        return img

	    def get_loss(self, x_0, t):
	        """Return the L1 loss between the true and the predicted noise.

	        ``x_0`` is noised to step ``t`` by ``forward_diffusion_sample`` and the
	        model is asked to recover the noise that was added.
	        """
	        x_noisy, noise = forward_diffusion_sample(x_0, t, self.device)
	        noise_pred = self(x_noisy, t)
	        return F.l1_loss(noise, noise_pred)

	    def train(self, dataloader=True, BATCH_SIZE=64, T=300, EPOCHS=50, verbose=True):
	        """Run the training loop, or toggle train/eval mode.

	        This method shadows ``nn.Module.train(mode)``, which ``eval()`` and
	        other PyTorch internals call with a bool. To keep those working, a
	        bool first argument (or no argument) is forwarded to
	        ``nn.Module.train`` and the module is returned. Any other value is
	        treated as a dataloader and trained on.

	        Args:
	            dataloader: Iterable of batches whose first element is the image
	                tensor, or a bool to set training mode.
	            BATCH_SIZE: Kept for backward compatibility; the number of
	                timesteps drawn now follows the actual size of each batch,
	                so a smaller final batch no longer mismatches.
	            T: Number of diffusion timesteps; timesteps are drawn from
	                ``[0, T)``.
	            EPOCHS: Number of passes over ``dataloader``.
	            verbose: Print the loss and plot a sample every 5th epoch at
	                every 150th step.
	        """
	        if isinstance(dataloader, bool):
	            return super().train(dataloader)

	        from torch.optim import Adam

	        device = "cuda" if torch.cuda.is_available() else "cpu"
	        self.to(device)
	        super().train(True)
	        # Remember the horizon so sample() walks the same number of steps
	        # the model was trained with.
	        self.num_timesteps = T
	        optimizer = Adam(self.parameters(), lr=0.001)

	        for epoch in range(EPOCHS):
	            for step, batch in enumerate(dataloader):
	                optimizer.zero_grad()

	                x_0 = batch[0]
	                t = torch.randint(0, T, (x_0.shape[0],), device=device).long()
	                # get_loss is a bound method: do not pass ``self`` again.
	                loss = self.get_loss(x_0, t)
	                loss.backward()
	                optimizer.step()

	                if verbose and epoch % 5 == 0 and step % 150 == 0:
	                    print(f"Epoch {epoch} | step {step:03d} Loss: {loss.item()} ")
	                    sample_plot_image(self)

	    @staticmethod
	    def test():
	        """Placeholder for the evaluation loop (not implemented yet)."""
	        ## TODO: add the testing loop here
	        pass




	################################################################################################
	####################### ATTENTION LAYERS ADDED TO THE MODEL ####################################
	################################################################################################

	class SimpleUnetWSelfAttn(nn.Module):
	    """U-Net noise predictor with a ``SelfAttention`` layer at the bottleneck.

	    Same encoder/decoder layout as the plain U-Net in this file (four
	    downsampling and four upsampling ``Block`` stages, 64 <-> 1024 channels),
	    with ``SelfAttention(1024)`` applied between the encoder and the decoder.

	    Attributes:
	        num_timesteps: Number of reverse-diffusion steps ``sample`` runs.
	    """

	    def __init__(self):
	        super().__init__()
	        image_channels = 3
	        down_channels = (64, 128, 256, 512, 1024)
	        up_channels = (1024, 512, 256, 128, 64)

	        out_dim = 3
	        time_emb_dim = 32

	        ## timestep stored as positional encoding in terms of sine
	        self.time_mlp = nn.Sequential(
	            PositionEmbeddings(time_emb_dim),
	            nn.Linear(time_emb_dim, time_emb_dim),
	            nn.ReLU(),
	        )
	        self.num_timesteps = 300

	        self.conv0 = nn.Conv2d(image_channels, down_channels[0], 3, padding=1)
	        self.down_blocks = nn.ModuleList([
	            Block(c_in, c_out, time_emb_dim)
	            for c_in, c_out in zip(down_channels, down_channels[1:])
	        ])
	        self.up_blocks = nn.ModuleList([
	            Block(c_in, c_out, time_emb_dim, up=True)
	            for c_in, c_out in zip(up_channels, up_channels[1:])
	        ])

	        self.self_attention = SelfAttention(down_channels[-1])

	        ## readout layer
	        self.output = nn.Conv2d(up_channels[-1], out_dim, 1)

	    def forward(self, x, timestep):
	        """Predict the noise contained in ``x`` at diffusion step ``timestep``.

	        Args:
	            x: Image batch with 3 channels.
	            timestep: 1-D tensor holding one timestep per batch element.

	        Returns:
	            Tensor with 3 channels; presumably the same spatial size as ``x``
	            provided ``SelfAttention`` preserves its input shape - TODO confirm.
	        """
	        # NOTE: ``self.num_timesteps`` is deliberately NOT touched here. It
	        # used to be overwritten with the per-batch ``timestep`` tensor, which
	        # replaced the integer horizon that sample() iterates over.
	        t = self.time_mlp(timestep)
	        x = self.conv0(x)
	        residual_inputs = []
	        for down in self.down_blocks:
	            x = down(x, t)
	            residual_inputs.append(x)

	        x = self.self_attention(x)

	        for up in self.up_blocks:
	            # Skips are consumed deepest-first, mirroring the encoder.
	            residual_x = residual_inputs.pop()
	            x = torch.cat((x, residual_x), dim=1)
	            x = up(x, t)
	        return self.output(x)

	    @torch.no_grad()
	    def sample(self, noise):
	        """
	        Generate an image by denoising a given noise tensor using the reverse diffusion process.

	        Args:
	            noise (torch.Tensor): Initial noise tensor (e.g., sampled from a Gaussian distribution).

	        Returns:
	            torch.Tensor: Denoised image.
	        """
	        img = noise  # Start with the provided noise tensor
	        T = self.num_timesteps  # Total timesteps for diffusion

	        # Iterate through the timesteps in reverse order
	        for i in range(T - 1, -1, -1):
	            t = torch.full((noise.size(0),), i, device=noise.device, dtype=torch.long)  # Current timestep
	            img = sample_timestep(self, img, t)  # Perform one reverse diffusion step
	            img = torch.clamp(img, -1.0, 1.0)  # Clamp the image to ensure values stay in [-1, 1]

	        return img



	################################################################################################
	#################### Convolutional Block Attention Module ADDED TO THE MODEL ###################
	################################################################################################

	class SimpleUnetWCBAM(nn.Module):
	    """U-Net noise predictor built from ``Block_CBAM`` stages.

	    Four downsampling and four upsampling ``Block_CBAM`` stages
	    (64 <-> 1024 channels) with skip connections, plus a
	    ``SelfAttention(1024)`` layer applied between the encoder and the decoder.

	    Attributes:
	        num_timesteps: Number of reverse-diffusion steps ``sample`` runs.
	    """

	    def __init__(self):
	        super().__init__()
	        image_channels = 3
	        down_channels = (64, 128, 256, 512, 1024)
	        up_channels = (1024, 512, 256, 128, 64)

	        out_dim = 3
	        time_emb_dim = 32

	        ## timestep stored as positional encoding in terms of sine
	        self.time_mlp = nn.Sequential(
	            PositionEmbeddings(time_emb_dim),
	            nn.Linear(time_emb_dim, time_emb_dim),
	            nn.ReLU(),
	        )
	        self.num_timesteps = 300

	        self.conv0 = nn.Conv2d(image_channels, down_channels[0], 3, padding=1)
	        self.down_blocks = nn.ModuleList([
	            Block_CBAM(c_in, c_out, time_emb_dim)
	            for c_in, c_out in zip(down_channels, down_channels[1:])
	        ])
	        self.up_blocks = nn.ModuleList([
	            Block_CBAM(c_in, c_out, time_emb_dim, up=True)
	            for c_in, c_out in zip(up_channels, up_channels[1:])
	        ])

	        self.self_attention = SelfAttention(down_channels[-1])

	        ## readout layer
	        self.output = nn.Conv2d(up_channels[-1], out_dim, 1)

	    def forward(self, x, timestep):
	        """Predict the noise contained in ``x`` at diffusion step ``timestep``.

	        Args:
	            x: Image batch with 3 channels.
	            timestep: 1-D tensor holding one timestep per batch element.

	        Returns:
	            Tensor with 3 channels; presumably the same spatial size as ``x``
	            if ``Block_CBAM`` resamples like ``Block`` and ``SelfAttention``
	            preserves its input shape - TODO confirm.
	        """
	        # NOTE: ``self.num_timesteps`` is deliberately NOT touched here. It
	        # used to be overwritten with the per-batch ``timestep`` tensor, which
	        # replaced the integer horizon that sample() iterates over.
	        t = self.time_mlp(timestep)
	        x = self.conv0(x)
	        residual_inputs = []
	        for down in self.down_blocks:
	            x = down(x, t)
	            residual_inputs.append(x)

	        x = self.self_attention(x)

	        for up in self.up_blocks:
	            # Skips are consumed deepest-first, mirroring the encoder.
	            residual_x = residual_inputs.pop()
	            x = torch.cat((x, residual_x), dim=1)
	            x = up(x, t)
	        return self.output(x)

	    @torch.no_grad()
	    def sample(self, noise):
	        """
	        Generate an image by denoising a given noise tensor using the reverse diffusion process.

	        Args:
	            noise (torch.Tensor): Initial noise tensor (e.g., sampled from a Gaussian distribution).

	        Returns:
	            torch.Tensor: Denoised image.
	        """
	        img = noise  # Start with the provided noise tensor
	        T = self.num_timesteps  # Total timesteps for diffusion

	        # Iterate through the timesteps in reverse order
	        for i in range(T - 1, -1, -1):
	            t = torch.full((noise.size(0),), i, device=noise.device, dtype=torch.long)  # Current timestep
	            img = sample_timestep(self, img, t)  # Perform one reverse diffusion step
	            img = torch.clamp(img, -1.0, 1.0)  # Clamp the image to ensure values stay in [-1, 1]

	        return img