import torch
import torch.nn as nn


class AttentionBlock(nn.Module):
    """Two convolutions followed by self-attention over spatial positions.

    The feature map is convolved twice (with a ReLU in between), then every
    spatial location is treated as one token of size ``out_channels``.
    Multi-head self-attention is applied across the ``h * w`` tokens of each
    sample, the result is layer-normalized over the channel axis, and the
    tensor is restored to ``(batch, channels, height, width)``.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by both convolutions; also the
            attention embedding size. Must be divisible by ``num_heads``.
        kernel_size: Kernel size for both convolutions.
        padding: Padding for both convolutions.
        num_heads: Number of attention heads.
    """

    def __init__(self, in_channels: int, out_channels: int,
                 kernel_size: int = 3, padding: int = 1,
                 num_heads: int = 8) -> None:
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels,
                               kernel_size=kernel_size, padding=padding)
        self.conv2 = nn.Conv2d(out_channels, out_channels,
                               kernel_size=kernel_size, padding=padding)
        self.attn = nn.MultiheadAttention(
            out_channels, num_heads=num_heads, batch_first=True)
        self.norm = nn.LayerNorm(out_channels)
        self.activation = nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply conv -> ReLU -> conv -> self-attention -> LayerNorm.

        Args:
            x: Tensor of shape ``(batch, in_channels, height, width)``.

        Returns:
            Tensor of shape ``(batch, out_channels, h, w)`` where ``h, w`` are
            the spatial sizes after the two convolutions.
        """
        x = self.conv1(x)
        x = self.activation(x)
        x = self.conv2(x)

        b, c, h, w = x.shape
        # self.attn is batch_first, so it expects (batch, seq, embed). A real
        # transpose is required here: a bare view/reshape would reinterpret
        # memory and mix channel values with spatial positions.
        tokens = x.flatten(2).transpose(1, 2)  # (b, h*w, c)
        # The attention weights are never used, so skip computing them.
        attended, _ = self.attn(tokens, tokens, tokens, need_weights=False)
        # LayerNorm normalizes the last axis, which is the channel axis here.
        normed = self.norm(attended)
        return normed.transpose(1, 2).reshape(b, c, h, w)


class UNet(nn.Module):
    """Encoder/decoder network with an LSTM + attention bottleneck.

    The encoder has five conv+ReLU stages separated by four 2x max-pools
    (3 -> 32 -> 64 -> 128 -> 256 -> 512 channels). The bottleneck runs an LSTM
    and an :class:`AttentionBlock` over the flattened spatial positions. Each
    transposed convolution in the decoder receives its input concatenated
    with the ReLU output of the encoder stage at the same resolution, which
    is why every decoder layer's input channel count is doubled.

    Input height and width must be divisible by 16 so that the upsampled
    feature maps match the encoder feature maps they are concatenated with.
    """

    def __init__(self) -> None:
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.ReLU(),
        )
        self.lstm = nn.LSTM(512, 512, batch_first=True)
        self.attn_block = AttentionBlock(512, 512)
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(1024, 256, kernel_size=2, stride=2),
            nn.ReLU(),
            nn.ConvTranspose2d(512, 128, kernel_size=2, stride=2),
            nn.ReLU(),
            nn.ConvTranspose2d(256, 64, kernel_size=2, stride=2),
            nn.ReLU(),
            nn.ConvTranspose2d(128, 32, kernel_size=2, stride=2),
            nn.ReLU(),
            nn.ConvTranspose2d(64, 3, kernel_size=1),
            nn.Sigmoid(),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the network on a batch of 3-channel images.

        Args:
            x: Tensor of shape ``(batch, 3, height, width)`` with ``height``
                and ``width`` divisible by 16.

        Returns:
            Tensor of shape ``(batch, 3, height, width)`` with values in
            ``[0, 1]`` (the last decoder layer is a sigmoid).
        """
        # One skip tensor per encoder stage: the activation after each conv.
        skips = []
        for layer in self.encoder:
            x = layer(x)
            if isinstance(layer, nn.ReLU):
                skips.append(x)

        b, c, h, w = x.shape
        # The LSTM is batch_first with input size 512, so it needs
        # (batch, seq, channels). Transpose rather than view so each time
        # step really is the channel vector of one spatial position.
        seq = x.flatten(2).transpose(1, 2).contiguous()  # (b, h*w, c)
        seq, _ = self.lstm(seq)
        x = seq.transpose(1, 2).reshape(b, c, h, w)

        x = self.attn_block(x)

        # Deepest skip first: every ConvTranspose2d consumes the skip tensor
        # that has the same spatial size as its current input.
        remaining = iter(reversed(skips))
        for layer in self.decoder:
            if isinstance(layer, nn.ConvTranspose2d):
                x = layer(torch.cat((x, next(remaining)), dim=1))
            else:
                x = layer(x)
        return x