torinriley committed on
Commit 05469a1 · 1 Parent(s): ba632ba
Files changed (1):
  1. src/diffusion.py  +26 −53
src/diffusion.py CHANGED
@@ -13,25 +13,8 @@ class TimeEmbedding(nn.Module):
         x = F.silu(self.linear_1(x))
         return self.linear_2(x)
 
-class SqueezeExcitation(nn.Module):
-    def __init__(self, channels, reduction=16):
-        super().__init__()
-        self.avg_pool = nn.AdaptiveAvgPool2d(1)
-        self.fc = nn.Sequential(
-            nn.Linear(channels, channels // reduction, bias=False),
-            nn.ReLU(inplace=True),
-            nn.Linear(channels // reduction, channels, bias=False),
-            nn.Sigmoid()
-        )
-
-    def forward(self, x):
-        b, c, _, _ = x.size()
-        y = self.avg_pool(x).view(b, c)
-        y = self.fc(y).view(b, c, 1, 1)
-        return x * y.expand_as(x)
-
 class UNET_ResidualBlock(nn.Module):
-    def __init__(self, in_channels, out_channels, n_time=1280, use_se=False):
+    def __init__(self, in_channels, out_channels, n_time=1280):
         super().__init__()
         self.groupnorm_feature = nn.GroupNorm(32, in_channels)
         self.conv_feature = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
@@ -39,26 +22,16 @@ class UNET_ResidualBlock(nn.Module):
         self.groupnorm_merged = nn.GroupNorm(32, out_channels)
         self.conv_merged = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
         self.residual_layer = nn.Identity() if in_channels == out_channels else nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0)
-
-        # Add Squeeze-Excitation blocks only if use_se is True
-        self.use_se = use_se
-        if use_se:
-            self.se1 = SqueezeExcitation(out_channels)
-            self.se2 = SqueezeExcitation(out_channels)
 
     def forward(self, feature, time):
         residue = feature
         feature = F.silu(self.groupnorm_feature(feature))
         feature = self.conv_feature(feature)
-        if self.use_se:
-            feature = self.se1(feature)  # Apply SE after first conv
 
         time = self.linear_time(F.silu(time))
         merged = feature + time.unsqueeze(-1).unsqueeze(-1)
         merged = F.silu(self.groupnorm_merged(merged))
         merged = self.conv_merged(merged)
-        if self.use_se:
-            merged = self.se2(merged)  # Apply SE after second conv
 
         return merged + self.residual_layer(residue)
@@ -112,42 +85,42 @@ class SwitchSequential(nn.Sequential):
         return x
 
 class UNET(nn.Module):
-    def __init__(self, use_se=False):
+    def __init__(self):
         super().__init__()
         self.encoders = nn.ModuleList([
             SwitchSequential(nn.Conv2d(4, 320, kernel_size=3, padding=1)),
-            SwitchSequential(UNET_ResidualBlock(320, 320, use_se=use_se), UNET_AttentionBlock(8, 40)),
-            SwitchSequential(UNET_ResidualBlock(320, 320, use_se=use_se), UNET_AttentionBlock(8, 40)),
+            SwitchSequential(UNET_ResidualBlock(320, 320), UNET_AttentionBlock(8, 40)),
+            SwitchSequential(UNET_ResidualBlock(320, 320), UNET_AttentionBlock(8, 40)),
             SwitchSequential(nn.Conv2d(320, 320, kernel_size=3, stride=2, padding=1)),
-            SwitchSequential(UNET_ResidualBlock(320, 640, use_se=use_se), UNET_AttentionBlock(8, 80)),
-            SwitchSequential(UNET_ResidualBlock(640, 640, use_se=use_se), UNET_AttentionBlock(8, 80)),
+            SwitchSequential(UNET_ResidualBlock(320, 640), UNET_AttentionBlock(8, 80)),
+            SwitchSequential(UNET_ResidualBlock(640, 640), UNET_AttentionBlock(8, 80)),
             SwitchSequential(nn.Conv2d(640, 640, kernel_size=3, stride=2, padding=1)),
-            SwitchSequential(UNET_ResidualBlock(640, 1280, use_se=use_se), UNET_AttentionBlock(8, 160)),
-            SwitchSequential(UNET_ResidualBlock(1280, 1280, use_se=use_se), UNET_AttentionBlock(8, 160)),
+            SwitchSequential(UNET_ResidualBlock(640, 1280), UNET_AttentionBlock(8, 160)),
+            SwitchSequential(UNET_ResidualBlock(1280, 1280), UNET_AttentionBlock(8, 160)),
             SwitchSequential(nn.Conv2d(1280, 1280, kernel_size=3, stride=2, padding=1)),
-            SwitchSequential(UNET_ResidualBlock(1280, 1280, use_se=use_se)),
-            SwitchSequential(UNET_ResidualBlock(1280, 1280, use_se=use_se)),
+            SwitchSequential(UNET_ResidualBlock(1280, 1280)),
+            SwitchSequential(UNET_ResidualBlock(1280, 1280)),
         ])
 
         self.bottleneck = SwitchSequential(
-            UNET_ResidualBlock(1280, 1280, use_se=use_se),
+            UNET_ResidualBlock(1280, 1280),
             UNET_AttentionBlock(8, 160),
-            UNET_ResidualBlock(1280, 1280, use_se=use_se),
+            UNET_ResidualBlock(1280, 1280),
         )
 
         self.decoders = nn.ModuleList([
-            SwitchSequential(UNET_ResidualBlock(2560, 1280, use_se=use_se)),
-            SwitchSequential(UNET_ResidualBlock(2560, 1280, use_se=use_se)),
-            SwitchSequential(UNET_ResidualBlock(2560, 1280, use_se=use_se), Upsample(1280)),
-            SwitchSequential(UNET_ResidualBlock(2560, 1280, use_se=use_se), UNET_AttentionBlock(8, 160)),
-            SwitchSequential(UNET_ResidualBlock(2560, 1280, use_se=use_se), UNET_AttentionBlock(8, 160)),
-            SwitchSequential(UNET_ResidualBlock(1920, 1280, use_se=use_se), UNET_AttentionBlock(8, 160), Upsample(1280)),
-            SwitchSequential(UNET_ResidualBlock(1920, 640, use_se=use_se), UNET_AttentionBlock(8, 80)),
-            SwitchSequential(UNET_ResidualBlock(1280, 640, use_se=use_se), UNET_AttentionBlock(8, 80)),
-            SwitchSequential(UNET_ResidualBlock(960, 640, use_se=use_se), UNET_AttentionBlock(8, 80), Upsample(640)),
-            SwitchSequential(UNET_ResidualBlock(960, 320, use_se=use_se), UNET_AttentionBlock(8, 40)),
-            SwitchSequential(UNET_ResidualBlock(640, 320, use_se=use_se), UNET_AttentionBlock(8, 40)),
-            SwitchSequential(UNET_ResidualBlock(640, 320, use_se=use_se), UNET_AttentionBlock(8, 40)),
+            SwitchSequential(UNET_ResidualBlock(2560, 1280)),
+            SwitchSequential(UNET_ResidualBlock(2560, 1280)),
+            SwitchSequential(UNET_ResidualBlock(2560, 1280), Upsample(1280)),
+            SwitchSequential(UNET_ResidualBlock(2560, 1280), UNET_AttentionBlock(8, 160)),
+            SwitchSequential(UNET_ResidualBlock(2560, 1280), UNET_AttentionBlock(8, 160)),
+            SwitchSequential(UNET_ResidualBlock(1920, 1280), UNET_AttentionBlock(8, 160), Upsample(1280)),
+            SwitchSequential(UNET_ResidualBlock(1920, 640), UNET_AttentionBlock(8, 80)),
+            SwitchSequential(UNET_ResidualBlock(1280, 640), UNET_AttentionBlock(8, 80)),
+            SwitchSequential(UNET_ResidualBlock(960, 640), UNET_AttentionBlock(8, 80), Upsample(640)),
+            SwitchSequential(UNET_ResidualBlock(960, 320), UNET_AttentionBlock(8, 40)),
+            SwitchSequential(UNET_ResidualBlock(640, 320), UNET_AttentionBlock(8, 40)),
+            SwitchSequential(UNET_ResidualBlock(640, 320), UNET_AttentionBlock(8, 40)),
         ])
 
     def forward(self, x, context, time):
@@ -175,10 +148,10 @@ class UNET_OutputLayer(nn.Module):
         return self.conv(x)
 
 class Diffusion(nn.Module):
-    def __init__(self, use_se=False):
+    def __init__(self):
         super().__init__()
         self.time_embedding = TimeEmbedding(320)
-        self.unet = UNET(use_se=use_se)
+        self.unet = UNET()
         self.final = UNET_OutputLayer(320, 4)
 
     def forward(self, latent, context, time):
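
For reference, a minimal smoke-test sketch of the simplified entry point after this change. The 4-channel latent and the TimeEmbedding(320) width come from the code above; the 64x64 latent, the (1, 77, 768) context, and the import path are assumptions for illustration, since the attention-block internals and the project layout are not part of this diff.

    import torch
    from src.diffusion import Diffusion  # assumed import path (repo root on PYTHONPATH)

    model = Diffusion()                  # no use_se flag anymore
    latent = torch.randn(1, 4, 64, 64)   # 4-channel latent; spatial size must survive three stride-2 downsamples
    context = torch.randn(1, 77, 768)    # assumed CLIP-style cross-attention context
    time = torch.randn(1, 320)           # matches TimeEmbedding(320)

    out = model(latent, context, time)   # expected shape (1, 4, 64, 64), per UNET_OutputLayer(320, 4)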