Spaces:
Paused
Paused
add cis_2D preprocessing
Browse files
app.py
CHANGED
@@ -53,30 +53,29 @@ class GELU(nn.Module):
|
|
53 |
else:
|
54 |
return F.gelu(self.linear(x))
|
55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
class Rope2D(nn.Module):
|
57 |
def __init__(self, dim, max_position_embeddings=1024, base=10000):
|
58 |
super().__init__()
|
59 |
-
|
60 |
-
self.
|
61 |
-
self.
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
|
67 |
-
|
68 |
-
def forward(self, x, seq_len=None):
|
69 |
-
if seq_len > self.max_seq_len_cached:
|
70 |
-
self.max_seq_len_cached = seq_len
|
71 |
-
t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype)
|
72 |
-
freqs = torch.einsum("i,j->ij", t, self.inv_freq)
|
73 |
-
emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
|
74 |
-
self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
|
75 |
-
self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
|
76 |
-
return (
|
77 |
-
self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
|
78 |
-
self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
|
79 |
-
)
|
80 |
|
81 |
class VisionEncoder(nn.Module):
|
82 |
def __init__(self, config):
|
@@ -92,14 +91,13 @@ class VisionEncoder(nn.Module):
|
|
92 |
x = self.embed(pixel_values)
|
93 |
b, c, h, w = x.shape
|
94 |
x = x.flatten(2).transpose(1, 2)
|
95 |
-
|
96 |
for layer in self.layers:
|
97 |
x = layer(x)
|
98 |
x = self.norm(x)
|
99 |
x = self.gelu(x)
|
100 |
return x
|
101 |
|
102 |
-
|
103 |
class PixtralModel(nn.Module):
|
104 |
def __init__(self, params):
|
105 |
super().__init__()
|
|
|
53 |
else:
|
54 |
return F.gelu(self.linear(x))
|
55 |
|
def precompute_freqs_cis_2d(dim: int, height: int, width: int, theta: float = 10000.0) -> torch.Tensor:
    """Precompute the 2D rotary position-embedding table as unit complex numbers.

    Even-indexed frequency slots rotate with the row (height) position and
    odd-indexed slots rotate with the column (width) position, so every patch
    in an (height x width) grid gets an angle that encodes both axes.

    Args:
        dim: Head dimension; ``dim // 2`` inverse frequencies are generated
            and split between the two axes.
        height: Number of patch rows in the grid.
        width: Number of patch columns in the grid.
        theta: Base of the inverse-frequency schedule. Defaults to 10000.0,
            matching the conventional RoPE base (and Rope2D's ``base``).

    Returns:
        Complex tensor of shape ``(height, width, dim // 2)`` holding
        ``cos(angle) + i*sin(angle)`` for every (row, col, frequency).
    """
    # Standard RoPE inverse-frequency schedule over dim/2 frequencies.
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
    h = torch.arange(height, device=freqs.device)
    w = torch.arange(width, device=freqs.device)

    # Interleave the axes: even frequency slots follow the row index,
    # odd slots follow the column index.
    freqs_h = torch.outer(h, freqs[::2]).float()
    freqs_w = torch.outer(w, freqs[1::2]).float()
    freqs_2d = torch.cat(
        [
            freqs_h[:, None, :].repeat(1, width, 1),
            freqs_w[None, :, :].repeat(height, 1, 1),
        ],
        dim=-1,
    )
    # Unit-magnitude complex numbers e^{i*angle}.
    return torch.polar(torch.ones_like(freqs_2d), freqs_2d)
class Rope2D(nn.Module):
    """Provider of 2D rotary position-embedding frequencies for vision patches.

    Wraps ``precompute_freqs_cis_2d`` and returns the complex frequency table
    for a given patch-grid size on the caller's device. The table is cached
    per (height, width) so repeated forwards over same-sized grids — the
    common case during training — do not recompute it every step.
    """

    def __init__(self, dim, max_position_embeddings=1024, base=10000):
        super().__init__()
        self.dim = dim
        # NOTE(review): not consulted by forward(); presumably an upper bound
        # enforced elsewhere — confirm against callers.
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        # Cache of the last computed table, keyed by grid size. Plain
        # attributes (not buffers) so the cache never lands in state_dict.
        self._cached_hw = None
        self._cached_freqs = None

    def forward(self, x, height, width):
        """Return the freqs_cis table of shape (height, width, dim // 2).

        Args:
            x: Activation tensor; only its device is used.
            height: Patch-grid height.
            width: Patch-grid width.
        """
        # Recompute only when the grid size changes; otherwise reuse the
        # cached table (the original recomputed on every call).
        if self._cached_hw != (height, width):
            self._cached_freqs = precompute_freqs_cis_2d(self.dim, height, width, self.base)
            self._cached_hw = (height, width)
        return self._cached_freqs.to(x.device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
class VisionEncoder(nn.Module):
|
81 |
def __init__(self, config):
|
|
|
91 |
x = self.embed(pixel_values)
|
92 |
b, c, h, w = x.shape
|
93 |
x = x.flatten(2).transpose(1, 2)
|
94 |
+
freqs_cis = self.rope(x, h, w)
|
95 |
for layer in self.layers:
|
96 |
x = layer(x)
|
97 |
x = self.norm(x)
|
98 |
x = self.gelu(x)
|
99 |
return x
|
100 |
|
|
|
101 |
class PixtralModel(nn.Module):
|
102 |
def __init__(self, params):
|
103 |
super().__init__()
|