chaojiemao committed
Commit 20a320d · verified · 1 Parent(s): 9c29859

Create layers.py

Files changed (1)
  1. model/layers.py +356 -0
model/layers.py ADDED
@@ -0,0 +1,356 @@
from __future__ import annotations

import math
from dataclasses import dataclass

import torch
from einops import rearrange, repeat
from torch import Tensor, nn
from torch.nn.utils.rnn import pad_sequence

try:
    from flash_attn import flash_attn_varlen_func
    FLASHATTN_IS_AVAILABLE = True
except ImportError:
    FLASHATTN_IS_AVAILABLE = False
    flash_attn_varlen_func = None

def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask: Tensor | None = None, backend: str = "pytorch") -> Tensor:
    q, k = apply_rope(q, k, pe)
    if backend == "pytorch":
        if mask is not None and mask.dtype == torch.bool:
            # convert a boolean keep-mask into an additive float mask
            mask = torch.zeros_like(mask).to(q).masked_fill_(mask.logical_not(), -1e20)
        x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask)
        # x = torch.nan_to_num(x, nan=0.0, posinf=1e10, neginf=-1e10)
        x = rearrange(x, "B H L D -> B L (H D)")
    elif backend == "flash_attn":
        # fail early with a clear message if flash_attn is not installed
        assert FLASHATTN_IS_AVAILABLE, "flash_attn is not available"
        # q: (B, H, L, D), k: (B, H, S, D), v: (B, H, S, D)
        b, h, lq, d = q.shape
        _, _, lk, _ = k.shape
        q = rearrange(q, "B H L D -> B L H D")
        k = rearrange(k, "B H S D -> B S H D")
        v = rearrange(v, "B H S D -> B S H D")
        if mask is None:
            q_lens = torch.tensor([lq] * b, dtype=torch.int32).to(q.device, non_blocking=True)
            k_lens = torch.tensor([lk] * b, dtype=torch.int32).to(k.device, non_blocking=True)
        else:
            q_lens = torch.sum(mask[:, 0, :, 0], dim=1).int()
            k_lens = torch.sum(mask[:, 0, 0, :], dim=1).int()
        # pack the valid tokens of every sample into one flat sequence
        q = torch.cat([q_v[:q_l] for q_v, q_l in zip(q, q_lens)])
        k = torch.cat([k_v[:k_l] for k_v, k_l in zip(k, k_lens)])
        v = torch.cat([v_v[:v_l] for v_v, v_l in zip(v, k_lens)])
        cu_seqlens_q = torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum(0, dtype=torch.int32)
        cu_seqlens_k = torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum(0, dtype=torch.int32)
        max_seqlen_q = q_lens.max()
        max_seqlen_k = k_lens.max()

        x = flash_attn_varlen_func(
            q,
            k,
            v,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_seqlen_q,
            max_seqlen_k=max_seqlen_k,
        )
        # unpack back into a padded (B, L, H, D) batch
        x_list = [x[cu_seqlens_q[i]:cu_seqlens_q[i + 1]] for i in range(b)]
        x = pad_sequence(tuple(x_list), batch_first=True)
        x = rearrange(x, "B L H D -> B L (H D)")
    else:
        raise NotImplementedError(f"Unknown attention backend: {backend}")
    return x
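
# Shape sketch (illustrative, not part of the original file): with B=2, H=8,
# L=S=16 and per-head D=64,
#   q, k, v : (2, 8, 16, 64)     pe : (2, 1, 16, 32, 2, 2)   (built by EmbedND below)
#   attention(q, k, v, pe, backend="pytorch") -> (2, 16, 512) = (B, L, H * D)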


def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
    assert dim % 2 == 0
    scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
    omega = 1.0 / (theta**scale)
    out = torch.einsum("...n,d->...nd", pos, omega)
    out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
    out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
    return out.float()
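
# Shape sketch (illustrative): rope(pos, dim=64, theta=10000) with pos of shape
# (B, N) returns (B, N, 32, 2, 2): one 2x2 rotation matrix per position and
# frequency pair, consumed by apply_rope below.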


def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
    xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
    xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
    xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
    xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
    return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)


class EmbedND(nn.Module):
    def __init__(self, dim: int, theta: int, axes_dim: list[int]):
        super().__init__()
        self.dim = dim
        self.theta = theta
        self.axes_dim = axes_dim

    def forward(self, ids: Tensor) -> Tensor:
        n_axes = ids.shape[-1]
        emb = torch.cat(
            [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
            dim=-3,
        )

        return emb.unsqueeze(1)
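
# Shape sketch (illustrative; the axes_dim values are assumptions, not the model
# config): EmbedND(dim=64, theta=10000, axes_dim=[16, 24, 24]) applied to ids of
# shape (B, L, 3) returns positional encodings of shape (B, 1, L, 32, 2, 2),
# i.e. sum(axes_dim) // 2 rotation matrices per token, broadcast across heads.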


def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
    """
    Create sinusoidal timestep embeddings.

    :param t: a 1-D Tensor of N indices, one per batch element. These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :param time_factor: multiplier applied to t before computing the embedding.
    :return: an (N, D) Tensor of positional embeddings.
    """
    t = time_factor * t
    half = dim // 2
    freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
        t.device
    )

    args = t[:, None].float() * freqs[None]
    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
    if dim % 2:
        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
    if torch.is_floating_point(t):
        embedding = embedding.to(t)
    return embedding
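
# Usage sketch (illustrative): timestep_embedding(torch.tensor([0.0, 0.5]), dim=256)
# returns a (2, 256) tensor whose first 128 channels are cosines and last 128 are
# sines of t * time_factor at geometrically spaced frequencies down to ~1/max_period.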


class MLPEmbedder(nn.Module):
    def __init__(self, in_dim: int, hidden_dim: int):
        super().__init__()
        self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
        self.silu = nn.SiLU()
        self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)

    def forward(self, x: Tensor) -> Tensor:
        return self.out_layer(self.silu(self.in_layer(x)))


class RMSNorm(torch.nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(dim))

    def forward(self, x: Tensor):
        x_dtype = x.dtype
        x = x.float()
        rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
        return (x * rrms).to(dtype=x_dtype) * self.scale


class QKNorm(torch.nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.query_norm = RMSNorm(dim)
        self.key_norm = RMSNorm(dim)

    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
        q = self.query_norm(q)
        k = self.key_norm(k)
        return q.to(v), k.to(v)


class SelfAttention(nn.Module):
    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.norm = QKNorm(head_dim)
        self.proj = nn.Linear(dim, dim)

    def forward(self, x: Tensor, pe: Tensor, mask: Tensor | None = None) -> Tensor:
        qkv = self.qkv(x)
        q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
        q, k = self.norm(q, k, v)
        x = attention(q, k, v, pe=pe, mask=mask)
        x = self.proj(x)
        return x


class CrossAttention(nn.Module):
    def __init__(self, dim: int, context_dim: int, num_heads: int = 8, qkv_bias: bool = False):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        # keys/values are projected from the context features into the query width
        self.kv = nn.Linear(context_dim, dim * 2, bias=qkv_bias)
        self.norm = QKNorm(head_dim)
        self.proj = nn.Linear(dim, dim)

    def forward(self, x: Tensor, context: Tensor, pe: Tensor, mask: Tensor | None = None) -> Tensor:
        # queries come from x, keys/values from the context sequence
        q = rearrange(self.q(x), "B L (H D) -> B H L D", H=self.num_heads)
        k, v = rearrange(self.kv(context), "B S (K H D) -> K B H S D", K=2, H=self.num_heads)
        q, k = self.norm(q, k, v)
        x = attention(q, k, v, pe=pe, mask=mask)
        x = self.proj(x)
        return x


@dataclass
class ModulationOut:
    shift: Tensor
    scale: Tensor
    gate: Tensor


class Modulation(nn.Module):
    def __init__(self, dim: int, double: bool):
        super().__init__()
        self.is_double = double
        self.multiplier = 6 if double else 3
        self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)

    def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
        out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)

        return (
            ModulationOut(*out[:3]),
            ModulationOut(*out[3:]) if self.is_double else None,
        )
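
# Shape sketch (illustrative): Modulation(dim, double=True) maps vec of shape
# (B, dim) through SiLU + Linear to six chunks of shape (B, 1, dim), returned as
# two ModulationOut triples (shift, scale, gate); with double=False only the
# first triple is produced and the second element is None.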


class DoubleStreamBlock(nn.Module):
    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, backend: str = "pytorch"):
        super().__init__()

        mlp_hidden_dim = int(hidden_size * mlp_ratio)
        self.num_heads = num_heads
        self.hidden_size = hidden_size
        self.img_mod = Modulation(hidden_size, double=True)
        self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)

        self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.img_mlp = nn.Sequential(
            nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
            nn.GELU(approximate="tanh"),
            nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
        )

        self.backend = backend

        self.txt_mod = Modulation(hidden_size, double=True)
        self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)

        self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.txt_mlp = nn.Sequential(
            nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
            nn.GELU(approximate="tanh"),
            nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
        )

    def forward(self, x: Tensor, vec: Tensor, pe: Tensor, mask: Tensor | None = None, txt_length: int | None = None):
        img_mod1, img_mod2 = self.img_mod(vec)
        txt_mod1, txt_mod2 = self.txt_mod(vec)

        # x packs the text tokens first, followed by the image tokens
        txt, img = x[:, :txt_length], x[:, txt_length:]

        # prepare image stream for attention
        img_modulated = self.img_norm1(img)
        img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
        img_qkv = self.img_attn.qkv(img_modulated)
        img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
        img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)

        # prepare text stream for attention
        txt_modulated = self.txt_norm1(txt)
        txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
        txt_qkv = self.txt_attn.qkv(txt_modulated)
        txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
        txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)

        # run joint attention over the concatenated text + image sequence
        q = torch.cat((txt_q, img_q), dim=2)
        k = torch.cat((txt_k, img_k), dim=2)
        v = torch.cat((txt_v, img_v), dim=2)
        if mask is not None:
            mask = repeat(mask, "B L S -> B H L S", H=self.num_heads)
        attn = attention(q, k, v, pe=pe, mask=mask, backend=self.backend)
        txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:]

        # update the img blocks
        img = img + img_mod1.gate * self.img_attn.proj(img_attn)
        img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)

        # update the txt blocks
        txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
        txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
        x = torch.cat((txt, img), 1)
        return x
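
# Usage sketch (illustrative; the sizes are assumptions, not the model config):
#   block = DoubleStreamBlock(hidden_size=1024, num_heads=16, mlp_ratio=4.0, qkv_bias=True)
#   x   : (B, txt_length + img_length, 1024)   packed text-then-image tokens
#   vec : (B, 1024)                            conditioning vector
#   pe  : (B, 1, txt_length + img_length, 32, 2, 2)
#   out = block(x, vec, pe, txt_length=txt_length)   # same shape as x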


class SingleStreamBlock(nn.Module):
    """
    A DiT block with parallel linear layers as described in
    https://arxiv.org/abs/2302.05442 and adapted modulation interface.
    """

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qk_scale: float | None = None,
        backend: str = "pytorch",
    ):
        super().__init__()
        self.hidden_dim = hidden_size
        self.num_heads = num_heads
        head_dim = hidden_size // num_heads
        self.scale = qk_scale or head_dim**-0.5

        self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
        # qkv and mlp_in
        self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
        # proj and mlp_out
        self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)

        self.norm = QKNorm(head_dim)

        self.hidden_size = hidden_size
        self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)

        self.mlp_act = nn.GELU(approximate="tanh")
        self.modulation = Modulation(hidden_size, double=False)
        self.backend = backend

    def forward(self, x: Tensor, vec: Tensor, pe: Tensor, mask: Tensor | None = None) -> Tensor:
        mod, _ = self.modulation(vec)
        x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
        qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)

        q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
        q, k = self.norm(q, k, v)
        if mask is not None:
            mask = repeat(mask, "B L S -> B H L S", H=self.num_heads)
        # compute attention
        attn = attention(q, k, v, pe=pe, mask=mask, backend=self.backend)
        # compute activation in mlp stream, cat again and run second linear layer
        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
        return x + mod.gate * output


class LastLayer(nn.Module):
    def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
        self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))

    def forward(self, x: Tensor, vec: Tensor) -> Tensor:
        shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
        x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
        x = self.linear(x)
        return x
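
# End-to-end shape sketch (illustrative only; every size below is an assumption,
# not the released configuration):
#   hidden_size, num_heads, patch_size, out_channels = 1024, 16, 2, 16
#   pe_embedder  = EmbedND(dim=hidden_size // num_heads, theta=10000, axes_dim=[16, 24, 24])
#   double_block = DoubleStreamBlock(hidden_size, num_heads, mlp_ratio=4.0, qkv_bias=True)
#   single_block = SingleStreamBlock(hidden_size, num_heads)
#   last_layer   = LastLayer(hidden_size, patch_size, out_channels)
#   vec = MLPEmbedder(in_dim=256, hidden_dim=hidden_size)(timestep_embedding(t, 256))
#   x -> double_block(x, vec, pe, txt_length=txt_len) -> single_block(..., vec, pe)
#     -> last_layer(..., vec)   # (B, L, patch_size**2 * out_channels)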