revert transformer.py
- audiocraft/lm.py +20 -42
- audiocraft/transformer.py +204 -39
- audiocraft/utils/utils.py +7 -64
- demo.py +2 -2
- live_api.py +8 -6
audiocraft/lm.py
CHANGED
@@ -246,32 +246,29 @@ class LMModel(StreamingModule):
     def _sample_next_token(self,
                            sequence,
                            cfg_conditions,
-                           unconditional_state,
-                           use_sampling=False,
-                           temp: float = 1.0,
-                           top_k: int = 0,
-                           top_p: float = 0.0,
-                           cfg_coef: tp.Optional[float] = None,
-                           two_step_cfg: tp.Optional[bool] = None) -> torch.Tensor:
+                           unconditional_state):
         """self.n_draw"""
         B = sequence.shape[0]
-        cfg_coef = self.cfg_coef if cfg_coef is None else cfg_coef
-        model = self if self._fsdp is None else self._fsdp
-        two_step_cfg = self.two_step_cfg if two_step_cfg is None else two_step_cfg
-        condition_tensors = cfg_conditions
-
+
+        model = self if self._fsdp is None else self._fsdp
+
+        condition_tensors = cfg_conditions
+        # logits = [2, 4, 1, 2048]
         logits = model(
             sequence,  # cond_logits = wav condition
             conditions=[], condition_tensors=condition_tensors)  # uncond_logits already see the text
 
-        # print(f'{logits.shape=} L')
-        logits = logits[0, :, :, :].transpose(1, 0)  # sample expects [1, 4, 2048]
-        # logits = [2, 4, 1, 2048]
-        # print(f'{B=}, {logits.shape=} SAMPLER {top_k=}')
-        next_token = utils.sample_top_k(logits, k=top_k, n_draw=self.n_draw)  # [1, 4, 2048] logits
+        # use cfg
+        # logits = (3 * logits[1, :, :, :] - 2.4 * logits[0, :, :, :]).transpose(1, 0)
+
+        # or use 1 of logits
+        logits = logits[0, :, :, :].transpose(1, 0)  # [2, 4, 1, 2048] -> [1, 4, 2048]
+
+        # print(f'{B=}, {logits.shape=} SAMPLER {top_k=}')
+        next_token = utils.sample_top_k(logits, n_draw=self.n_draw)  # [1, 4, 2048] logits
         return next_token
 
     # GENERATE class revert_codebook_patterns()
@@ -282,15 +279,7 @@ class LMModel(StreamingModule):
                  num_samples = 1,  # THIS IS HOW MANY GENERATIONS - A SAMPLE IS A FULL WAV
                  max_gen_len=256,  # unduplicated sequence length - actual len will be n_draw * maxgenlen
                  use_sampling: bool = True,
-
-                 top_k: int = 250,
-                 top_p: float = 0.0,
-                 cfg_coef: tp.Optional[float] = None,
-                 two_step_cfg: tp.Optional[bool] = None,
-                 remove_prompts: bool = False,
-                 check: bool = False,
-                 callback: tp.Optional[tp.Callable[[int, int], None]] = None,
-                 **kwargs) -> torch.Tensor:
+                 **kwargs):
 
         print(f'{num_samples=}')
         first_param = next(iter(self.parameters()))
@@ -365,32 +354,21 @@ class LMModel(StreamingModule):
             next_token = self._sample_next_token(
                 curr_sequence,
                 cfg_conditions,
-                unconditional_state,
-                use_sampling,
-                temp, top_k, top_p,
-                cfg_coef=cfg_coef,
-                two_step_cfg=two_step_cfg)  # [5, 4, 1]
-            print(f'{next_token.shape=}')
-            # replicate the sequence to hold 5 or more sequences as we generate 5 tokens or more
-
-            # ensure the tokens that should be masked are properly set to special_token_id
-            # as the model never output special_token_id
-            # valid_mask = mask[..., offset:offset+1].expand(B, -1, -1)
-            # next_token[~valid_mask] = self.special_token_id
-            # print(f'{unconditional_state=} \n
-            # print('Set All to Special')
+                unconditional_state)  # [5, 4, 1]
 
+            # RUNS with = 2047 just different of self.special_token_id = 2047 = alwayssingletoken = drill noise
             # special_token_id is filler for CODEBOOK_PATTERN ?
 
             # next_token[:] = self.special_token_id  # seanet.embed torch.embedding does not have this - out of bounds in detokenize
 
             _gen_sequence[..., offset:offset+1] = next_token[0, :, :]  # gen_sequence.shape=torch.Size([1, 4, 39])
 
             duplicate_draw.append(next_token)
 
             prev_offset = offset
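
A minimal sketch of the per-step flow after this change, assuming the shapes given in the inline comments ([2, 4, 1, 2048] stacked conditional/unconditional logits over 4 codebooks); the top-k filtering done by utils.sample_top_k is elided and the variable names are illustrative, not part of the fork's API. It shows how dropping the CFG mix and drawing n_draw candidates per step yields a [n_draw, 4, 1] token block, of which only the first draw is written back into the generated sequence.

import torch

n_draw = 5
logits = torch.randn(2, 4, 1, 2048)               # [cond/uncond, n_q, T=1, card]

# classifier-free guidance would blend the two branches, e.g. the commented-out
# line above (3 * logits[1] - 2.4 * logits[0]); the reverted code keeps one branch
logits = logits[0, :, :, :].transpose(1, 0)       # -> [1, 4, 2048]

probs = torch.softmax(logits, dim=-1).reshape(-1, logits.shape[-1])        # [4, 2048]
next_token = torch.multinomial(probs, num_samples=n_draw).T[:, :, None]    # [5, 4, 1]

# only next_token[0] is written into _gen_sequence at this offset; the other
# draws are appended to duplicate_draw for the elongated (n_draw x) output
print(next_token.shape, next_token[0].shape)      # torch.Size([5, 4, 1]) torch.Size([4, 1])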
audiocraft/transformer.py
CHANGED
@@ -86,6 +86,7 @@ def create_sin_embedding(positions: torch.Tensor, dim: int, max_period: float =
     adim = torch.arange(half_dim, device=positions.device, dtype=dtype).view(1, 1, -1)
     max_period_tensor = torch.full([], max_period, device=positions.device, dtype=dtype)  # avoid sync point
     phase = positions / (max_period_tensor ** (adim / (half_dim - 1)))
+    print('==============CONCAT 3 ============')
     return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
 
 
@@ -177,7 +178,7 @@ class StreamingMultiheadAttention(StreamingModule):
         self.past_context = past_context
         self.memory_efficient = memory_efficient
         self.attention_as_float32 = attention_as_float32
-
+        self.rope = rope
         self.cross_attention = cross_attention
         self.safe_streaming = safe_streaming
         self.num_heads = num_heads
@@ -230,8 +231,41 @@ class StreamingMultiheadAttention(StreamingModule):
                     state_dict[prefix + "mha." + key] = state_dict.pop(prefix + key)
         super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
 
+    def _get_mask(self, current_steps: int, device: torch.device, dtype: torch.dtype):
+        # Return a causal mask, accounting for potentially stored past keys/values
+        # We actually return a bias for the attention score, as this has the same
+        # convention both in the builtin MHA in Pytorch, and Xformers functions.
+        time_dim = _get_attention_time_dimension(self.memory_efficient)
+        if self.memory_efficient:
+            from xformers.ops import LowerTriangularMask
+            if current_steps == 1:
+                # If we only have one step, then we do not need a mask.
+                return None
+            elif 'past_keys' in self._streaming_state:
+                raise RuntimeError("Not supported at the moment")
+            else:
+                # Then we can safely use a lower triangular mask
+                return LowerTriangularMask()
+        if self._streaming_state:
+            past_keys = self._streaming_state['past_keys']
+            past_steps = past_keys.shape[time_dim]
+        else:
+            past_steps = 0
+
+        queries_pos = torch.arange(
+            past_steps, current_steps + past_steps, device=device).view(-1, 1)
+        keys_pos = torch.arange(past_steps + current_steps, device=device).view(1, -1)
+        delta = queries_pos - keys_pos
+        valid = delta >= 0
+        if self.past_context is not None:
+            valid &= (delta <= self.past_context)
+        return torch.where(
+            valid,
+            torch.zeros([], device=device, dtype=dtype),
+            torch.full([], float('-inf'), device=device, dtype=dtype))
+
     def _complete_kv(self, k, v):
         time_dim = _get_attention_time_dimension(self.memory_efficient)
         if self.cross_attention:
             # With cross attention we assume all keys and values
@@ -240,16 +274,15 @@ class StreamingMultiheadAttention(StreamingModule):
             return k, v
         # Complete the key/value pair using the streaming state.
         if self._streaming_state:
-            # print('{self._streaming_state.keys()=}') EMPTY - ALTHOUGH WE HAVE STREAMING STATE
             pk = self._streaming_state['past_keys']
             nk = torch.cat([pk, k], dim=time_dim)
+            print('==============CONCAT 1===============')
             if v is k:
                 nv = nk
             else:
                 pv = self._streaming_state['past_values']
                 nv = torch.cat([pv, v], dim=time_dim)
+            print('==============CONCAT 2================')
         else:
             nk = k
             nv = v
@@ -257,28 +290,35 @@ class StreamingMultiheadAttention(StreamingModule):
         assert nk.shape[time_dim] == nv.shape[time_dim]
         offset = 0
         if self.past_context is not None:
             offset = max(0, nk.shape[time_dim] - self.past_context)
         if self._is_streaming:
             self._streaming_state['past_keys'] = nk[:, offset:]
             if v is not k:
                 self._streaming_state['past_values'] = nv[:, offset:]
             if 'offset' in self._streaming_state:
                 self._streaming_state['offset'] += offset
             else:
                 self._streaming_state['offset'] = torch.tensor(0)
         return nk, nv
 
+    def _apply_rope(self, query: torch.Tensor, key: torch.Tensor):
+        time_dim = _get_attention_time_dimension(self.memory_efficient)
+        # Apply rope embeddings to query and key tensors.
+        assert self.rope is not None
+        if 'past_keys' in self._streaming_state:
+            past_keys_offset = self._streaming_state['past_keys'].shape[1]
+        else:
+            past_keys_offset = 0
+        if 'offset' in self._streaming_state:
+            past_context_offset = int(self._streaming_state['offset'].item())
+        else:
+            past_context_offset = 0
+        streaming_offset = past_context_offset + past_keys_offset
+        return self.rope.rotate_qk(query, key, start=streaming_offset, time_dim=time_dim)
 
     def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
                 key_padding_mask=None, need_weights=False, attn_mask=None,
                 average_attn_weights=True, is_causal=False):
-
-
         assert not is_causal, ("New param added in torch 2.0.1 not supported, "
                                "use the causal args in the constructor.")
 
@@ -292,22 +332,29 @@ class StreamingMultiheadAttention(StreamingModule):
         assert self.causal or self.cross_attention, \
             "Streaming only available for causal or cross attention"
 
+        custom_attn_mask = attn_mask is not None
 
+        if self.causal:
+            assert attn_mask is None
+            # At the moment we specialize only for the self-attention case.
+            assert query.shape[1] == key.shape[1], "Causal only for same length query / key / value"
+            assert value.shape[1] == key.shape[1], "Causal only for same length query / key / value"
+            attn_mask = self._get_mask(query.shape[1], query.device, query.dtype)
 
         if self.custom:
+            # custom implementation
+            assert need_weights is False
+            assert key_padding_mask is None
             if self.cross_attention:
                 # Different queries, keys, values, we have to spit manually the weights
                 # before applying the linear.
                 dim = self.in_proj_weight.shape[0] // 3
+                if self.in_proj_bias is None:
+                    bias_q, bias_k, bias_v = None, None, None
+                else:
+                    bias_q = self.in_proj_bias[:dim]
+                    bias_k = self.in_proj_bias[dim: 2 * dim]
+                    bias_v = self.in_proj_bias[2 * dim:]
                 q = nn.functional.linear(query, self.in_proj_weight[:dim], bias_q)
                 # todo: when streaming, we could actually save k, v and check the shape actually match.
                 k = nn.functional.linear(key, self.in_proj_weight[dim: 2 * dim], bias_k)
@@ -323,31 +370,125 @@ class StreamingMultiheadAttention(StreamingModule):
                 assert value is key, "specialized implementation"
                 projected = nn.functional.linear(query, self.in_proj_weight, self.in_proj_bias)
                 if self.kv_repeat == 1:
                     if time_dim == 2:
                         bound_layout = "b h p t d"
                     else:
                         bound_layout = "b t p h d"
                     packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
                     q, k, v = ops.unbind(packed, dim=2)
+                else:
+                    embed_dim = self.embed_dim
+                    per_head_dim = (embed_dim // self.num_heads)
+                    kv_heads = self.num_heads // self.kv_repeat
+                    q = projected[:, :, :embed_dim]
+                    start = embed_dim
+                    end = start + per_head_dim * kv_heads
+                    k = projected[:, :, start: end]
+                    v = projected[:, :, end:]
+                    q = rearrange(q, f"b t (h d) -> {layout}", h=self.num_heads)
+                    k = rearrange(k, f"b t (h d) -> {layout}", h=kv_heads)
+                    v = rearrange(v, f"b t (h d) -> {layout}", h=kv_heads)
 
+            if self.qk_layer_norm is True:
+                assert self.kv_repeat == 1
+                q, k = [rearrange(x, f"{layout} -> b t (h d)") for x in [q, k]]
+                q = self.q_layer_norm(q)
+                k = self.k_layer_norm(k)
+                q, k = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k]]
+            if self.rope:
+                q, k = self._apply_rope(q, k)
             k, v = self._complete_kv(k, v)
-
-
-
-
-            q, k, v
-
-
+            if self.kv_repeat > 1:
+                k = expand_repeated_kv(k, self.kv_repeat, self.memory_efficient)
+                v = expand_repeated_kv(v, self.kv_repeat, self.memory_efficient)
+            if self.attention_as_float32:
+                q, k, v = [x.float() for x in [q, k, v]]
+            if self.memory_efficient:
+                if custom_attn_mask:
+                    # When using a custom attn mask:
+                    # Move to query's device, repeat for each sample, remove align8 padding
+                    seq_len = query.shape[1]
+                    attn_mask = attn_mask.to(q.dtype)
+                    attn_mask = attn_mask.repeat((q.shape[0], 1, 1, 1))
+                    attn_mask = attn_mask[..., :seq_len, :seq_len]
+
+                p = self.dropout if self.training else 0
+                if _efficient_attention_backend == 'torch':
+                    x = torch.nn.functional.scaled_dot_product_attention(
+                        q, k, v, is_causal=attn_mask is not None, dropout_p=p)
+                else:
+                    x = ops.memory_efficient_attention(q, k, v, attn_mask, p=p)
+            else:
+                # We include the dot product as float32, for consistency
+                # with the other implementations that include that step
+                # as part of the attention. Note that when using `autocast`,
+                # the einsums would be done as bfloat16, but the softmax
+                # would be done as bfloat16, so `attention_as_float32` will
+                # extend a bit the range of operations done in float32,
+                # although this should make no difference.
+                q = q / q.shape[-1] ** 0.5
+                key_layout = layout.replace('t', 'k')
+                query_layout = layout
+                if self._is_streaming and self.safe_streaming and q.device.type == 'cuda':
+                    with torch.autocast(device_type=q.device.type, dtype=torch.float32):
+                        pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
+                else:
+                    pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
+                if attn_mask is not None:
+                    pre_w = pre_w + attn_mask
+                w = torch.softmax(pre_w, dim=-1)
+                w = F.dropout(w, self.dropout, training=self.training).to(v)
+                # Key and value have the same format.
+                x = torch.einsum(f"b h t k, {key_layout} -> {layout}", w, v)
             x = x.to(dtype)
             x = rearrange(x, f"{layout} -> b t (h d)", h=self.num_heads)
             x = self.out_proj(x)
+        else:
+            key, value = self._complete_kv(key, value)
+            if self.attention_as_float32:
+                query, key, value = [x.float() for x in [query, key, value]]
+            x, _ = self.mha(
+                query, key, value, key_padding_mask,
+                need_weights, attn_mask, average_attn_weights)
+            x = x.to(dtype)
+
         return x, None
 
 
 class StreamingTransformerLayer(nn.TransformerEncoderLayer):
+    """TransformerLayer with Streaming / Causal support.
+    This also integrates cross_attention, when passing `cross_attention=True`,
+    rather than having two separate classes like in PyTorch.
 
+    Args:
+        d_model (int): Dimension of the data.
+        num_heads (int): Number of heads.
+        dim_feedforward (int): Intermediate dimension of FF module.
+        dropout (float): Dropout both for MHA and FF.
+        bias_ff (bool): Use bias for FF.
+        bias_attn (bool): Use bias for MHA.
+        causal (bool): Causal mask applied automatically.
+        past_context (int, optional): Receptive field for the causal mask, infinite if None.
+        custom (bool): Use custom MHA implementation, for testing / benchmarking.
+        memory_efficient (bool): Use xformers based memory efficient attention.
+        attention_as_float32 (bool): Perform the attention as float32
+            (especially important with memory_efficient as autocast won't do this automatically).
+        qk_layer_norm (bool): Layer normalization applied to queries and keys before dot product in attention.
+        qk_layer_norm_cross (bool): Same for the cross attention.
+        cross_attention (bool): If True, expect to get secondary input for cross-attention.
+            Cross attention will use the default MHA, as it typically won't require
+            special treatment.
+        layer_scale (float, optional): If not None, LayerScale will be used with
+            the given value as initial scale.
+        rope (`RotaryEmbedding`, optional): Rope embedding to use.
+        attention_dropout (float, optional): If not None, separate the value of the dimension dropout
+            in FFN and of the attention dropout.
+        kv_repeat (int): If > 1, will repeat keys and queries multiple times (need to divide num_heads).
+            This will lead to faster decoding time on A100 or other GPUs with tensorcore.
+        device (torch.device, optional): Device on which to initialize.
+        dtype (torch.dtype, optional): dtype to use.
+        **kwargs: See `nn.TransformerEncoderLayer`.
+    """
     def __init__(self, d_model: int, num_heads: int, dim_feedforward: int = 2048, dropout: float = 0.1,
                  bias_ff: bool = True, bias_attn: bool = True, causal: bool = False,
                  past_context: tp.Optional[int] = None, custom: bool = False,
@@ -495,7 +636,6 @@ class StreamingTransformer(StreamingModule):
         assert positional_embedding in ['sin', 'rope', 'sin_rope']
         self.rope: tp.Optional[RotaryEmbedding] = None
         if self.positional_embedding in ['rope', 'sin_rope']:
-            print('ROPE\nL')
             assert _is_custom(custom, memory_efficient)
             self.rope = RotaryEmbedding(d_model // num_heads, max_period=max_period,
                                         xpos=xpos, scale=positional_scale, device=device)
@@ -523,11 +663,39 @@ class StreamingTransformer(StreamingModule):
                 # backward hook inside of FSDP...
                 layer._magma_checkpointed = True  # type: ignore
 
+    def _apply_layer(self, layer, *args, **kwargs):
+        method = self.checkpointing
+        if method == 'none':
+            return layer(*args, **kwargs)
+        elif method == 'torch':
+            return torch_checkpoint(layer, *args, use_reentrant=False, **kwargs)
+        elif method.startswith('xformers'):
+            from xformers.checkpoint_fairinternal import checkpoint, _get_default_policy
+            if method == 'xformers_default':
+                # those operations will be saved, and not recomputed.
+                # According to Francisco we can get smarter policies but this is a good start.
+                allow_list = [
+                    "xformers.efficient_attention_forward_cutlass.default",
+                    "xformers_flash.flash_fwd.default",
+                    "aten.addmm.default",
+                    "aten.mm.default",
+                ]
+            elif method == 'xformers_mm':
+                # those operations will be saved, and not recomputed.
+                # According to Francisco we can get smarter policies but this is a good start.
+                allow_list = [
+                    "aten.addmm.default",
+                    "aten.mm.default",
+                ]
+            else:
+                raise ValueError(f"xformers checkpointing xformers policy {method} is not known.")
+            policy_fn = _get_default_policy(allow_list)
+            return checkpoint(layer, *args, policy_fn=policy_fn, **kwargs)
+        else:
+            raise ValueError(f"Checkpointing method {method} is unknown.")
 
     def forward(self, x: torch.Tensor, *args, **kwargs):
-
-        # Output x: [2, 1, 1536] how is batch expanded to 2
         B, T, C = x.shape
 
         if 'offsets' in self._streaming_state:
@@ -536,20 +704,17 @@ class StreamingTransformer(StreamingModule):
             offsets = torch.zeros(B, dtype=torch.long, device=x.device)
 
         if self.positional_embedding in ['sin', 'sin_rope']:
-            # print(f'{self.positional_embedding=}\n') 'sin'
            positions = torch.arange(T, device=x.device).view(1, -1, 1)
            positions = positions + offsets.view(-1, 1, 1)
            pos_emb = create_sin_embedding(positions, C, max_period=self.max_period, dtype=x.dtype)
            x = x + self.positional_scale * pos_emb
 
         for layer in self.layers:
-
-            # # kwargs=() kwargs={'cross_attention_src', 'src_mask'}
-            x = layer(x, **kwargs)
+            x = self._apply_layer(layer, x, *args, **kwargs)
 
         if self._is_streaming:
             self._streaming_state['offsets'] = offsets + T
 
         return x
 
     def make_optim_group(self):
@@ -592,4 +757,4 @@ def _verify_xformers_internal_compat():
 
 
 def _is_custom(custom: bool, memory_efficient: bool):
-    return custom or memory_efficient
+    return custom or memory_efficient
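
The restored _get_mask returns an additive bias rather than a boolean mask: 0 where a query may attend and -inf elsewhere, optionally windowed by past_context. Below is a standalone sketch of the non-memory-efficient path, assuming no streaming state; the function name is illustrative only.

import torch

def causal_bias(current_steps, past_steps=0, past_context=None,
                device='cpu', dtype=torch.float32):
    # queries may attend to keys at or before their own position...
    queries_pos = torch.arange(past_steps, current_steps + past_steps, device=device).view(-1, 1)
    keys_pos = torch.arange(past_steps + current_steps, device=device).view(1, -1)
    delta = queries_pos - keys_pos
    valid = delta >= 0
    # ...and, when past_context is set, at most past_context steps back
    if past_context is not None:
        valid &= (delta <= past_context)
    return torch.where(valid,
                       torch.zeros([], device=device, dtype=dtype),
                       torch.full([], float('-inf'), device=device, dtype=dtype))

print(causal_bias(4, past_context=2))
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [-inf, 0., 0., 0.]])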
audiocraft/utils/utils.py
CHANGED
@@ -1,23 +1,11 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-
-from contextlib import contextmanager
-from functools import wraps, lru_cache
 import hashlib
 import json
 import logging
-from pathlib import Path
 import typing as tp
-
 import flashy
 import flashy.distrib
 import omegaconf
 import torch
-from torch.nn.utils.rnn import pad_sequence
 
 
 logger = logging.getLogger(__name__)
@@ -46,13 +34,7 @@ def dict_from_config(cfg: omegaconf.DictConfig) -> dict:
     return dct
 
 
-def random_subset(dataset, max_samples: int, seed: int = 42) -> torch.utils.data.Subset:
-    if max_samples >= len(dataset):
-        return dataset
-
-    generator = torch.Generator().manual_seed(seed)
-    perm = torch.randperm(len(dataset), generator=generator)
-    return torch.utils.data.Subset(dataset, perm[:max_samples].tolist())
 
 
 def get_loader(dataset, num_samples: tp.Optional[int], batch_size: int,
@@ -89,67 +71,28 @@ def get_dataset_from_loader(dataloader):
 
 
 
-def sample_top_k(p, k, n_draw=None):
+def sample_top_k(p, k=250, n_draw=None):
     """
     p probabs 2048 ?
     num_draw : how many tokens to sample (for duplicate elongation)
     """
 
-    p = torch.softmax(p
+    p = torch.softmax(p, dim=-1)  # p/temp
 
 
     top_k_value, i250 = torch.topk(p, k, dim=-1)  # probs: [1, 4, 2048]
+    # print('\n_____TOPK________\n', top_k_value.shape, top_k_value[0, 0, :10], '\n___________END_TOPK____________\n')
     min_value_top_k = top_k_value[..., [-1]]  #
     p *= (p >= min_value_top_k).float()
     p.div_(p.sum(dim=-1, keepdim=True))
     # -- next_token = multinomial(probs, num_samples=num_draw)
+
+    # RESHAPED into bs, 4, 250
     p_ = p.reshape(-1, p.shape[-1])
+
+
     out = torch.multinomial(p_,
                             num_samples=n_draw,
                             replacement=False)  # [4, num_draw]
     return out.transpose(0, 1)[:, :, None]  # [num_draw, 4, 1]
-
-
-
-
-
-def length_to_mask(lengths: torch.Tensor, max_len: tp.Optional[int] = None) -> torch.Tensor:
-    """Utility function to convert a tensor of sequence lengths to a mask (useful when working on padded sequences).
-    For example: [3, 5] => [[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]]
-
-    Args:
-        lengths (torch.Tensor): tensor with lengths
-        max_len (int): can set the max length manually. Defaults to None.
-    Returns:
-        torch.Tensor: mask with 0s where there is pad tokens else 1s
-    """
-    assert len(lengths.shape) == 1, "Length shape should be 1 dimensional."
-    final_length = lengths.max().item() if not max_len else max_len
-    final_length = max(final_length, 1)  # if all seqs are of len zero we don't want a zero-size tensor
-    return torch.arange(final_length, device=lengths.device)[None, :] < lengths[:, None]
-
-
-def collate(tensors: tp.List[torch.Tensor], dim: int = 0) -> tp.Tuple[torch.Tensor, torch.Tensor]:
-    """Get a list of tensors and collate them to a single tensor. according to the following logic:
-    - `dim` specifies the time dimension which will be stacked and padded.
-    - The output will contain 1 new dimension (dimension index 0) which will be the size of
-      of the original list.
-
-    Args:
-        tensors (tp.List[torch.Tensor]): List of tensors to collate.
-        dim (int): Dimension which will be stacked and padded.
-    Returns:
-        tp.Tuple[torch.Tensor, torch.Tensor]:
-            torch.Tensor: Stacked and padded tensor. The output will contain 1 new dimension
-                (dimension index 0) which will be the size of the original list.
-            torch.Tensor: Tensor containing length of original tensor sizes (without padding).
-    """
-    tensors = [x.transpose(0, dim) for x in tensors]
-    lens = torch.LongTensor([len(x) for x in tensors])
-    padded_tensors = pad_sequence(tensors)
-    padded_tensors = padded_tensors.transpose(0, 1)
-    padded_tensors = padded_tensors.transpose(1, dim + 1)
-    return padded_tensors, lens
-
-
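
A usage sketch for the patched sample_top_k, assuming this fork is importable as audiocraft.utils.utils; the input is raw logits for 4 codebooks over a card of 2048, and k now defaults to 250.

import torch
from audiocraft.utils.utils import sample_top_k

logits = torch.randn(1, 4, 2048)                 # [1, n_q, card], as produced in lm.py
tokens = sample_top_k(logits, n_draw=3)          # k defaults to 250 -> [3, 4, 1]
print(tokens.shape)                              # torch.Size([3, 4, 1])

# each draw should fall inside the top-250 ids of its codebook
top250 = torch.topk(logits, 250, dim=-1).indices[0]                          # [4, 250]
print(all(tokens[d, q, 0] in top250[q] for d in range(3) for q in range(4)))  # True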
demo.py
CHANGED
@@ -4,10 +4,10 @@ import numpy as np
 
 print('\n\n\n\n___________________')
 
-txt = '
+txt = 'dogs in street'
 
 sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
-sound_generator.set_generation_params(duration
+sound_generator.set_generation_params(duration=1.7)  # why is generating so long at 14 seconds
 
 x = sound_generator.generate([txt])[0].detach().cpu().numpy()[0, :]
 x /= np.abs(x).max() + 1e-7
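
A possible continuation of the demo, assuming soundfile is installed (it is not part of this repo): write the normalised mono waveform to disk at AudioGen's native rate.

import numpy as np
import soundfile as sf   # assumption: soundfile is not a dependency of this repo

# x is the normalised mono waveform produced above; sample_rate is 16000 for
# facebook/audiogen-medium (see the comment added in live_api.py)
sf.write('dogs_in_street.wav', x.astype(np.float32), samplerate=sound_generator.sample_rate)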
live_api.py
CHANGED
@@ -17,7 +17,7 @@ from flask_cors import CORS
 from audiocraft.audiogen import AudioGen  #, audio_write
 
 sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
-sound_generator.set_generation_params(duration
+sound_generator.set_generation_params(duration=.7)
 
 
 # ====STYLE VECTOR====
@@ -51,11 +51,13 @@ def tts_multi_sentence(scene=None):
         x = sound_generator.generate([scene])[0].detach().cpu().numpy()[0, :]
 
         x /= np.abs(x).max() + 1e-7
-        #
-
-
-
-
+        # is 16kHz - AUdiogen Fs
+        x = audresample.resample(x,
+                                 original_rate=sound_generator.sample_rate,  # 16000
+                                 target_rate=24000)[0, :]
+
+        #
         print(f'Craft Finished for: {scene}\n\n\n\n____{x.shape}')
     else:
         print(scene, '\nDrop\n')
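
A quick standalone check of the added resampling step, assuming audresample is installed; a one-second synthetic tone stands in for the generated audio, and the output length grows by the 24000/16000 ratio.

import numpy as np
import audresample

x = np.sin(2 * np.pi * 440 * np.arange(16000) / 16000).astype(np.float32)   # 1 s at 16 kHz
y = audresample.resample(x, original_rate=16000, target_rate=24000)[0, :]   # same call as in the diff
print(x.shape, y.shape)   # (16000,) (24000,)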