Dionyssos committed
Commit 2e6c69d · 1 Parent(s): 3eec6d2

revert transformer.py

Files changed (5)
  1. audiocraft/lm.py +20 -42
  2. audiocraft/transformer.py +204 -39
  3. audiocraft/utils/utils.py +7 -64
  4. demo.py +2 -2
  5. live_api.py +8 -6
audiocraft/lm.py CHANGED
@@ -246,32 +246,29 @@ class LMModel(StreamingModule):
246
  def _sample_next_token(self,
247
  sequence,
248
  cfg_conditions,
249
- unconditional_state,
250
- use_sampling=False,
251
- temp: float = 1.0,
252
- top_k: int = 0,
253
- top_p: float = 0.0,
254
- cfg_coef: tp.Optional[float] = None,
255
- two_step_cfg: tp.Optional[bool] = None) -> torch.Tensor:
256
  """self.n_draw"""
257
  B = sequence.shape[0]
258
- cfg_coef = self.cfg_coef if cfg_coef is None else cfg_coef
259
- model = self if self._fsdp is None else self._fsdp
260
- two_step_cfg = self.two_step_cfg if two_step_cfg is None else two_step_cfg
261
- condition_tensors = cfg_conditions
262
-
263
 
 
264
 
 
 
265
  logits = model(
266
  sequence, # cond_logits = wav condition
267
  conditions=[], condition_tensors=condition_tensors) # uncond_logits already see the text
268
 
269
- # print(f'{logits.shape=} L')
270
- logits = logits[0, :, :, :].transpose(1,0) # sample expects [1, 4, 2048]
271
- # logits = [2, 4, 1, 2048]
272
- # print(f'{B=}, {logits.shape=} SAMPLER {top_k=}')
273
- next_token = utils.sample_top_k(logits, k=top_k, n_draw=self.n_draw) # [1,4,2048] logits
274
275
  return next_token
276
 
277
  # GENERATE class revert_codebook_patterns()
@@ -282,15 +279,7 @@ class LMModel(StreamingModule):
282
  num_samples = 1, # THIS IS HOW MANY GENERATIONS - A SAMPLE IS A FULL WAV
283
  max_gen_len=256, # unduplicated sequence length - actual len will be n_draw * maxgenlen
284
  use_sampling: bool = True,
285
- temp: float = 1.0,
286
- top_k: int = 250,
287
- top_p: float = 0.0,
288
- cfg_coef: tp.Optional[float] = None,
289
- two_step_cfg: tp.Optional[bool] = None,
290
- remove_prompts: bool = False,
291
- check: bool = False,
292
- callback: tp.Optional[tp.Callable[[int, int], None]] = None,
293
- **kwargs) -> torch.Tensor:
294
 
295
  print(f'{num_samples=}')
296
  first_param = next(iter(self.parameters()))
@@ -365,32 +354,21 @@ class LMModel(StreamingModule):
365
  next_token = self._sample_next_token(
366
  curr_sequence,
367
  cfg_conditions,
368
- unconditional_state,
369
- use_sampling,
370
- temp, top_k, top_p,
371
- cfg_coef=cfg_coef,
372
- two_step_cfg=two_step_cfg) # [5, 4, 1]
373
- print(f'{next_token.shape=}')
374
- # replicate the sequence to hold 5 or more sequences as we generate 5 tokens or more
375
 
376
 
377
-
378
- # ensure the tokens that should be masked are properly set to special_token_id
379
- # as the model never output special_token_id
380
- # valid_mask = mask[..., offset:offset+1].expand(B, -1, -1)
381
 
382
- # next_token[~valid_mask] = self.special_token_id
383
 
384
- # print(f'{unconditional_state=} \n
385
- # print('Set All to Special')
386
 
387
- # RUNS with = 2047 just different of self.special_token_id = 2047 = drill noise
 
 
388
  # special_token_id is filler for CODEBOOK_PATTERN ?
389
 
390
  # next_token[:] = self.special_token_id # seanet.embed torch.embedding does not have this - out of bounds in detokenize
391
 
392
  _gen_sequence[..., offset:offset+1] = next_token[0, :, :] #gen_sequence.shape=torch.Size([1, 4, 39])
393
- # only cat 1 token to 1 sequence - preserve the duplicates in
394
  duplicate_draw.append(next_token)
395
 
396
  prev_offset = offset
 
246
  def _sample_next_token(self,
247
  sequence,
248
  cfg_conditions,
249
+ unconditional_state):
250
  """self.n_draw"""
251
  B = sequence.shape[0]
252
 
253
+ model = self if self._fsdp is None else self._fsdp
254
 
255
+ condition_tensors = cfg_conditions
256
+ # logits = [2, 4, 1, 2048]
257
  logits = model(
258
  sequence, # cond_logits = wav condition
259
  conditions=[], condition_tensors=condition_tensors) # uncond_logits already see the text
260
261
 
262
+ # use cfg
263
+ # logits = (3 * logits[1, :, :, :] - 2.4 * logits[0, :, :, :]).transpose(1,0)
264
+
265
+ # or use 1 of logits
266
+ logits = logits[0, :, :, :].transpose(1,0) # [2,4,1, 2048] -> [1,4,2048]
267
+
268
+
269
+
270
+ # print(f'{B=}, {logits.shape=} SAMPLER {top_k=}')
271
+ next_token = utils.sample_top_k(logits, n_draw=self.n_draw) # [1,4,2048] logits
272
  return next_token
273
 
274
  # GENERATE class revert_codebook_patterns()
 
279
  num_samples = 1, # THIS IS HOW MANY GENERATIONS - A SAMPLE IS A FULL WAV
280
  max_gen_len=256, # unduplicated sequence length - actual len will be n_draw * maxgenlen
281
  use_sampling: bool = True,
282
+ **kwargs):
283
 
284
  print(f'{num_samples=}')
285
  first_param = next(iter(self.parameters()))
 
354
  next_token = self._sample_next_token(
355
  curr_sequence,
356
  cfg_conditions,
357
+ unconditional_state) # [5, 4, 1]
358
 
359
360
 
 
361
 
 
 
362
 
363
+
364
+
365
+ # RUNS with = 2047 just different of self.special_token_id = 2047 = alwayssingletoken = drill noise
366
  # special_token_id is filler for CODEBOOK_PATTERN ?
367
 
368
  # next_token[:] = self.special_token_id # seanet.embed torch.embedding does not have this - out of bounds in detokenize
369
 
370
  _gen_sequence[..., offset:offset+1] = next_token[0, :, :] #gen_sequence.shape=torch.Size([1, 4, 39])
371
+
372
  duplicate_draw.append(next_token)
373
 
374
  prev_offset = offset
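Note on the sampling path above: the reverted `_sample_next_token` keeps classifier-free guidance only as a comment (`3 * logits[1, ...] - 2.4 * logits[0, ...]`) and now feeds a single logit branch straight to `utils.sample_top_k`. A minimal sketch of the commented-out guidance blend, assuming the leading dimension stacks the two forward passes as in that comment (the standard blend is shown; the 3 / 2.4 weighting in the diff is the commit's own variant):

```python
import torch

def cfg_blend(logits: torch.Tensor, cfg_coef: float = 3.0) -> torch.Tensor:
    """logits: [2, K, T, card], stacking two branches of the same forward pass.
    Returns [T, K, card], matching the .transpose(1, 0) used in the diff."""
    # Assumption: index 1 is the fully conditioned branch, index 0 the weaker one,
    # mirroring the commented-out line in _sample_next_token.
    cond, uncond = logits[1], logits[0]
    blended = uncond + cfg_coef * (cond - uncond)   # = 3*cond - 2*uncond for cfg_coef=3
    return blended.transpose(1, 0)
```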
audiocraft/transformer.py CHANGED
@@ -86,6 +86,7 @@ def create_sin_embedding(positions: torch.Tensor, dim: int, max_period: float =
86
  adim = torch.arange(half_dim, device=positions.device, dtype=dtype).view(1, 1, -1)
87
  max_period_tensor = torch.full([], max_period, device=positions.device, dtype=dtype) # avoid sync point
88
  phase = positions / (max_period_tensor ** (adim / (half_dim - 1)))
 
89
  return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
90
 
91
 
@@ -177,7 +178,7 @@ class StreamingMultiheadAttention(StreamingModule):
177
  self.past_context = past_context
178
  self.memory_efficient = memory_efficient
179
  self.attention_as_float32 = attention_as_float32
180
-
181
  self.cross_attention = cross_attention
182
  self.safe_streaming = safe_streaming
183
  self.num_heads = num_heads
@@ -230,8 +231,41 @@ class StreamingMultiheadAttention(StreamingModule):
230
  state_dict[prefix + "mha." + key] = state_dict.pop(prefix + key)
231
  super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
232
233
 
234
  def _complete_kv(self, k, v):
 
235
  time_dim = _get_attention_time_dimension(self.memory_efficient)
236
  if self.cross_attention:
237
  # With cross attention we assume all keys and values
@@ -240,16 +274,15 @@ class StreamingMultiheadAttention(StreamingModule):
240
  return k, v
241
  # Complete the key/value pair using the streaming state.
242
  if self._streaming_state:
243
- # print('{self._streaming_state.keys()=}') EMPTY - ALTHOUGH WE HAVE STREAMING STATE
244
  pk = self._streaming_state['past_keys']
245
  nk = torch.cat([pk, k], dim=time_dim)
 
246
  if v is k:
247
-
248
  nv = nk
249
  else:
250
-
251
  pv = self._streaming_state['past_values']
252
  nv = torch.cat([pv, v], dim=time_dim)
 
253
  else:
254
  nk = k
255
  nv = v
@@ -257,28 +290,35 @@ class StreamingMultiheadAttention(StreamingModule):
257
  assert nk.shape[time_dim] == nv.shape[time_dim]
258
  offset = 0
259
  if self.past_context is not None:
260
-
261
  offset = max(0, nk.shape[time_dim] - self.past_context)
262
  if self._is_streaming:
263
  self._streaming_state['past_keys'] = nk[:, offset:]
264
  if v is not k:
265
-
266
  self._streaming_state['past_values'] = nv[:, offset:]
267
  if 'offset' in self._streaming_state:
268
-
269
  self._streaming_state['offset'] += offset
270
  else:
271
-
272
  self._streaming_state['offset'] = torch.tensor(0)
273
  return nk, nv
274
 
275
-
276
 
277
  def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
278
  key_padding_mask=None, need_weights=False, attn_mask=None,
279
  average_attn_weights=True, is_causal=False):
280
-
281
-
282
  assert not is_causal, ("New param added in torch 2.0.1 not supported, "
283
  "use the causal args in the constructor.")
284
 
@@ -292,22 +332,29 @@ class StreamingMultiheadAttention(StreamingModule):
292
  assert self.causal or self.cross_attention, \
293
  "Streaming only available for causal or cross attention"
294
 
295
-
296
 
297
-
298
-
299
-
 
 
 
300
 
301
  if self.custom:
302
-
 
 
303
  if self.cross_attention:
304
-
305
  # Different queries, keys, values, we have to spit manually the weights
306
  # before applying the linear.
307
  dim = self.in_proj_weight.shape[0] // 3
308
-
309
- bias_q, bias_k, bias_v = None, None, None
310
-
 
 
 
311
  q = nn.functional.linear(query, self.in_proj_weight[:dim], bias_q)
312
  # todo: when streaming, we could actually save k, v and check the shape actually match.
313
  k = nn.functional.linear(key, self.in_proj_weight[dim: 2 * dim], bias_k)
@@ -323,31 +370,125 @@ class StreamingMultiheadAttention(StreamingModule):
323
  assert value is key, "specialized implementation"
324
  projected = nn.functional.linear(query, self.in_proj_weight, self.in_proj_bias)
325
  if self.kv_repeat == 1:
326
-
327
  if time_dim == 2:
328
  bound_layout = "b h p t d"
329
  else:
330
  bound_layout = "b t p h d"
331
  packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
332
  q, k, v = ops.unbind(packed, dim=2)
333
 
334
-
335
  k, v = self._complete_kv(k, v)
336
- #print(f'{k.shape=}, {v.shape=}, {q.shape=}\n\n\n\n')
337
- # what is the 24 dimension is this heads?
338
-
339
- x = torch.nn.functional.scaled_dot_product_attention(
340
- q, k, v, is_causal=attn_mask is not None, dropout_p=0)
341
-
342
-
343
  x = x.to(dtype)
344
  x = rearrange(x, f"{layout} -> b t (h d)", h=self.num_heads)
345
  x = self.out_proj(x)
346
  return x, None
347
 
348
 
349
  class StreamingTransformerLayer(nn.TransformerEncoderLayer):
 
 
 
350
351
  def __init__(self, d_model: int, num_heads: int, dim_feedforward: int = 2048, dropout: float = 0.1,
352
  bias_ff: bool = True, bias_attn: bool = True, causal: bool = False,
353
  past_context: tp.Optional[int] = None, custom: bool = False,
@@ -495,7 +636,6 @@ class StreamingTransformer(StreamingModule):
495
  assert positional_embedding in ['sin', 'rope', 'sin_rope']
496
  self.rope: tp.Optional[RotaryEmbedding] = None
497
  if self.positional_embedding in ['rope', 'sin_rope']:
498
- print('ROPE\nL')
499
  assert _is_custom(custom, memory_efficient)
500
  self.rope = RotaryEmbedding(d_model // num_heads, max_period=max_period,
501
  xpos=xpos, scale=positional_scale, device=device)
@@ -523,11 +663,39 @@ class StreamingTransformer(StreamingModule):
523
  # backward hook inside of FSDP...
524
  layer._magma_checkpointed = True # type: ignore
525
 
526
-
527
 
528
  def forward(self, x: torch.Tensor, *args, **kwargs):
529
- # Input x: [1, 1, 1536]
530
- # Output x: [2, 1, 1536] how is batch expanded to 2
531
  B, T, C = x.shape
532
 
533
  if 'offsets' in self._streaming_state:
@@ -536,20 +704,17 @@ class StreamingTransformer(StreamingModule):
536
  offsets = torch.zeros(B, dtype=torch.long, device=x.device)
537
 
538
  if self.positional_embedding in ['sin', 'sin_rope']:
539
- # print(f'{self.positional_embedding=}\n') 'sin'
540
  positions = torch.arange(T, device=x.device).view(1, -1, 1)
541
  positions = positions + offsets.view(-1, 1, 1)
542
  pos_emb = create_sin_embedding(positions, C, max_period=self.max_period, dtype=x.dtype)
543
  x = x + self.positional_scale * pos_emb
544
 
545
  for layer in self.layers:
546
- # print(f'{args=} {kwargs.keys()=}')
547
- # # kwargs=() kwargs={'cross_attention_src', 'src_mask'}
548
- x = layer(x, **kwargs)
549
 
550
  if self._is_streaming:
551
  self._streaming_state['offsets'] = offsets + T
552
- print('OUT STReamTransfor', x.shape)
553
  return x
554
 
555
  def make_optim_group(self):
@@ -592,4 +757,4 @@ def _verify_xformers_internal_compat():
592
 
593
 
594
  def _is_custom(custom: bool, memory_efficient: bool):
595
- return custom or memory_efficient
 
86
  adim = torch.arange(half_dim, device=positions.device, dtype=dtype).view(1, 1, -1)
87
  max_period_tensor = torch.full([], max_period, device=positions.device, dtype=dtype) # avoid sync point
88
  phase = positions / (max_period_tensor ** (adim / (half_dim - 1)))
89
+ print('==============CONCAT 3 ============')
90
  return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
91
 
92
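The commit only adds a debug print before the final concatenation of `create_sin_embedding`. For reference, a self-contained sketch of what that concatenation produces (simplified: scalar `max_period`, positions cast to float):

```python
import torch

def sin_embedding_sketch(positions: torch.Tensor, dim: int, max_period: float = 10000.0) -> torch.Tensor:
    # positions: [B, T, 1]; returns [B, T, dim], cosine half concatenated before the
    # sine half, i.e. the single torch.cat that the added print traces.
    assert dim % 2 == 0
    positions = positions.float()
    half_dim = dim // 2
    adim = torch.arange(half_dim, device=positions.device, dtype=torch.float32).view(1, 1, -1)
    phase = positions / (max_period ** (adim / (half_dim - 1)))
    return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
```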
 
 
178
  self.past_context = past_context
179
  self.memory_efficient = memory_efficient
180
  self.attention_as_float32 = attention_as_float32
181
+ self.rope = rope
182
  self.cross_attention = cross_attention
183
  self.safe_streaming = safe_streaming
184
  self.num_heads = num_heads
 
231
  state_dict[prefix + "mha." + key] = state_dict.pop(prefix + key)
232
  super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
233
 
234
+ def _get_mask(self, current_steps: int, device: torch.device, dtype: torch.dtype):
235
+ # Return a causal mask, accounting for potentially stored past keys/values
236
+ # We actually return a bias for the attention score, as this has the same
237
+ # convention both in the builtin MHA in Pytorch, and Xformers functions.
238
+ time_dim = _get_attention_time_dimension(self.memory_efficient)
239
+ if self.memory_efficient:
240
+ from xformers.ops import LowerTriangularMask
241
+ if current_steps == 1:
242
+ # If we only have one step, then we do not need a mask.
243
+ return None
244
+ elif 'past_keys' in self._streaming_state:
245
+ raise RuntimeError("Not supported at the moment")
246
+ else:
247
+ # Then we can safely use a lower triangular mask
248
+ return LowerTriangularMask()
249
+ if self._streaming_state:
250
+ past_keys = self._streaming_state['past_keys']
251
+ past_steps = past_keys.shape[time_dim]
252
+ else:
253
+ past_steps = 0
254
+
255
+ queries_pos = torch.arange(
256
+ past_steps, current_steps + past_steps, device=device).view(-1, 1)
257
+ keys_pos = torch.arange(past_steps + current_steps, device=device).view(1, -1)
258
+ delta = queries_pos - keys_pos
259
+ valid = delta >= 0
260
+ if self.past_context is not None:
261
+ valid &= (delta <= self.past_context)
262
+ return torch.where(
263
+ valid,
264
+ torch.zeros([], device=device, dtype=dtype),
265
+ torch.full([], float('-inf'), device=device, dtype=dtype))
266
 
267
  def _complete_kv(self, k, v):
268
+
269
  time_dim = _get_attention_time_dimension(self.memory_efficient)
270
  if self.cross_attention:
271
  # With cross attention we assume all keys and values
 
274
  return k, v
275
  # Complete the key/value pair using the streaming state.
276
  if self._streaming_state:
 
277
  pk = self._streaming_state['past_keys']
278
  nk = torch.cat([pk, k], dim=time_dim)
279
+ print('==============CONCAT 1===============')
280
  if v is k:
 
281
  nv = nk
282
  else:
 
283
  pv = self._streaming_state['past_values']
284
  nv = torch.cat([pv, v], dim=time_dim)
285
+ print('==============CONCAT 2================')
286
  else:
287
  nk = k
288
  nv = v
 
290
  assert nk.shape[time_dim] == nv.shape[time_dim]
291
  offset = 0
292
  if self.past_context is not None:
 
293
  offset = max(0, nk.shape[time_dim] - self.past_context)
294
  if self._is_streaming:
295
  self._streaming_state['past_keys'] = nk[:, offset:]
296
  if v is not k:
 
297
  self._streaming_state['past_values'] = nv[:, offset:]
298
  if 'offset' in self._streaming_state:
 
299
  self._streaming_state['offset'] += offset
300
  else:
 
301
  self._streaming_state['offset'] = torch.tensor(0)
302
  return nk, nv
303
 
304
+ def _apply_rope(self, query: torch.Tensor, key: torch.Tensor):
305
+ time_dim = _get_attention_time_dimension(self.memory_efficient)
306
+ # Apply rope embeddings to query and key tensors.
307
+ assert self.rope is not None
308
+ if 'past_keys' in self._streaming_state:
309
+ past_keys_offset = self._streaming_state['past_keys'].shape[1]
310
+ else:
311
+ past_keys_offset = 0
312
+ if 'offset' in self._streaming_state:
313
+ past_context_offset = int(self._streaming_state['offset'].item())
314
+ else:
315
+ past_context_offset = 0
316
+ streaming_offset = past_context_offset + past_keys_offset
317
+ return self.rope.rotate_qk(query, key, start=streaming_offset, time_dim=time_dim)
318
 
319
  def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
320
  key_padding_mask=None, need_weights=False, attn_mask=None,
321
  average_attn_weights=True, is_causal=False):
 
 
322
  assert not is_causal, ("New param added in torch 2.0.1 not supported, "
323
  "use the causal args in the constructor.")
324
 
 
332
  assert self.causal or self.cross_attention, \
333
  "Streaming only available for causal or cross attention"
334
 
335
+ custom_attn_mask = attn_mask is not None
336
 
337
+ if self.causal:
338
+ assert attn_mask is None
339
+ # At the moment we specialize only for the self-attention case.
340
+ assert query.shape[1] == key.shape[1], "Causal only for same length query / key / value"
341
+ assert value.shape[1] == key.shape[1], "Causal only for same length query / key / value"
342
+ attn_mask = self._get_mask(query.shape[1], query.device, query.dtype)
343
 
344
  if self.custom:
345
+ # custom implementation
346
+ assert need_weights is False
347
+ assert key_padding_mask is None
348
  if self.cross_attention:
 
349
  # Different queries, keys, values, we have to spit manually the weights
350
  # before applying the linear.
351
  dim = self.in_proj_weight.shape[0] // 3
352
+ if self.in_proj_bias is None:
353
+ bias_q, bias_k, bias_v = None, None, None
354
+ else:
355
+ bias_q = self.in_proj_bias[:dim]
356
+ bias_k = self.in_proj_bias[dim: 2 * dim]
357
+ bias_v = self.in_proj_bias[2 * dim:]
358
  q = nn.functional.linear(query, self.in_proj_weight[:dim], bias_q)
359
  # todo: when streaming, we could actually save k, v and check the shape actually match.
360
  k = nn.functional.linear(key, self.in_proj_weight[dim: 2 * dim], bias_k)
 
370
  assert value is key, "specialized implementation"
371
  projected = nn.functional.linear(query, self.in_proj_weight, self.in_proj_bias)
372
  if self.kv_repeat == 1:
 
373
  if time_dim == 2:
374
  bound_layout = "b h p t d"
375
  else:
376
  bound_layout = "b t p h d"
377
  packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
378
  q, k, v = ops.unbind(packed, dim=2)
379
+ else:
380
+ embed_dim = self.embed_dim
381
+ per_head_dim = (embed_dim // self.num_heads)
382
+ kv_heads = self.num_heads // self.kv_repeat
383
+ q = projected[:, :, :embed_dim]
384
+ start = embed_dim
385
+ end = start + per_head_dim * kv_heads
386
+ k = projected[:, :, start: end]
387
+ v = projected[:, :, end:]
388
+ q = rearrange(q, f"b t (h d) -> {layout}", h=self.num_heads)
389
+ k = rearrange(k, f"b t (h d) -> {layout}", h=kv_heads)
390
+ v = rearrange(v, f"b t (h d) -> {layout}", h=kv_heads)
391
 
392
+ if self.qk_layer_norm is True:
393
+ assert self.kv_repeat == 1
394
+ q, k = [rearrange(x, f"{layout} -> b t (h d)") for x in [q, k]]
395
+ q = self.q_layer_norm(q)
396
+ k = self.k_layer_norm(k)
397
+ q, k = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k]]
398
+ if self.rope:
399
+ q, k = self._apply_rope(q, k)
400
  k, v = self._complete_kv(k, v)
401
+ if self.kv_repeat > 1:
402
+ k = expand_repeated_kv(k, self.kv_repeat, self.memory_efficient)
403
+ v = expand_repeated_kv(v, self.kv_repeat, self.memory_efficient)
404
+ if self.attention_as_float32:
405
+ q, k, v = [x.float() for x in [q, k, v]]
406
+ if self.memory_efficient:
407
+ if custom_attn_mask:
408
+ # When using a custom attn mask:
409
+ # Move to query's device, repeat for each sample, remove align8 padding
410
+ seq_len = query.shape[1]
411
+ attn_mask = attn_mask.to(q.dtype)
412
+ attn_mask = attn_mask.repeat((q.shape[0], 1, 1, 1))
413
+ attn_mask = attn_mask[..., :seq_len, :seq_len]
414
+
415
+ p = self.dropout if self.training else 0
416
+ if _efficient_attention_backend == 'torch':
417
+ x = torch.nn.functional.scaled_dot_product_attention(
418
+ q, k, v, is_causal=attn_mask is not None, dropout_p=p)
419
+ else:
420
+ x = ops.memory_efficient_attention(q, k, v, attn_mask, p=p)
421
+ else:
422
+ # We include the dot product as float32, for consistency
423
+ # with the other implementations that include that step
424
+ # as part of the attention. Note that when using `autocast`,
425
+ # the einsums would be done as bfloat16, but the softmax
426
+ # would be done as bfloat16, so `attention_as_float32` will
427
+ # extend a bit the range of operations done in float32,
428
+ # although this should make no difference.
429
+ q = q / q.shape[-1] ** 0.5
430
+ key_layout = layout.replace('t', 'k')
431
+ query_layout = layout
432
+ if self._is_streaming and self.safe_streaming and q.device.type == 'cuda':
433
+ with torch.autocast(device_type=q.device.type, dtype=torch.float32):
434
+ pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
435
+ else:
436
+ pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
437
+ if attn_mask is not None:
438
+ pre_w = pre_w + attn_mask
439
+ w = torch.softmax(pre_w, dim=-1)
440
+ w = F.dropout(w, self.dropout, training=self.training).to(v)
441
+ # Key and value have the same format.
442
+ x = torch.einsum(f"b h t k, {key_layout} -> {layout}", w, v)
443
  x = x.to(dtype)
444
  x = rearrange(x, f"{layout} -> b t (h d)", h=self.num_heads)
445
  x = self.out_proj(x)
446
+ else:
447
+ key, value = self._complete_kv(key, value)
448
+ if self.attention_as_float32:
449
+ query, key, value = [x.float() for x in [query, key, value]]
450
+ x, _ = self.mha(
451
+ query, key, value, key_padding_mask,
452
+ need_weights, attn_mask, average_attn_weights)
453
+ x = x.to(dtype)
454
+
455
  return x, None
456
 
457
 
458
  class StreamingTransformerLayer(nn.TransformerEncoderLayer):
459
+ """TransformerLayer with Streaming / Causal support.
460
+ This also integrates cross_attention, when passing `cross_attention=True`,
461
+ rather than having two separate classes like in PyTorch.
462
 
463
+ Args:
464
+ d_model (int): Dimension of the data.
465
+ num_heads (int): Number of heads.
466
+ dim_feedforward (int): Intermediate dimension of FF module.
467
+ dropout (float): Dropout both for MHA and FF.
468
+ bias_ff (bool): Use bias for FF.
469
+ bias_attn (bool): Use bias for MHA.
470
+ causal (bool): Causal mask applied automatically.
471
+ past_context (int, optional): Receptive field for the causal mask, infinite if None.
472
+ custom (bool): Use custom MHA implementation, for testing / benchmarking.
473
+ memory_efficient (bool): Use xformers based memory efficient attention.
474
+ attention_as_float32 (bool): Perform the attention as float32
475
+ (especially important with memory_efficient as autocast won't do this automatically).
476
+ qk_layer_norm (bool): Layer normalization applied to queries and keys before dot product in attention.
477
+ qk_layer_norm_cross (bool): Same for the cross attention.
478
+ cross_attention (bool): If True, expect to get secondary input for cross-attention.
479
+ Cross attention will use the default MHA, as it typically won't require
480
+ special treatment.
481
+ layer_scale (float, optional): If not None, LayerScale will be used with
482
+ the given value as initial scale.
483
+ rope (`RotaryEmbedding`, optional): Rope embedding to use.
484
+ attention_dropout (float, optional): If not None, separate the value of the dimension dropout
485
+ in FFN and of the attention dropout.
486
+ kv_repeat (int): If > 1, will repeat keys and queries multiple times (need to divide num_heads).
487
+ This will lead to faster decoding time on A100 or other GPUs with tensorcore.
488
+ device (torch.device, optional): Device on which to initialize.
489
+ dtype (torch.dtype, optional): dtype to use.
490
+ **kwargs: See `nn.TransformerEncoderLayer`.
491
+ """
492
  def __init__(self, d_model: int, num_heads: int, dim_feedforward: int = 2048, dropout: float = 0.1,
493
  bias_ff: bool = True, bias_attn: bool = True, causal: bool = False,
494
  past_context: tp.Optional[int] = None, custom: bool = False,
 
636
  assert positional_embedding in ['sin', 'rope', 'sin_rope']
637
  self.rope: tp.Optional[RotaryEmbedding] = None
638
  if self.positional_embedding in ['rope', 'sin_rope']:
 
639
  assert _is_custom(custom, memory_efficient)
640
  self.rope = RotaryEmbedding(d_model // num_heads, max_period=max_period,
641
  xpos=xpos, scale=positional_scale, device=device)
 
663
  # backward hook inside of FSDP...
664
  layer._magma_checkpointed = True # type: ignore
665
 
666
+ def _apply_layer(self, layer, *args, **kwargs):
667
+ method = self.checkpointing
668
+ if method == 'none':
669
+ return layer(*args, **kwargs)
670
+ elif method == 'torch':
671
+ return torch_checkpoint(layer, *args, use_reentrant=False, **kwargs)
672
+ elif method.startswith('xformers'):
673
+ from xformers.checkpoint_fairinternal import checkpoint, _get_default_policy
674
+ if method == 'xformers_default':
675
+ # those operations will be saved, and not recomputed.
676
+ # According to Francisco we can get smarter policies but this is a good start.
677
+ allow_list = [
678
+ "xformers.efficient_attention_forward_cutlass.default",
679
+ "xformers_flash.flash_fwd.default",
680
+ "aten.addmm.default",
681
+ "aten.mm.default",
682
+ ]
683
+ elif method == 'xformers_mm':
684
+ # those operations will be saved, and not recomputed.
685
+ # According to Francisco we can get smarter policies but this is a good start.
686
+ allow_list = [
687
+ "aten.addmm.default",
688
+ "aten.mm.default",
689
+ ]
690
+ else:
691
+ raise ValueError(f"xformers checkpointing xformers policy {method} is not known.")
692
+ policy_fn = _get_default_policy(allow_list)
693
+ return checkpoint(layer, *args, policy_fn=policy_fn, **kwargs)
694
+ else:
695
+ raise ValueError(f"Checkpointing method {method} is unknown.")
696
 
697
  def forward(self, x: torch.Tensor, *args, **kwargs):
698
+
 
699
  B, T, C = x.shape
700
 
701
  if 'offsets' in self._streaming_state:
 
704
  offsets = torch.zeros(B, dtype=torch.long, device=x.device)
705
 
706
  if self.positional_embedding in ['sin', 'sin_rope']:
 
707
  positions = torch.arange(T, device=x.device).view(1, -1, 1)
708
  positions = positions + offsets.view(-1, 1, 1)
709
  pos_emb = create_sin_embedding(positions, C, max_period=self.max_period, dtype=x.dtype)
710
  x = x + self.positional_scale * pos_emb
711
 
712
  for layer in self.layers:
713
+ x = self._apply_layer(layer, x, *args, **kwargs)
 
 
714
 
715
  if self._is_streaming:
716
  self._streaming_state['offsets'] = offsets + T
717
+
718
  return x
719
 
720
  def make_optim_group(self):
 
757
 
758
 
759
  def _is_custom(custom: bool, memory_efficient: bool):
760
+ return custom or memory_efficient
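The restored forward path depends on `_get_mask` building an additive causal bias (0 where a query may attend, -inf where it may not), so the same tensor works for both the built-in MHA and the xformers kernels. A standalone sketch of that construction, with illustrative parameter names:

```python
import typing as tp
import torch

def causal_attention_bias(current_steps: int,
                          past_steps: int = 0,
                          past_context: tp.Optional[int] = None,
                          device: torch.device = torch.device('cpu'),
                          dtype: torch.dtype = torch.float32) -> torch.Tensor:
    # The new queries are appended after `past_steps` cached key/value steps.
    queries_pos = torch.arange(past_steps, past_steps + current_steps, device=device).view(-1, 1)
    keys_pos = torch.arange(past_steps + current_steps, device=device).view(1, -1)
    delta = queries_pos - keys_pos
    valid = delta >= 0                     # never attend to the future
    if past_context is not None:
        valid &= delta <= past_context     # optionally limit the receptive field
    return torch.where(valid,
                       torch.zeros([], device=device, dtype=dtype),
                       torch.full([], float('-inf'), device=device, dtype=dtype))
```

For example, `causal_attention_bias(4, past_steps=8)` returns a [4, 12] bias to add to the raw attention scores.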
audiocraft/utils/utils.py CHANGED
@@ -1,23 +1,11 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
-
8
- from contextlib import contextmanager
9
- from functools import wraps, lru_cache
10
  import hashlib
11
  import json
12
  import logging
13
- from pathlib import Path
14
  import typing as tp
15
-
16
  import flashy
17
  import flashy.distrib
18
  import omegaconf
19
  import torch
20
- from torch.nn.utils.rnn import pad_sequence
21
 
22
 
23
  logger = logging.getLogger(__name__)
@@ -46,13 +34,7 @@ def dict_from_config(cfg: omegaconf.DictConfig) -> dict:
46
  return dct
47
 
48
 
49
- def random_subset(dataset, max_samples: int, seed: int = 42) -> torch.utils.data.Subset:
50
- if max_samples >= len(dataset):
51
- return dataset
52
 
53
- generator = torch.Generator().manual_seed(seed)
54
- perm = torch.randperm(len(dataset), generator=generator)
55
- return torch.utils.data.Subset(dataset, perm[:max_samples].tolist())
56
 
57
 
58
  def get_loader(dataset, num_samples: tp.Optional[int], batch_size: int,
@@ -89,67 +71,28 @@ def get_dataset_from_loader(dataloader):
89
 
90
 
91
 
92
- def sample_top_k(p, k, n_draw=None):
93
  """
94
  p probabs 2048 ?
95
  num_draw : how many tokens to sample (for duplicate elongation)
96
  """
97
 
98
- p = torch.softmax(p / 1.0, dim=-1)
99
 
100
 
101
 
102
  top_k_value, i250 = torch.topk(p, k, dim=-1) # probs: [1, 4, 2048]
 
103
  min_value_top_k = top_k_value[..., [-1]] #
104
  p *= (p >= min_value_top_k).float()
105
  p.div_(p.sum(dim=-1, keepdim=True))
106
  # -- next_token = multinomial(probs, num_samples=num_draw)
 
 
107
  p_ = p.reshape(-1, p.shape[-1])
 
 
108
  out = torch.multinomial(p_,
109
  num_samples=n_draw,
110
  replacement=False) # [4, num_draw]
111
  return out.transpose(0, 1)[:, :, None] # [num_draw, 4, 1]
112
-
113
-
114
-
115
-
116
-
117
- def length_to_mask(lengths: torch.Tensor, max_len: tp.Optional[int] = None) -> torch.Tensor:
118
- """Utility function to convert a tensor of sequence lengths to a mask (useful when working on padded sequences).
119
- For example: [3, 5] => [[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]]
120
-
121
- Args:
122
- lengths (torch.Tensor): tensor with lengths
123
- max_len (int): can set the max length manually. Defaults to None.
124
- Returns:
125
- torch.Tensor: mask with 0s where there is pad tokens else 1s
126
- """
127
- assert len(lengths.shape) == 1, "Length shape should be 1 dimensional."
128
- final_length = lengths.max().item() if not max_len else max_len
129
- final_length = max(final_length, 1) # if all seqs are of len zero we don't want a zero-size tensor
130
- return torch.arange(final_length, device=lengths.device)[None, :] < lengths[:, None]
131
-
132
-
133
- def collate(tensors: tp.List[torch.Tensor], dim: int = 0) -> tp.Tuple[torch.Tensor, torch.Tensor]:
134
- """Get a list of tensors and collate them to a single tensor. according to the following logic:
135
- - `dim` specifies the time dimension which will be stacked and padded.
136
- - The output will contain 1 new dimension (dimension index 0) which will be the size of
137
- of the original list.
138
-
139
- Args:
140
- tensors (tp.List[torch.Tensor]): List of tensors to collate.
141
- dim (int): Dimension which will be stacked and padded.
142
- Returns:
143
- tp.Tuple[torch.Tensor, torch.Tensor]:
144
- torch.Tensor: Stacked and padded tensor. The output will contain 1 new dimension
145
- (dimension index 0) which will be the size of the original list.
146
- torch.Tensor: Tensor containing length of original tensor sizes (without padding).
147
- """
148
- tensors = [x.transpose(0, dim) for x in tensors]
149
- lens = torch.LongTensor([len(x) for x in tensors])
150
- padded_tensors = pad_sequence(tensors)
151
- padded_tensors = padded_tensors.transpose(0, 1)
152
- padded_tensors = padded_tensors.transpose(1, dim + 1)
153
- return padded_tensors, lens
154
-
155
-
1
  import hashlib
2
  import json
3
  import logging
 
4
  import typing as tp
 
5
  import flashy
6
  import flashy.distrib
7
  import omegaconf
8
  import torch
 
9
 
10
 
11
  logger = logging.getLogger(__name__)
 
34
  return dct
35
 
36
 
 
 
 
37
 
 
 
 
38
 
39
 
40
  def get_loader(dataset, num_samples: tp.Optional[int], batch_size: int,
 
71
 
72
 
73
 
74
+ def sample_top_k(p, k=250, n_draw=None):
75
  """
76
  p probabs 2048 ?
77
  num_draw : how many tokens to sample (for duplicate elongation)
78
  """
79
 
80
+ p = torch.softmax(p, dim=-1) # p/temp
81
 
82
 
83
 
84
  top_k_value, i250 = torch.topk(p, k, dim=-1) # probs: [1, 4, 2048]
85
+ # print('\n_____TOPK________\n', top_k_value.shape, top_k_value[0, 0, :10], '\n___________END_TOPK____________\n')
86
  min_value_top_k = top_k_value[..., [-1]] #
87
  p *= (p >= min_value_top_k).float()
88
  p.div_(p.sum(dim=-1, keepdim=True))
89
  # -- next_token = multinomial(probs, num_samples=num_draw)
90
+
91
+ # RESHAPED into bs, 4, 250
92
  p_ = p.reshape(-1, p.shape[-1])
93
+
94
+
95
  out = torch.multinomial(p_,
96
  num_samples=n_draw,
97
  replacement=False) # [4, num_draw]
98
  return out.transpose(0, 1)[:, :, None] # [num_draw, 4, 1]
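A usage sketch for the simplified `sample_top_k` above, with the shapes its comments assume (one timestep of 4 codebooks over a 2048-entry card); `n_draw` must be passed explicitly, since the default None would fail in `torch.multinomial`:

```python
import torch
from audiocraft.utils.utils import sample_top_k

logits = torch.randn(1, 4, 2048)               # one step, 4 codebooks, card 2048
tokens = sample_top_k(logits, k=250, n_draw=5) # keep top-250, draw 5 candidates
print(tokens.shape)                            # torch.Size([5, 4, 1])
```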
 
demo.py CHANGED
@@ -4,10 +4,10 @@ import numpy as np
4
 
5
  print('\n\n\n\n___________________')
6
 
7
- txt = 'sea waves rock crash pirates'
8
 
9
  sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
10
- sound_generator.set_generation_params(duration=.7) # why is generating so long at 14 seconds
11
 
12
  x = sound_generator.generate([txt])[0].detach().cpu().numpy()[0, :]
13
  x /= np.abs(x).max() + 1e-7
 
4
 
5
  print('\n\n\n\n___________________')
6
 
7
+ txt = 'dogs in street'
8
 
9
  sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
10
+ sound_generator.set_generation_params(duration=1.7) # why is generating so long at 14 seconds
11
 
12
  x = sound_generator.generate([txt])[0].detach().cpu().numpy()[0, :]
13
  x /= np.abs(x).max() + 1e-7
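Put together, the updated demo reduces to the call below (mirroring the `+` lines above; only the imports and the final print, which reports the waveform length at AudioGen's 16 kHz output rate, are added here for completeness):

```python
import numpy as np
from audiocraft.audiogen import AudioGen

txt = 'dogs in street'

sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
sound_generator.set_generation_params(duration=1.7)

x = sound_generator.generate([txt])[0].detach().cpu().numpy()[0, :]
x /= np.abs(x).max() + 1e-7                    # peak-normalise
print(x.shape, sound_generator.sample_rate)    # ~1.7 s of mono audio at 16000 Hz
```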
live_api.py CHANGED
@@ -17,7 +17,7 @@ from flask_cors import CORS
17
  from audiocraft.audiogen import AudioGen #, audio_write
18
 
19
  sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
20
- sound_generator.set_generation_params(duration=4)
21
 
22
 
23
  # ====STYLE VECTOR====
@@ -51,11 +51,13 @@ def tts_multi_sentence(scene=None):
51
  x = sound_generator.generate([scene])[0].detach().cpu().numpy()[0, :]
52
 
53
  x /= np.abs(x).max() + 1e-7
54
- # sound_background = audio_write(None,
55
- # sound_background.cpu(),
56
- # 16000, #24000, # Same as styleTTs sample_rate,
57
- # strategy="loudness",
58
- # loudness_compressor=True)
 
 
59
  print(f'Craft Finished for: {scene}\n\n\n\n____{x.shape}')
60
  else:
61
  print(scene, '\nDrop\n')
 
17
  from audiocraft.audiogen import AudioGen #, audio_write
18
 
19
  sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
20
+ sound_generator.set_generation_params(duration=.7)
21
 
22
 
23
  # ====STYLE VECTOR====
 
51
  x = sound_generator.generate([scene])[0].detach().cpu().numpy()[0, :]
52
 
53
  x /= np.abs(x).max() + 1e-7
54
+ # is 16kHz - AUdiogen Fs
55
+ x = audresample.resample(x,
56
+ original_rate=sound_generator.sample_rate, # 16000
57
+ target_rate=24000)[0, :]
58
+
59
+
60
+ #
61
  print(f'Craft Finished for: {scene}\n\n\n\n____{x.shape}')
62
  else:
63
  print(scene, '\nDrop\n')
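The resampling added in `tts_multi_sentence` bridges AudioGen's 16 kHz output to the 24 kHz rate used downstream. A minimal standalone sketch of that call (assumes `import audresample` is present at the top of live_api.py, which this hunk does not show; audresample expects float32 input and returns a [channels, samples] array):

```python
import numpy as np
import audresample

x = np.zeros(16000, dtype=np.float32)          # 1 s of placeholder audio at 16 kHz
y = audresample.resample(x,
                         original_rate=16000,
                         target_rate=24000)[0, :]
print(y.shape)                                 # roughly 24000 samples (1 s at 24 kHz)
```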