revert transformer.py
- audiocraft/lm.py +20 -42
- audiocraft/transformer.py +204 -39
- audiocraft/utils/utils.py +7 -64
- demo.py +2 -2
- live_api.py +8 -6
audiocraft/lm.py
CHANGED
@@ -246,32 +246,29 @@ class LMModel(StreamingModule):
     def _sample_next_token(self,
                            sequence,
                            cfg_conditions,
-                           unconditional_state,
-                           use_sampling=False,
-                           temp: float = 1.0,
-                           top_k: int = 0,
-                           top_p: float = 0.0,
-                           cfg_coef: tp.Optional[float] = None,
-                           two_step_cfg: tp.Optional[bool] = None) -> torch.Tensor:
+                           unconditional_state):
         """self.n_draw"""
         B = sequence.shape[0]
-        cfg_coef = self.cfg_coef if cfg_coef is None else cfg_coef
-        model = self if self._fsdp is None else self._fsdp
-        two_step_cfg = self.two_step_cfg if two_step_cfg is None else two_step_cfg
-        condition_tensors = cfg_conditions
-
+
+        model = self if self._fsdp is None else self._fsdp
+
+        condition_tensors = cfg_conditions
+        # logits = [2, 4, 1, 2048]
         logits = model(
             sequence,  # cond_logits = wav condition
             conditions=[], condition_tensors=condition_tensors)  # uncond_logits already see the text
 
-        # print(f'{logits.shape=} L')
-        logits = logits[0, :, :, :].transpose(1, 0)  # sample expects [1, 4, 2048]
-        # logits = [2, 4, 1, 2048]
-        # print(f'{B=}, {logits.shape=} SAMPLER {top_k=}')
-        next_token = utils.sample_top_k(logits, k=top_k, n_draw=self.n_draw)  # [1, 4, 2048] logits
+        # use cfg
+        # logits = (3 * logits[1, :, :, :] - 2.4 * logits[0, :, :, :]).transpose(1, 0)
+
+        # or use 1 of logits
+        logits = logits[0, :, :, :].transpose(1, 0)  # [2, 4, 1, 2048] -> [1, 4, 2048]
+
+        # print(f'{B=}, {logits.shape=} SAMPLER {top_k=}')
+        next_token = utils.sample_top_k(logits, n_draw=self.n_draw)  # [1, 4, 2048] logits
         return next_token
 
     # GENERATE class revert_codebook_patterns()
@@ -282,15 +279,7 @@ class LMModel(StreamingModule):
                  num_samples = 1,  # THIS IS HOW MANY GENERATIONS - A SAMPLE IS A FULL WAV
                  max_gen_len=256,  # unduplicated sequence length - actual len will be n_draw * maxgenlen
                  use_sampling: bool = True,
-
-                 top_k: int = 250,
-                 top_p: float = 0.0,
-                 cfg_coef: tp.Optional[float] = None,
-                 two_step_cfg: tp.Optional[bool] = None,
-                 remove_prompts: bool = False,
-                 check: bool = False,
-                 callback: tp.Optional[tp.Callable[[int, int], None]] = None,
-                 **kwargs) -> torch.Tensor:
+                 **kwargs):
 
         print(f'{num_samples=}')
         first_param = next(iter(self.parameters()))
@@ -365,32 +354,21 @@ class LMModel(StreamingModule):
             next_token = self._sample_next_token(
                 curr_sequence,
                 cfg_conditions,
-                unconditional_state,
-                use_sampling,
-                temp, top_k, top_p,
-                cfg_coef=cfg_coef,
-                two_step_cfg=two_step_cfg)  # [5, 4, 1]
-            print(f'{next_token.shape=}')
-            # replicate the sequence to hold 5 or more sequences as we generate 5 tokens or more
-
-            # ensure the tokens that should be masked are properly set to special_token_id
-            # as the model never output special_token_id
-            # valid_mask = mask[..., offset:offset+1].expand(B, -1, -1)
-            # next_token[~valid_mask] = self.special_token_id
-            # print(f'{unconditional_state=} \n
-            # print('Set All to Special')
+                unconditional_state)  # [5, 4, 1]
 
+            # RUNS with = 2047 just different of self.special_token_id = 2047 = alwayssingletoken = drill noise
             # special_token_id is filler for CODEBOOK_PATTERN ?
 
             # next_token[:] = self.special_token_id  # seanet.embed torch.embedding does not have this - out of bounds in detokenize
 
             _gen_sequence[..., offset:offset+1] = next_token[0, :, :]  # gen_sequence.shape=torch.Size([1, 4, 39])
 
             duplicate_draw.append(next_token)
 
             prev_offset = offset
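
A minimal sketch of the per-step flow after this change, assuming the shapes given in the inline comments ([2, 4, 1, 2048] stacked conditional/unconditional logits over 4 codebooks); the top-k filtering done by utils.sample_top_k is elided and the variable names are illustrative, not part of the fork's API. It shows how dropping the CFG mix and drawing n_draw candidates per step yields a [n_draw, 4, 1] token block, of which only the first draw is written back into the generated sequence.

import torch

n_draw = 5
logits = torch.randn(2, 4, 1, 2048)               # [cond/uncond, n_q, T=1, card]

# classifier-free guidance would blend the two branches, e.g. the commented-out
# line above (3 * logits[1] - 2.4 * logits[0]); the reverted code keeps one branch
logits = logits[0, :, :, :].transpose(1, 0)       # -> [1, 4, 2048]

probs = torch.softmax(logits, dim=-1).reshape(-1, logits.shape[-1])        # [4, 2048]
next_token = torch.multinomial(probs, num_samples=n_draw).T[:, :, None]    # [5, 4, 1]

# only next_token[0] is written into _gen_sequence at this offset; the other
# draws are appended to duplicate_draw for the elongated (n_draw x) output
print(next_token.shape, next_token[0].shape)      # torch.Size([5, 4, 1]) torch.Size([4, 1])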
audiocraft/transformer.py
CHANGED
@@ -86,6 +86,7 @@ def create_sin_embedding(positions: torch.Tensor, dim: int, max_period: float =
     adim = torch.arange(half_dim, device=positions.device, dtype=dtype).view(1, 1, -1)
     max_period_tensor = torch.full([], max_period, device=positions.device, dtype=dtype)  # avoid sync point
     phase = positions / (max_period_tensor ** (adim / (half_dim - 1)))
+    print('==============CONCAT 3 ============')
     return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
 
 
@@ -177,7 +178,7 @@ class StreamingMultiheadAttention(StreamingModule):
         self.past_context = past_context
         self.memory_efficient = memory_efficient
         self.attention_as_float32 = attention_as_float32
-
+        self.rope = rope
         self.cross_attention = cross_attention
         self.safe_streaming = safe_streaming
         self.num_heads = num_heads
@@ -230,8 +231,41 @@ class StreamingMultiheadAttention(StreamingModule):
                     state_dict[prefix + "mha." + key] = state_dict.pop(prefix + key)
         super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
 
+    def _get_mask(self, current_steps: int, device: torch.device, dtype: torch.dtype):
+        # Return a causal mask, accounting for potentially stored past keys/values
+        # We actually return a bias for the attention score, as this has the same
+        # convention both in the builtin MHA in Pytorch, and Xformers functions.
+        time_dim = _get_attention_time_dimension(self.memory_efficient)
+        if self.memory_efficient:
+            from xformers.ops import LowerTriangularMask
+            if current_steps == 1:
+                # If we only have one step, then we do not need a mask.
+                return None
+            elif 'past_keys' in self._streaming_state:
+                raise RuntimeError("Not supported at the moment")
+            else:
+                # Then we can safely use a lower triangular mask
+                return LowerTriangularMask()
+        if self._streaming_state:
+            past_keys = self._streaming_state['past_keys']
+            past_steps = past_keys.shape[time_dim]
+        else:
+            past_steps = 0
+
+        queries_pos = torch.arange(
+            past_steps, current_steps + past_steps, device=device).view(-1, 1)
+        keys_pos = torch.arange(past_steps + current_steps, device=device).view(1, -1)
+        delta = queries_pos - keys_pos
+        valid = delta >= 0
+        if self.past_context is not None:
+            valid &= (delta <= self.past_context)
+        return torch.where(
+            valid,
+            torch.zeros([], device=device, dtype=dtype),
+            torch.full([], float('-inf'), device=device, dtype=dtype))
+
     def _complete_kv(self, k, v):
         time_dim = _get_attention_time_dimension(self.memory_efficient)
         if self.cross_attention:
             # With cross attention we assume all keys and values
@@ -240,16 +274,15 @@ class StreamingMultiheadAttention(StreamingModule):
             return k, v
         # Complete the key/value pair using the streaming state.
         if self._streaming_state:
-            # print('{self._streaming_state.keys()=}') EMPTY - ALTHOUGH WE HAVE STREAMING STATE
             pk = self._streaming_state['past_keys']
             nk = torch.cat([pk, k], dim=time_dim)
+            print('==============CONCAT 1===============')
             if v is k:
                 nv = nk
             else:
                 pv = self._streaming_state['past_values']
                 nv = torch.cat([pv, v], dim=time_dim)
+            print('==============CONCAT 2================')
         else:
             nk = k
             nv = v
@@ -257,28 +290,35 @@ class StreamingMultiheadAttention(StreamingModule):
         assert nk.shape[time_dim] == nv.shape[time_dim]
         offset = 0
         if self.past_context is not None:
             offset = max(0, nk.shape[time_dim] - self.past_context)
         if self._is_streaming:
             self._streaming_state['past_keys'] = nk[:, offset:]
             if v is not k:
                 self._streaming_state['past_values'] = nv[:, offset:]
             if 'offset' in self._streaming_state:
                 self._streaming_state['offset'] += offset
             else:
                 self._streaming_state['offset'] = torch.tensor(0)
         return nk, nv
 
+    def _apply_rope(self, query: torch.Tensor, key: torch.Tensor):
+        time_dim = _get_attention_time_dimension(self.memory_efficient)
+        # Apply rope embeddings to query and key tensors.
+        assert self.rope is not None
+        if 'past_keys' in self._streaming_state:
+            past_keys_offset = self._streaming_state['past_keys'].shape[1]
+        else:
+            past_keys_offset = 0
+        if 'offset' in self._streaming_state:
+            past_context_offset = int(self._streaming_state['offset'].item())
+        else:
+            past_context_offset = 0
+        streaming_offset = past_context_offset + past_keys_offset
+        return self.rope.rotate_qk(query, key, start=streaming_offset, time_dim=time_dim)
 
     def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
                 key_padding_mask=None, need_weights=False, attn_mask=None,
                 average_attn_weights=True, is_causal=False):
-
-
         assert not is_causal, ("New param added in torch 2.0.1 not supported, "
                                "use the causal args in the constructor.")
 
@@ -292,22 +332,29 @@ class StreamingMultiheadAttention(StreamingModule):
         assert self.causal or self.cross_attention, \
             "Streaming only available for causal or cross attention"
 
+        custom_attn_mask = attn_mask is not None
 
+        if self.causal:
+            assert attn_mask is None
+            # At the moment we specialize only for the self-attention case.
+            assert query.shape[1] == key.shape[1], "Causal only for same length query / key / value"
+            assert value.shape[1] == key.shape[1], "Causal only for same length query / key / value"
+            attn_mask = self._get_mask(query.shape[1], query.device, query.dtype)
 
         if self.custom:
+            # custom implementation
+            assert need_weights is False
+            assert key_padding_mask is None
             if self.cross_attention:
                 # Different queries, keys, values, we have to spit manually the weights
                 # before applying the linear.
                 dim = self.in_proj_weight.shape[0] // 3
+                if self.in_proj_bias is None:
+                    bias_q, bias_k, bias_v = None, None, None
+                else:
+                    bias_q = self.in_proj_bias[:dim]
+                    bias_k = self.in_proj_bias[dim: 2 * dim]
+                    bias_v = self.in_proj_bias[2 * dim:]
                 q = nn.functional.linear(query, self.in_proj_weight[:dim], bias_q)
                 # todo: when streaming, we could actually save k, v and check the shape actually match.
                 k = nn.functional.linear(key, self.in_proj_weight[dim: 2 * dim], bias_k)
@@ -323,31 +370,125 @@ class StreamingMultiheadAttention(StreamingModule):
                 assert value is key, "specialized implementation"
                 projected = nn.functional.linear(query, self.in_proj_weight, self.in_proj_bias)
                 if self.kv_repeat == 1:
                     if time_dim == 2:
                         bound_layout = "b h p t d"
                     else:
                         bound_layout = "b t p h d"
                     packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
                     q, k, v = ops.unbind(packed, dim=2)
+                else:
+                    embed_dim = self.embed_dim
+                    per_head_dim = (embed_dim // self.num_heads)
+                    kv_heads = self.num_heads // self.kv_repeat
+                    q = projected[:, :, :embed_dim]
+                    start = embed_dim
+                    end = start + per_head_dim * kv_heads
+                    k = projected[:, :, start: end]
+                    v = projected[:, :, end:]
+                    q = rearrange(q, f"b t (h d) -> {layout}", h=self.num_heads)
+                    k = rearrange(k, f"b t (h d) -> {layout}", h=kv_heads)
+                    v = rearrange(v, f"b t (h d) -> {layout}", h=kv_heads)
 
+            if self.qk_layer_norm is True:
+                assert self.kv_repeat == 1
+                q, k = [rearrange(x, f"{layout} -> b t (h d)") for x in [q, k]]
+                q = self.q_layer_norm(q)
+                k = self.k_layer_norm(k)
+                q, k = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k]]
+            if self.rope:
+                q, k = self._apply_rope(q, k)
             k, v = self._complete_kv(k, v)
-
-
-
-
-            q, k, v
-
-
+            if self.kv_repeat > 1:
+                k = expand_repeated_kv(k, self.kv_repeat, self.memory_efficient)
+                v = expand_repeated_kv(v, self.kv_repeat, self.memory_efficient)
+            if self.attention_as_float32:
+                q, k, v = [x.float() for x in [q, k, v]]
+            if self.memory_efficient:
+                if custom_attn_mask:
+                    # When using a custom attn mask:
+                    # Move to query's device, repeat for each sample, remove align8 padding
+                    seq_len = query.shape[1]
+                    attn_mask = attn_mask.to(q.dtype)
+                    attn_mask = attn_mask.repeat((q.shape[0], 1, 1, 1))
+                    attn_mask = attn_mask[..., :seq_len, :seq_len]
+
+                p = self.dropout if self.training else 0
+                if _efficient_attention_backend == 'torch':
+                    x = torch.nn.functional.scaled_dot_product_attention(
+                        q, k, v, is_causal=attn_mask is not None, dropout_p=p)
+                else:
+                    x = ops.memory_efficient_attention(q, k, v, attn_mask, p=p)
+            else:
+                # We include the dot product as float32, for consistency
+                # with the other implementations that include that step
+                # as part of the attention. Note that when using `autocast`,
+                # the einsums would be done as bfloat16, but the softmax
+                # would be done as bfloat16, so `attention_as_float32` will
+                # extend a bit the range of operations done in float32,
+                # although this should make no difference.
+                q = q / q.shape[-1] ** 0.5
+                key_layout = layout.replace('t', 'k')
+                query_layout = layout
+                if self._is_streaming and self.safe_streaming and q.device.type == 'cuda':
+                    with torch.autocast(device_type=q.device.type, dtype=torch.float32):
+                        pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
+                else:
+                    pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
+                if attn_mask is not None:
+                    pre_w = pre_w + attn_mask
+                w = torch.softmax(pre_w, dim=-1)
+                w = F.dropout(w, self.dropout, training=self.training).to(v)
+                # Key and value have the same format.
+                x = torch.einsum(f"b h t k, {key_layout} -> {layout}", w, v)
             x = x.to(dtype)
             x = rearrange(x, f"{layout} -> b t (h d)", h=self.num_heads)
             x = self.out_proj(x)
+        else:
+            key, value = self._complete_kv(key, value)
+            if self.attention_as_float32:
+                query, key, value = [x.float() for x in [query, key, value]]
+            x, _ = self.mha(
+                query, key, value, key_padding_mask,
+                need_weights, attn_mask, average_attn_weights)
+            x = x.to(dtype)
+
         return x, None
 
 
 class StreamingTransformerLayer(nn.TransformerEncoderLayer):
+    """TransformerLayer with Streaming / Causal support.
+    This also integrates cross_attention, when passing `cross_attention=True`,
+    rather than having two separate classes like in PyTorch.
 
+    Args:
+        d_model (int): Dimension of the data.
+        num_heads (int): Number of heads.
+        dim_feedforward (int): Intermediate dimension of FF module.
+        dropout (float): Dropout both for MHA and FF.
+        bias_ff (bool): Use bias for FF.
+        bias_attn (bool): Use bias for MHA.
+        causal (bool): Causal mask applied automatically.
+        past_context (int, optional): Receptive field for the causal mask, infinite if None.
+        custom (bool): Use custom MHA implementation, for testing / benchmarking.
+        memory_efficient (bool): Use xformers based memory efficient attention.
+        attention_as_float32 (bool): Perform the attention as float32
+            (especially important with memory_efficient as autocast won't do this automatically).
+        qk_layer_norm (bool): Layer normalization applied to queries and keys before dot product in attention.
+        qk_layer_norm_cross (bool): Same for the cross attention.
+        cross_attention (bool): If True, expect to get secondary input for cross-attention.
+            Cross attention will use the default MHA, as it typically won't require
+            special treatment.
+        layer_scale (float, optional): If not None, LayerScale will be used with
+            the given value as initial scale.
+        rope (`RotaryEmbedding`, optional): Rope embedding to use.
+        attention_dropout (float, optional): If not None, separate the value of the dimension dropout
+            in FFN and of the attention dropout.
+        kv_repeat (int): If > 1, will repeat keys and queries multiple times (need to divide num_heads).
+            This will lead to faster decoding time on A100 or other GPUs with tensorcore.
+        device (torch.device, optional): Device on which to initialize.
+        dtype (torch.dtype, optional): dtype to use.
+        **kwargs: See `nn.TransformerEncoderLayer`.
+    """
     def __init__(self, d_model: int, num_heads: int, dim_feedforward: int = 2048, dropout: float = 0.1,
                  bias_ff: bool = True, bias_attn: bool = True, causal: bool = False,
                  past_context: tp.Optional[int] = None, custom: bool = False,
@@ -495,7 +636,6 @@ class StreamingTransformer(StreamingModule):
         assert positional_embedding in ['sin', 'rope', 'sin_rope']
         self.rope: tp.Optional[RotaryEmbedding] = None
         if self.positional_embedding in ['rope', 'sin_rope']:
-            print('ROPE\nL')
             assert _is_custom(custom, memory_efficient)
             self.rope = RotaryEmbedding(d_model // num_heads, max_period=max_period,
                                         xpos=xpos, scale=positional_scale, device=device)
@@ -523,11 +663,39 @@ class StreamingTransformer(StreamingModule):
                 # backward hook inside of FSDP...
                 layer._magma_checkpointed = True  # type: ignore
 
+    def _apply_layer(self, layer, *args, **kwargs):
+        method = self.checkpointing
+        if method == 'none':
+            return layer(*args, **kwargs)
+        elif method == 'torch':
+            return torch_checkpoint(layer, *args, use_reentrant=False, **kwargs)
+        elif method.startswith('xformers'):
+            from xformers.checkpoint_fairinternal import checkpoint, _get_default_policy
+            if method == 'xformers_default':
+                # those operations will be saved, and not recomputed.
+                # According to Francisco we can get smarter policies but this is a good start.
+                allow_list = [
+                    "xformers.efficient_attention_forward_cutlass.default",
+                    "xformers_flash.flash_fwd.default",
+                    "aten.addmm.default",
+                    "aten.mm.default",
+                ]
+            elif method == 'xformers_mm':
+                # those operations will be saved, and not recomputed.
+                # According to Francisco we can get smarter policies but this is a good start.
+                allow_list = [
+                    "aten.addmm.default",
+                    "aten.mm.default",
+                ]
+            else:
+                raise ValueError(f"xformers checkpointing xformers policy {method} is not known.")
+            policy_fn = _get_default_policy(allow_list)
+            return checkpoint(layer, *args, policy_fn=policy_fn, **kwargs)
+        else:
+            raise ValueError(f"Checkpointing method {method} is unknown.")
 
     def forward(self, x: torch.Tensor, *args, **kwargs):
-
-        # Output x: [2, 1, 1536] how is batch expanded to 2
         B, T, C = x.shape
 
         if 'offsets' in self._streaming_state:
@@ -536,20 +704,17 @@ class StreamingTransformer(StreamingModule):
             offsets = torch.zeros(B, dtype=torch.long, device=x.device)
 
         if self.positional_embedding in ['sin', 'sin_rope']:
-            # print(f'{self.positional_embedding=}\n') 'sin'
            positions = torch.arange(T, device=x.device).view(1, -1, 1)
            positions = positions + offsets.view(-1, 1, 1)
            pos_emb = create_sin_embedding(positions, C, max_period=self.max_period, dtype=x.dtype)
            x = x + self.positional_scale * pos_emb
 
         for layer in self.layers:
-
-            # # kwargs=() kwargs={'cross_attention_src', 'src_mask'}
-            x = layer(x, **kwargs)
+            x = self._apply_layer(layer, x, *args, **kwargs)
 
         if self._is_streaming:
             self._streaming_state['offsets'] = offsets + T
 
         return x
 
     def make_optim_group(self):
@@ -592,4 +757,4 @@ def _verify_xformers_internal_compat():
 
 
 def _is_custom(custom: bool, memory_efficient: bool):
-    return custom or memory_efficient
+    return custom or memory_efficient
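
The restored _get_mask returns an additive bias rather than a boolean mask: 0 where a query may attend and -inf elsewhere, optionally windowed by past_context. Below is a standalone sketch of the non-memory-efficient path, assuming no streaming state; the function name is illustrative only.

import torch

def causal_bias(current_steps, past_steps=0, past_context=None,
                device='cpu', dtype=torch.float32):
    # queries may attend to keys at or before their own position...
    queries_pos = torch.arange(past_steps, current_steps + past_steps, device=device).view(-1, 1)
    keys_pos = torch.arange(past_steps + current_steps, device=device).view(1, -1)
    delta = queries_pos - keys_pos
    valid = delta >= 0
    # ...and, when past_context is set, at most past_context steps back
    if past_context is not None:
        valid &= (delta <= past_context)
    return torch.where(valid,
                       torch.zeros([], device=device, dtype=dtype),
                       torch.full([], float('-inf'), device=device, dtype=dtype))

print(causal_bias(4, past_context=2))
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [-inf, 0., 0., 0.]])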
audiocraft/utils/utils.py
CHANGED
@@ -1,23 +1,11 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-
-from contextlib import contextmanager
-from functools import wraps, lru_cache
 import hashlib
 import json
 import logging
-from pathlib import Path
 import typing as tp
-
 import flashy
 import flashy.distrib
 import omegaconf
 import torch
-from torch.nn.utils.rnn import pad_sequence
 
 
 logger = logging.getLogger(__name__)
@@ -46,13 +34,7 @@ def dict_from_config(cfg: omegaconf.DictConfig) -> dict:
     return dct
 
 
-def random_subset(dataset, max_samples: int, seed: int = 42) -> torch.utils.data.Subset:
-    if max_samples >= len(dataset):
-        return dataset
-
-    generator = torch.Generator().manual_seed(seed)
-    perm = torch.randperm(len(dataset), generator=generator)
-    return torch.utils.data.Subset(dataset, perm[:max_samples].tolist())
 
 
 def get_loader(dataset, num_samples: tp.Optional[int], batch_size: int,
@@ -89,67 +71,28 @@ def get_dataset_from_loader(dataloader):
 
 
 
-def sample_top_k(p, k, n_draw=None):
+def sample_top_k(p, k=250, n_draw=None):
     """
     p probabs 2048 ?
     num_draw : how many tokens to sample (for duplicate elongation)
     """
 
-    p = torch.softmax(p
+    p = torch.softmax(p, dim=-1)  # p/temp
 
 
     top_k_value, i250 = torch.topk(p, k, dim=-1)  # probs: [1, 4, 2048]
+    # print('\n_____TOPK________\n', top_k_value.shape, top_k_value[0, 0, :10], '\n___________END_TOPK____________\n')
     min_value_top_k = top_k_value[..., [-1]]  #
     p *= (p >= min_value_top_k).float()
     p.div_(p.sum(dim=-1, keepdim=True))
     # -- next_token = multinomial(probs, num_samples=num_draw)
+
+    # RESHAPED into bs, 4, 250
     p_ = p.reshape(-1, p.shape[-1])
+
+
     out = torch.multinomial(p_,
                             num_samples=n_draw,
                             replacement=False)  # [4, num_draw]
     return out.transpose(0, 1)[:, :, None]  # [num_draw, 4, 1]
-
-
-
-
-
-def length_to_mask(lengths: torch.Tensor, max_len: tp.Optional[int] = None) -> torch.Tensor:
-    """Utility function to convert a tensor of sequence lengths to a mask (useful when working on padded sequences).
-    For example: [3, 5] => [[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]]
-
-    Args:
-        lengths (torch.Tensor): tensor with lengths
-        max_len (int): can set the max length manually. Defaults to None.
-    Returns:
-        torch.Tensor: mask with 0s where there is pad tokens else 1s
-    """
-    assert len(lengths.shape) == 1, "Length shape should be 1 dimensional."
-    final_length = lengths.max().item() if not max_len else max_len
-    final_length = max(final_length, 1)  # if all seqs are of len zero we don't want a zero-size tensor
-    return torch.arange(final_length, device=lengths.device)[None, :] < lengths[:, None]
-
-
-def collate(tensors: tp.List[torch.Tensor], dim: int = 0) -> tp.Tuple[torch.Tensor, torch.Tensor]:
-    """Get a list of tensors and collate them to a single tensor. according to the following logic:
-    - `dim` specifies the time dimension which will be stacked and padded.
-    - The output will contain 1 new dimension (dimension index 0) which will be the size of
-      of the original list.
-
-    Args:
-        tensors (tp.List[torch.Tensor]): List of tensors to collate.
-        dim (int): Dimension which will be stacked and padded.
-    Returns:
-        tp.Tuple[torch.Tensor, torch.Tensor]:
-            torch.Tensor: Stacked and padded tensor. The output will contain 1 new dimension
-                (dimension index 0) which will be the size of the original list.
-            torch.Tensor: Tensor containing length of original tensor sizes (without padding).
-    """
-    tensors = [x.transpose(0, dim) for x in tensors]
-    lens = torch.LongTensor([len(x) for x in tensors])
-    padded_tensors = pad_sequence(tensors)
-    padded_tensors = padded_tensors.transpose(0, 1)
-    padded_tensors = padded_tensors.transpose(1, dim + 1)
-    return padded_tensors, lens
-
-
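
A usage sketch for the patched sample_top_k, assuming this fork is importable as audiocraft.utils.utils; the input is raw logits for 4 codebooks over a card of 2048, and k now defaults to 250.

import torch
from audiocraft.utils.utils import sample_top_k

logits = torch.randn(1, 4, 2048)                 # [1, n_q, card], as produced in lm.py
tokens = sample_top_k(logits, n_draw=3)          # k defaults to 250 -> [3, 4, 1]
print(tokens.shape)                              # torch.Size([3, 4, 1])

# each draw should fall inside the top-250 ids of its codebook
top250 = torch.topk(logits, 250, dim=-1).indices[0]                          # [4, 250]
print(all(tokens[d, q, 0] in top250[q] for d in range(3) for q in range(4)))  # True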
demo.py
CHANGED
@@ -4,10 +4,10 @@ import numpy as np
 
 print('\n\n\n\n___________________')
 
-txt = '
+txt = 'dogs in street'
 
 sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
-sound_generator.set_generation_params(duration
+sound_generator.set_generation_params(duration=1.7)  # why is generating so long at 14 seconds
 
 x = sound_generator.generate([txt])[0].detach().cpu().numpy()[0, :]
 x /= np.abs(x).max() + 1e-7
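
A possible continuation of the demo, assuming soundfile is installed (it is not part of this repo): write the normalised mono waveform to disk at AudioGen's native rate.

import numpy as np
import soundfile as sf   # assumption: soundfile is not a dependency of this repo

# x is the normalised mono waveform produced above; sample_rate is 16000 for
# facebook/audiogen-medium (see the comment added in live_api.py)
sf.write('dogs_in_street.wav', x.astype(np.float32), samplerate=sound_generator.sample_rate)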
live_api.py
CHANGED
@@ -17,7 +17,7 @@ from flask_cors import CORS
 from audiocraft.audiogen import AudioGen  #, audio_write
 
 sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
-sound_generator.set_generation_params(duration
+sound_generator.set_generation_params(duration=.7)
 
 
 # ====STYLE VECTOR====
@@ -51,11 +51,13 @@ def tts_multi_sentence(scene=None):
         x = sound_generator.generate([scene])[0].detach().cpu().numpy()[0, :]
 
         x /= np.abs(x).max() + 1e-7
-        #
-
-
-
-
+        # is 16kHz - AUdiogen Fs
+        x = audresample.resample(x,
+                                 original_rate=sound_generator.sample_rate,  # 16000
+                                 target_rate=24000)[0, :]
+
+        #
         print(f'Craft Finished for: {scene}\n\n\n\n____{x.shape}')
     else:
         print(scene, '\nDrop\n')
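
A quick standalone check of the added resampling step, assuming audresample is installed; a one-second synthetic tone stands in for the generated audio, and the output length grows by the 24000/16000 ratio.

import numpy as np
import audresample

x = np.sin(2 * np.pi * 440 * np.arange(16000) / 16000).astype(np.float32)   # 1 s at 16 kHz
y = audresample.resample(x, original_rate=16000, target_rate=24000)[0, :]   # same call as in the diff
print(x.shape, y.shape)   # (16000,) (24000,)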