DEBUG: cross_attention_src = query or key?
Files changed:
- audiocraft/audiogen.py      +13  −47
- audiocraft/conditioners.py   +7  −48
- audiocraft/lm.py            +96 −112
- audiocraft/streaming.py      +0 −131
- audiocraft/transformer.py  +137 −394
- demo.py                      +1   −1
audiocraft/audiogen.py  CHANGED

@@ -4,11 +4,6 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""
-Main model for using AudioGen. This will combine all the required components
-and provide easy access to the generation API.
-"""
-
 import typing as tp
 import torch
 from audiocraft.loaders import load_compression_model, load_lm_model
@@ -87,51 +82,25 @@ class BaseGenModel(ABC):
         """Sample rate of the generated audio."""
         return self.compression_model.sample_rate
 
-    @property
-    def audio_channels(self) -> int:
-        """Audio channels of the generated audio."""
-        return self.compression_model.channels
-
-    @torch.no_grad()
-    def _prepare_tokens_and_attributes(
-            self,
-            descriptions,
-            prompt,
-    ):
-        attributes = [
-            ConditioningAttributes(text={'description': description}) for description in descriptions]
-        prompt_tokens = None
-        return attributes, prompt_tokens
-
-    def generate_unconditional(self,
-                               num_samples,
-                               progress=False,
-                               return_tokens=False):
-        descriptions: tp.List[tp.Optional[str]] = [None] * num_samples
-        attributes, _ = self._prepare_tokens_and_attributes(descriptions, None)
-        tokens = self._generate_tokens(attributes)
-        if return_tokens:
-            return self.generate_audio(tokens), tokens
-        return self.generate_audio(tokens)
 
-
-        attributes
+    def generate(self, descriptions):
+        attributes = [
+            ConditioningAttributes(text={'description': d}) for d in descriptions]
         tokens = self._generate_tokens(attributes)
-        if return_tokens:
-            return self.generate_audio(tokens), tokens
         return self.generate_audio(tokens)
 
-    def _generate_tokens(self, attributes
-                         prompt_tokens=None,
-                         progress=False):
+    def _generate_tokens(self, attributes):
 
         total_gen_len = int(self.duration * self.frame_rate)
-        max_prompt_len = int(min(self.duration, self.max_duration) * self.frame_rate)
-        current_gen_offset: int = 0
+
+        # # print(f'{self.generation_params=}')
+        # self.generation_params={'use_sampling': True,
+        #                         'temp': 1.0, 'top_k': 250,
+        #                         'top_p': 0.0, 'cfg_coef': 2.4, 'two_step_cfg': False}
 
 
@@ -140,10 +109,7 @@ class BaseGenModel(ABC):
         # generate by sampling from LM, simple case.
 
         with self.autocast:
-            gen_tokens = self.lm.generate(conditions=attributes,
-                                          callback=None,
-                                          max_gen_len=total_gen_len,
-                                          **self.generation_params)
+            gen_tokens = self.lm.generate(conditions=attributes, max_gen_len=total_gen_len)
         else:
             print('<>Long gen ?<>')
             # print(f'{gen_tokens.shape=}')  # [5,4,35]
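The trimmed-down BaseGenModel path above boils down to: wrap each text prompt in a ConditioningAttributes, sample tokens from the LM inside the autocast context, and decode them to audio. A minimal sketch of that flow, assuming ConditioningAttributes is importable from audiocraft.conditioners as this fork's file layout suggests; the model argument is a placeholder for the loaded AudioGen wrapper:

import torch
from audiocraft.conditioners import ConditioningAttributes  # assumed location in this fork

@torch.no_grad()
def generate(model, descriptions):
    # One ConditioningAttributes per text prompt, exactly as in the new generate().
    attributes = [ConditioningAttributes(text={'description': d}) for d in descriptions]
    total_gen_len = int(model.duration * model.frame_rate)
    with model.autocast:
        gen_tokens = model.lm.generate(conditions=attributes, max_gen_len=total_gen_len)
    return model.generate_audio(gen_tokens)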
audiocraft/conditioners.py  CHANGED

@@ -8,7 +8,7 @@ import soundfile
 from transformers import T5EncoderModel, T5Tokenizer  # type: ignore
 import torch
 from torch import nn
-
+
 
 from .utils.autocast import TorchAutocast
 
@@ -126,17 +126,7 @@ class BaseConditioner(nn.Module):
         """
         raise NotImplementedError()
 
-
-        """Gets input that should be used as conditioning (e.g, genre, description or a waveform).
-        Outputs a ConditionType, after the input data was embedded as a dense vector.
-
-        Returns:
-            ConditionType:
-                - A tensor of size [B, T, D] where B is the batch size, T is the length of the
-                  output embedding and D is the dimension of the embedding.
-                - And a mask indicating where the padding tokens.
-        """
-        raise NotImplementedError()
+
 
 
 class TextConditioner(BaseConditioner):
@@ -239,6 +229,9 @@ class T5Conditioner(TextConditioner):
         embeds = self.t5(**inputs).last_hidden_state
         embeds = self.output_proj(embeds.to(self.output_proj.weight))
         embeds = (embeds * mask.unsqueeze(-1))
+
+        # T5 torch.Size([2, 4, 1536]) dict_keys(['input_ids', 'attention_mask'])
+        # print(f'{inputs["input_ids"].shape=}')  # inputs["input_ids"].shape=torch.Size([2, 4])
         return embeds, mask
 
 
@@ -352,21 +345,8 @@ class ConditioningProvider(nn.Module):
 
 
 
-class ConditionFuser(
-    """Condition fuser handles the logic to combine the different conditions
-    to the actual model input.
+class ConditionFuser(nn.Module):
 
-    Args:
-        fuse2cond (tp.Dict[str, str]): A dictionary that says how to fuse
-            each condition. For example:
-            {
-                "prepend": ["description"],
-                "sum": ["genre", "bpm"],
-                "cross": ["description"],
-            }
-        cross_attention_pos_emb (bool, optional): Use positional embeddings in cross attention.
-        cross_attention_pos_emb_scale (int): Scale for positional embeddings in cross attention if used.
-    """
     FUSING_METHODS = ["sum", "prepend", "cross", "input_interpolate"]
 
     def __init__(self, fuse2cond: tp.Dict[str, tp.List[str]], cross_attention_pos_emb: bool = False,
@@ -387,25 +367,4 @@ class ConditionFuser(StreamingModule):
             self,
             input,
             conditions):
-
-        B, T, _ = input.shape
-
-
-        first_step = True
-        offsets = torch.zeros(input.shape[0], dtype=torch.long, device=input.device)
-
-
-        cross_attention_output = None
-        for cond_type, (cond, cond_mask) in conditions.items():
-            # print(f'{self.cond2fuse=}')  - self.cond2fuse={'description': 'cross'}
-
-            cross_attention_output = cond
-            # print(f'{cross_attention_output.shape=} for {input.sum()=}')
-            # cross_attention_output.shape=torch.Size([2, 5, 1536]) for input.sum()=tensor(-0.0650, device='cuda:0')
-            # cross_attention_output.shape=torch.Size([2, 5, 1536]) for input.sum()=tensor(3.7672, device='cuda:0')
-
-
-        if self._is_streaming:
-            self._streaming_state['offsets'] = offsets + T
-
-        return input, cross_attention_output
+        return input, conditions['description'][0]  # cross_attention_output
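ConditionFuser.forward is now a pass-through: it returns the input unchanged plus the embedded 'description' condition, which becomes cross_attention_src in the LM. A small sketch of the shapes involved, using random stand-in tensors with the sizes from the debug prints above:

import torch

# conditions maps condition name -> (embedding, mask), as produced by ConditioningProvider.
B, T_text, D = 2, 5, 1536          # shapes from the debug prints above
conditions = {'description': (torch.randn(B, T_text, D), torch.ones(B, T_text))}
input_ = torch.randn(B, 1, D)      # summed codebook embeddings for the current step

# New fuser behaviour: pass input through, expose the description embedding
# so the transformer can use it as cross_attention_src (keys/values).
output, cross_attention_src = input_, conditions['description'][0]
assert cross_attention_src.shape == (B, T_text, D)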
audiocraft/lm.py  CHANGED

@@ -6,7 +6,6 @@ import re
 import typing as tp
 import torch
 import torch.nn.functional as F
-from audiocraft.streaming import StreamingModule
 from audiocraft.transformer import StreamingTransformer, create_norm_fn
 from dataclasses import dataclass
 from functools import partial
@@ -109,7 +108,7 @@ class LMOutput:
     mask: torch.Tensor  # [B, K, T]
 
 
-class LMModel(
+class LMModel(nn.Module):
     """Transformer-based language model on multiple streams of codes.
 
     Args:
@@ -148,7 +147,7 @@ class LMModel(StreamingModule):
         super().__init__()
        self.cfg_coef = cfg_coef
 
-        self.n_draw =
+        self.n_draw = 5
         self.condition_provider = condition_provider
         self.fuser = fuser
         self.card = card  # 2048 ?
@@ -160,9 +159,26 @@ class LMModel(StreamingModule):
         self.emb = nn.ModuleList([ScaledEmbedding(embed_dim, dim, lr=emb_lr) for _ in range(n_q)])
         if 'activation' in kwargs:
             kwargs['activation'] = get_activation_fn(kwargs['activation'])
+        # ========================================================================
+        # {
+        #  'dtype': torch.float16, 'device': 'cuda',
+        #  'num_layers': 48, 'dropout': 0.0, 'activation': 'gelu',
+        #  'bias_ff': False, 'bias_attn': False,
+        #  'past_context': None, 'causal': True,
+        #  'custom': False, 'memory_efficient': True,
+        #  'attention_as_float32': False, 'positional_embedding': 'sin', 'xpos': False,
+        #  'checkpointing': 'none', 'cross_attention': True, 'qk_layer_norm': False,
+        #  'qk_layer_norm_cross': False, 'attention_dropout': None, 'kv_repeat': 1
+        # }
+        # ==========================================================================
+        kwargs.pop('layer_scale')  # nn.Indentity()
+
         self.transformer = StreamingTransformer(
-            d_model=dim,
-
+            d_model=dim,
+            num_heads=num_heads,
+            dim_feedforward=int(hidden_scale * dim),
+            norm=norm,
+            norm_first=norm_first, **kwargs)
         self.out_norm: tp.Optional[nn.Module] = None
         if norm_first:
             self.out_norm = create_norm_fn(norm, dim)
@@ -199,7 +215,10 @@ class LMModel(StreamingModule):
                 depth = layer_idx + 1
             elif depthwise_init == 'global':
                 depth = len(self.transformer.layers)
-            init_fn = partial(init_layer,
+            init_fn = partial(init_layer,
+                              method=weight_init,
+                              init_depth=depth,
+                              zero_bias_init=zero_bias_init)
             tr_layer.apply(init_fn)
 
         for linear in self.linears:
@@ -215,91 +234,55 @@ class LMModel(StreamingModule):
 
     def forward(self,
                 sequence,
-                conditions,
                 condition_tensors=None,
                 stage = -1):
-        B, K, S = sequence.shape
-
+        B, K, S = sequence.shape  # linears are n_q
 
         input_ = sum([self.emb[k](sequence[:, k]) for k in range(K)])
 
 
-        input_, cross_attention_input = self.fuser(input_, condition_tensors)
-
-        # print(f'{input_.shape=} {cross_attention_input.shape=} FUSER LLM
-
+        # input_, cross_attention_input = self.fuser(input_, condition_tensors)
+        cross_attention_input = condition_tensors['description'][0]
+        # print(f'{input_.shape=} {cross_attention_input.shape=} FUSER LLM')
+
 
         out = self.transformer(input_, cross_attention_src=cross_attention_input,
                                src_mask=(self.attn_mask_per_stage[stage] if stage >= 0 else None))
         if self.out_norm:
             out = self.out_norm(out)
+        # K = 2 because of llm producing 2 tokens?
+        # so only 2 x sel.flinear() of 4 are used ?
+        # WHy torch.stack is in dim=1
         logits = torch.stack([self.linears[k](out) for k in range(K)], dim=1)  # [B, K, S, card]
-
+        print(f'{input_.shape=} {out.shape=} {cross_attention_input.shape=} {logits.shape=} FUSER LLM')
         # remove the prefix from the model outputs
-        if len(self.fuser.fuse2cond['prepend']) > 0:
-
-
+        # if len(self.fuser.fuse2cond['prepend']) > 0:
+        #     logits = logits[:, :, -S:]
+        #     print('==========================================PRESFIX')
 
         return logits  # [B, K, S, card]
 
 
-    def _sample_next_token(self,
-                           sequence,
-                           cfg_conditions,
-                           unconditional_state):
-        """self.n_draw"""
-        B = sequence.shape[0]
-
-        model = self if self._fsdp is None else self._fsdp
-
-        condition_tensors = cfg_conditions
-        # logits = [2, 4, 1, 2048]
-        logits = model(
-            sequence,  # cond_logits = wav condition
-            conditions=[], condition_tensors=condition_tensors)  # uncond_logits already see the text
-
-
-        # use cfg
-        # logits = (3 * logits[1, :, :, :] - 2.4 * logits[0, :, :, :]).transpose(1,0)
-
-        # or use 1 of logits
-        logits = logits[0, :, :, :].transpose(1,0)  # [2,4,1, 2048] -> [1,4,2048]
-
-
-        # print(f'{B=}, {logits.shape=} SAMPLER {top_k=}')
-        next_token = utils.sample_top_k(logits, n_draw=self.n_draw)  # [1,4,2048] logits
-        return next_token
-
     # GENERATE class revert_codebook_patterns()
     @torch.no_grad()
     def generate(self,
                  prompt = None,
                  conditions = [],
-                 num_samples = 1,
-                 max_gen_len=256
-                 use_sampling: bool = True,
-                 **kwargs):
+                 num_samples = 1,  # N next token
+                 max_gen_len=256):
 
-        print(f'{
+        print(f'{prompt=} {conditions=}')
         first_param = next(iter(self.parameters()))
         device = first_param.device
 
-
-        # we then do 1 forward pass instead of 2.
-        # the reason for that is two-fold:
-        # 1. it is about x2 faster than doing 2 forward passes
-        # 2. avoid the streaming API treating the 2 passes as part of different time steps
-        # We also support doing two different passes, in particular to ensure that
-        # the padding structure is exactly the same between train and test.
-        # With a batch size of 1, this can be slower though.
-        cfg_conditions: CFGConditions
-        # two_step_cfg = self.two_step_cfg if two_step_cfg is None else two_step_cfg
-
-        null_conditions = conditions
-        conditions = conditions + null_conditions
+
+
         tokenized = self.condition_provider.tokenize(conditions)
+        # print('TOKENIZ', tokenized)  # 'description'
+        # TOKENIZ {'description': {'input_ids': tensor([[3887,   16, 2815,    1],
+        #         [3887,   16, 2815,    1]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1],
+        #         [1, 1, 1, 1]], device='cuda:0')}}
+
         cfg_conditions = self.condition_provider(tokenized)
 
 
@@ -326,58 +309,59 @@ class LMModel(StreamingModule):
 
 
 
-
-
-
+
+
+        # --
+        # print(mask.shape, mask.sum(), 'MSK LM')
+        # torch.Size([4, 39]) tensor(140, device='cuda:0') MSK LM ? Fully 1 normal no special token
+        # --\
 
-        # --
-        #
-        #
-        #
-
-
-        #
+        # list - Elongation for take-5 next tokens - n_draw 5 tokens at each time-step
+        # append them at end of sequence
+        duplicate_draw = [
+            _gen_sequence[:, :, 0:1].repeat(self.n_draw, 1, 1)
+        ]
+
+
-        for offset in range(1, gen_sequence_len):  # start_offset_sequence=1
-            # print(f'{_gen_sequence.shape=}')  # [1,4,16]
-            # starts from 1 not 0 thus uses the 0:1 as curr sequence
-            # although this is empty contains -1 ?
+        for offset in range(1, _gen_sequence.shape[2]):  # gen_sequence shape is [B, K, S])
+            # print(f'{_gen_sequence.shape=}')  # [1,4,16]
+            # starts from 1 not 0 thus uses the 0:1 as curr sequence
+            # although this is empty contains -1 ?
 
-
-
-            next_token = self._sample_next_token(
-                curr_sequence,
-                cfg_conditions,
-                unconditional_state)  # [5, 4, 1]
-
-
-
-
-            # RUNS with = 2047 just different of self.special_token_id = 2047 = alwayssingletoken = drill noise
-            # special_token_id is filler for CODEBOOK_PATTERN ?
-
-            # next_token[:] = self.special_token_id  # seanet.embed torch.embedding does not have this - out of bounds in detokenize
-
-            _gen_sequence[..., offset:offset+1] = next_token[0, :, :]  # gen_sequence.shape=torch.Size([1, 4, 39])
-
-            duplicate_draw.append(next_token)
-
-            prev_offset = offset
-
-
-        unconditional_state.clear()
-
-        gen_sequence = torch.cat(duplicate_draw, 2)  # [self.n_draw, 4, len_seq]
+            # ====================== SAMPLE NEXT TOK
+            # next_token = self._sample_next_token(
+            #     _gen_sequence[..., :offset],
+            #     cfg_conditions)  # [5, 4, 1]
+            # --
+            # def _sample_next_token(self,
+            #                        sequence,
+            #                        cfg_conditions):
+            model = self if self._fsdp is None else self._fsdp
+
+            logits = model(_gen_sequence[..., :offset],
+                           condition_tensors=cfg_conditions)
+            # print(logits.shape, 'Next Logits')  # [1, 4, 2, 2048] why 2 tokens on query
+
+            # use cfg
+            # logits = (3 * logits[1, :, :, :] - 2.4 * logits[0, :, :, :]).transpose(1,0)
+
+            # or use 1 of logits
+            logits = logits[0, :, 0:1, :]  # [1,4,2048]
+            next_token = utils.sample_top_k(logits, n_draw=self.n_draw)  # [1,4,2048] logits
+            # =================================
+
+
+            _gen_sequence[:, :, offset] = next_token[0, :, 0]  # gen_sequence.shape=torch.Size([1, 4, 39])
+
+            duplicate_draw.append(next_token)
 
+
+
+        gen_sequence = torch.cat(duplicate_draw, 2)  # RESHAPE -> N_DRAW -> TIME
 
         # revert codes as "batch"
 
@@ -415,4 +399,4 @@ class LMModel(StreamingModule):
 
 
 
-        return out_codes  #
+        return out_codes  #
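In the rewritten generate() loop, each step calls utils.sample_top_k with n_draw, writes only the first draw back into _gen_sequence, and keeps all draws so they can be concatenated along the time axis afterwards. The fork's own sample_top_k (with the n_draw argument) is not shown in this diff, so the sketch below is an illustrative reimplementation of the idea, not the actual utility:

import torch

def sample_top_k(logits: torch.Tensor, k: int = 250, n_draw: int = 5) -> torch.Tensor:
    """Illustrative top-k sampler: logits [K, 1, card] -> tokens [n_draw, K, 1]."""
    K, T, card = logits.shape
    topk_vals, topk_idx = logits.topk(k, dim=-1)               # keep the k best logits per codebook
    probs = torch.softmax(topk_vals, dim=-1)                   # renormalise over the k survivors
    flat = probs.reshape(K * T, k)
    draws = torch.multinomial(flat, num_samples=n_draw, replacement=True)   # [K*T, n_draw]
    tokens = torch.gather(topk_idx.reshape(K * T, k), 1, draws)             # map back to vocabulary ids
    return tokens.reshape(K, T, n_draw).permute(2, 0, 1)       # [n_draw, K, T]

# One decoding step as in the loop above: 4 codebooks, one time-step, 2048-way vocabulary.
logits = torch.randn(4, 1, 2048)
next_token = sample_top_k(logits, k=250, n_draw=5)   # [5, 4, 1]; only next_token[0] re-enters the sequence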
audiocraft/streaming.py  DELETED

@@ -1,131 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-"""
-Streaming module API that should be implemented by all Streaming components,
-"""
-
-from contextlib import contextmanager
-import typing as tp
-from torch import nn
-import torch
-
-
-State = tp.Dict[str, torch.Tensor]
-
-
-class StreamingModule(nn.Module):
-    """Common API for streaming components.
-
-    Each streaming component has a streaming state, which is just a dict[str, Tensor].
-    By convention, the first dim of each tensor must be the batch size.
-    Don't use dots in the key names, as this would clash with submodules
-    (like in state_dict).
-
-    If `self._is_streaming` is True, the component should use and remember
-    the proper state inside `self._streaming_state`.
-
-    To set a streaming component in streaming state, use
-
-        with module.streaming():
-            ...
-
-    This will automatically reset the streaming state when exiting the context manager.
-    This also automatically propagates to all streaming children module.
-
-    Some module might also implement the `StreamingModule.flush` method, although
-    this one is trickier, as all parents module must be StreamingModule and implement
-    it as well for it to work properly. See `StreamingSequential` after.
-    """
-    def __init__(self) -> None:
-        super().__init__()
-        self._streaming_state: State = {}
-        self._is_streaming = False
-
-    def _apply_named_streaming(self, fn: tp.Any):
-        for name, module in self.named_modules():
-            if isinstance(module, StreamingModule):
-                fn(name, module)
-
-    def _set_streaming(self, streaming: bool):
-        def _set_streaming(name, module):
-            module._is_streaming = streaming
-        self._apply_named_streaming(_set_streaming)
-
-    @contextmanager
-    def streaming(self):
-        """Context manager to enter streaming mode. Reset streaming state on exit."""
-        self._set_streaming(True)
-        try:
-            yield
-        finally:
-            self._set_streaming(False)
-            self.reset_streaming()
-
-    def reset_streaming(self):
-        """Reset the streaming state."""
-        def _reset(name: str, module: StreamingModule):
-            module._streaming_state.clear()
-
-        self._apply_named_streaming(_reset)
-
-    def get_streaming_state(self) -> State:
-        """Return the streaming state, including that of sub-modules."""
-        state: State = {}
-
-        def _add(name: str, module: StreamingModule):
-            if name:
-                name += "."
-            for key, value in module._streaming_state.items():
-                state[name + key] = value
-
-        self._apply_named_streaming(_add)
-        return state
-
-    def set_streaming_state(self, state: State):
-        """Set the streaming state, including that of sub-modules."""
-        state = dict(state)
-
-        def _set(name: str, module: StreamingModule):
-            if name:
-                name += "."
-            module._streaming_state.clear()
-            for key, value in list(state.items()):
-                # complexity is not ideal here, but probably fine.
-                if key.startswith(name):
-                    local_key = key[len(name):]
-                    if '.' not in local_key:
-                        module._streaming_state[local_key] = value
-                        del state[key]
-
-        self._apply_named_streaming(_set)
-        assert len(state) == 0, list(state.keys())
-
-    def flush(self, x: tp.Optional[torch.Tensor] = None):
-        """Flush any remaining outputs that were waiting for completion.
-        Typically, for convolutions, this will add the final padding
-        and process the last buffer.
-
-        This should take an optional argument `x`, which will be provided
-        if a module before this one in the streaming pipeline has already
-        spitted out a flushed out buffer.
-        """
-        if x is None:
-            return None
-        else:
-            return self(x)
-
-
-class StreamingSequential(StreamingModule, nn.Sequential):
-    """A streaming compatible alternative of `nn.Sequential`.
-    """
-    def flush(self, x: tp.Optional[torch.Tensor] = None):
-        for module in self:
-            if isinstance(module, StreamingModule):
-                x = module.flush(x)
-            elif x is not None:
-                x = module(x)
-        return x
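With streaming.py deleted, every class that previously inherited from StreamingModule (LMModel, ConditionFuser, StreamingTransformer, StreamingMultiheadAttention) now derives from plain nn.Module, and the caller-side pattern described in the deleted docstring goes away. For reference, that pattern was the context manager shown below; model and chunks are placeholders:

import torch

# Usage pattern of the removed StreamingModule API, as described in its docstring.
def stream_decode(model, chunks):
    outputs = []
    with model.streaming():                # enter streaming mode; state is reset automatically on exit
        for chunk in chunks:
            outputs.append(model(chunk))   # each call may read/update model._streaming_state
    return torch.cat(outputs, dim=-1)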
audiocraft/transformer.py
CHANGED
@@ -1,30 +1,10 @@
|
|
1 |
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
-
# All rights reserved.
|
3 |
-
#
|
4 |
-
# This source code is licensed under the license found in the
|
5 |
-
# LICENSE file in the root directory of this source tree.
|
6 |
-
|
7 |
-
"""
|
8 |
-
Transformer model, with streaming support, xformer attention support
|
9 |
-
and easy causal attention with a potentially finite receptive field.
|
10 |
-
|
11 |
-
See `StreamingTransformer` for more information.
|
12 |
-
|
13 |
-
Unlike regular PyTorch Transformer, we make the hard choice that batches are first.
|
14 |
-
"""
|
15 |
-
|
16 |
import typing as tp
|
17 |
-
|
18 |
from einops import rearrange
|
19 |
import torch
|
20 |
import torch.nn as nn
|
21 |
from torch.nn import functional as F
|
22 |
-
from torch.utils.checkpoint import checkpoint as torch_checkpoint
|
23 |
from xformers import ops
|
24 |
|
25 |
-
from .rope import RotaryEmbedding
|
26 |
-
from .streaming import StreamingModule
|
27 |
-
|
28 |
_efficient_attention_backend: str = 'torch'
|
29 |
|
30 |
|
@@ -35,14 +15,10 @@ def set_efficient_attention_backend(backend: str = 'torch'):
|
|
35 |
_efficient_attention_backend = backend
|
36 |
|
37 |
|
38 |
-
def _get_attention_time_dimension(memory_efficient: bool) -> int:
|
39 |
-
if _efficient_attention_backend == 'torch' and memory_efficient:
|
40 |
-
return 2
|
41 |
-
else:
|
42 |
-
return 1
|
43 |
|
44 |
|
45 |
-
|
|
|
46 |
# Return true if we are currently running with a xformers profiler activated.
|
47 |
try:
|
48 |
from xformers.profiler import profiler
|
@@ -51,16 +27,8 @@ def _is_profiled() -> bool:
|
|
51 |
return profiler._Profiler._CURRENT_PROFILER is not None
|
52 |
|
53 |
|
54 |
-
def create_norm_fn(norm_type
|
55 |
-
"""Create normalization module for transformer encoder layer.
|
56 |
|
57 |
-
Args:
|
58 |
-
norm_type (str): Normalization method.
|
59 |
-
dim (int): Dimension of the normalized layer.
|
60 |
-
**kwargs (dict): Additional parameters for normalization layer.
|
61 |
-
Returns:
|
62 |
-
nn.Module: Normalization module.
|
63 |
-
"""
|
64 |
if norm_type == 'layer_norm':
|
65 |
return nn.LayerNorm(dim, eps=1e-5, **kwargs)
|
66 |
else:
|
@@ -86,87 +54,26 @@ def create_sin_embedding(positions: torch.Tensor, dim: int, max_period: float =
|
|
86 |
adim = torch.arange(half_dim, device=positions.device, dtype=dtype).view(1, 1, -1)
|
87 |
max_period_tensor = torch.full([], max_period, device=positions.device, dtype=dtype) # avoid sync point
|
88 |
phase = positions / (max_period_tensor ** (adim / (half_dim - 1)))
|
89 |
-
print('==============CONCAT 3 ============'
|
90 |
-
return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
|
91 |
|
92 |
|
93 |
-
def expand_repeated_kv(x: torch.Tensor, n_rep: int, memory_efficient: bool) -> torch.Tensor:
|
94 |
-
"""torch.repeat_interleave(x, dim=2, repeats=n_rep) from xlformers."""
|
95 |
-
if n_rep == 1:
|
96 |
-
return x
|
97 |
-
if _efficient_attention_backend == 'torch' and memory_efficient:
|
98 |
-
bs, n_kv_heads, slen, head_dim = x.shape
|
99 |
-
return (
|
100 |
-
x[:, :, None, :, :]
|
101 |
-
.expand(bs, n_kv_heads, n_rep, slen, head_dim)
|
102 |
-
.reshape(bs, n_kv_heads * n_rep, slen, head_dim)
|
103 |
-
)
|
104 |
-
else:
|
105 |
-
bs, slen, n_kv_heads, head_dim = x.shape
|
106 |
-
return (
|
107 |
-
x[:, :, :, None, :]
|
108 |
-
.expand(bs, slen, n_kv_heads, n_rep, head_dim)
|
109 |
-
.reshape(bs, slen, n_kv_heads * n_rep, head_dim)
|
110 |
-
)
|
111 |
|
112 |
|
113 |
-
class LayerScale(nn.Module):
|
114 |
-
"""Layer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf).
|
115 |
-
This rescales diagonally the residual outputs close to 0, with a learnt scale.
|
116 |
|
117 |
-
Args:
|
118 |
-
channels (int): Number of channels.
|
119 |
-
init (float): Initial scale.
|
120 |
-
channel_last (bool): If True, expect `[*, C]` shaped tensors, otherwise, `[*, C, T]`.
|
121 |
-
device (torch.device or str, optional): Device on which to initialize the module.
|
122 |
-
dtype (torch.dtype, optional): dtype to use to initialize the module.
|
123 |
-
"""
|
124 |
-
def __init__(self, channels: int, init: float = 1e-4, channel_last: bool = True,
|
125 |
-
device=None, dtype=None):
|
126 |
-
super().__init__()
|
127 |
-
self.channel_last = channel_last
|
128 |
-
self.scale = nn.Parameter(
|
129 |
-
torch.full((channels,), init,
|
130 |
-
requires_grad=True, device=device, dtype=dtype))
|
131 |
-
|
132 |
-
def forward(self, x: torch.Tensor):
|
133 |
-
if self.channel_last:
|
134 |
-
return self.scale * x
|
135 |
-
else:
|
136 |
-
return self.scale[:, None] * x
|
137 |
|
138 |
|
139 |
-
class StreamingMultiheadAttention(StreamingModule):
|
140 |
-
"""Similar to `nn.MultiheadAttention` but with support for streaming, causal evaluation.
|
141 |
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
past_context (int, optional): Receptive field for the causal mask, infinite if None.
|
149 |
-
custom (bool): Use custom MHA implementation, for testing / benchmarking.
|
150 |
-
memory_efficient (bool): Use xformers based memory efficient attention.
|
151 |
-
attention_as_float32 (bool): Perform the attention as float32
|
152 |
-
(especially important with memory_efficient as autocast won't do this automatically).
|
153 |
-
rope (`RotaryEmbedding`, optional): Rope embedding to use.
|
154 |
-
cross_attention: Should be true when used as a cross attention.
|
155 |
-
All keys and values must be available at once, streaming is only for the queries.
|
156 |
-
Cannot be used with `causal` or `rope` (as it wouldn't make sens to
|
157 |
-
interpret the time steps in the keys relative to those in the queries).
|
158 |
-
safe_streaming (bool): Bug fix, will go away with xformers update.
|
159 |
-
qk_layer_norm (bool): Layer normalization applied to queries and keys before dot product.
|
160 |
-
kv_repeat (int): If > 1, will repeat keys and queries multiple times (need to divide num_heads).
|
161 |
-
This will lead to faster decoding time on A100 or other GPUs with tensorcore.
|
162 |
-
device (torch.device, optional): Device on which to initialize.
|
163 |
-
dtype (torch.dtype, optional): dtype to use.
|
164 |
-
"""
|
165 |
-
def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0, bias: bool = True,
|
166 |
causal: bool = False, past_context: tp.Optional[int] = None, custom: bool = False,
|
167 |
memory_efficient: bool = False, attention_as_float32: bool = False,
|
168 |
-
|
169 |
-
|
170 |
device=None, dtype=None):
|
171 |
super().__init__()
|
172 |
factory_kwargs = {'device': device, 'dtype': dtype}
|
@@ -178,15 +85,15 @@ class StreamingMultiheadAttention(StreamingModule):
|
|
178 |
self.past_context = past_context
|
179 |
self.memory_efficient = memory_efficient
|
180 |
self.attention_as_float32 = attention_as_float32
|
181 |
-
|
182 |
self.cross_attention = cross_attention
|
183 |
-
|
184 |
self.num_heads = num_heads
|
185 |
self.dropout = dropout
|
186 |
self.kv_repeat = kv_repeat
|
187 |
if cross_attention:
|
188 |
assert not causal, "Causal cannot work with cross attention."
|
189 |
-
|
190 |
|
191 |
if memory_efficient:
|
192 |
_verify_xformers_memory_efficient_compat()
|
@@ -231,123 +138,42 @@ class StreamingMultiheadAttention(StreamingModule):
|
|
231 |
state_dict[prefix + "mha." + key] = state_dict.pop(prefix + key)
|
232 |
super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
|
233 |
|
234 |
-
def _get_mask(self, current_steps: int, device: torch.device, dtype: torch.dtype):
|
235 |
-
# Return a causal mask, accounting for potentially stored past keys/values
|
236 |
-
# We actually return a bias for the attention score, as this has the same
|
237 |
-
# convention both in the builtin MHA in Pytorch, and Xformers functions.
|
238 |
-
time_dim = _get_attention_time_dimension(self.memory_efficient)
|
239 |
-
if self.memory_efficient:
|
240 |
-
from xformers.ops import LowerTriangularMask
|
241 |
-
if current_steps == 1:
|
242 |
-
# If we only have one step, then we do not need a mask.
|
243 |
-
return None
|
244 |
-
elif 'past_keys' in self._streaming_state:
|
245 |
-
raise RuntimeError("Not supported at the moment")
|
246 |
-
else:
|
247 |
-
# Then we can safely use a lower triangular mask
|
248 |
-
return LowerTriangularMask()
|
249 |
-
if self._streaming_state:
|
250 |
-
past_keys = self._streaming_state['past_keys']
|
251 |
-
past_steps = past_keys.shape[time_dim]
|
252 |
-
else:
|
253 |
-
past_steps = 0
|
254 |
-
|
255 |
-
queries_pos = torch.arange(
|
256 |
-
past_steps, current_steps + past_steps, device=device).view(-1, 1)
|
257 |
-
keys_pos = torch.arange(past_steps + current_steps, device=device).view(1, -1)
|
258 |
-
delta = queries_pos - keys_pos
|
259 |
-
valid = delta >= 0
|
260 |
-
if self.past_context is not None:
|
261 |
-
valid &= (delta <= self.past_context)
|
262 |
-
return torch.where(
|
263 |
-
valid,
|
264 |
-
torch.zeros([], device=device, dtype=dtype),
|
265 |
-
torch.full([], float('-inf'), device=device, dtype=dtype))
|
266 |
-
|
267 |
-
def _complete_kv(self, k, v):
|
268 |
-
|
269 |
-
time_dim = _get_attention_time_dimension(self.memory_efficient)
|
270 |
-
if self.cross_attention:
|
271 |
-
# With cross attention we assume all keys and values
|
272 |
-
# are already available, and streaming is with respect
|
273 |
-
# to the queries only.
|
274 |
-
return k, v
|
275 |
-
# Complete the key/value pair using the streaming state.
|
276 |
-
if self._streaming_state:
|
277 |
-
pk = self._streaming_state['past_keys']
|
278 |
-
nk = torch.cat([pk, k], dim=time_dim)
|
279 |
-
print('==============CONCAT 1===============')
|
280 |
-
if v is k:
|
281 |
-
nv = nk
|
282 |
-
else:
|
283 |
-
pv = self._streaming_state['past_values']
|
284 |
-
nv = torch.cat([pv, v], dim=time_dim)
|
285 |
-
print('==============CONCAT 2================')
|
286 |
-
else:
|
287 |
-
nk = k
|
288 |
-
nv = v
|
289 |
-
|
290 |
-
assert nk.shape[time_dim] == nv.shape[time_dim]
|
291 |
-
offset = 0
|
292 |
-
if self.past_context is not None:
|
293 |
-
offset = max(0, nk.shape[time_dim] - self.past_context)
|
294 |
-
if self._is_streaming:
|
295 |
-
self._streaming_state['past_keys'] = nk[:, offset:]
|
296 |
-
if v is not k:
|
297 |
-
self._streaming_state['past_values'] = nv[:, offset:]
|
298 |
-
if 'offset' in self._streaming_state:
|
299 |
-
self._streaming_state['offset'] += offset
|
300 |
-
else:
|
301 |
-
self._streaming_state['offset'] = torch.tensor(0)
|
302 |
-
return nk, nv
|
303 |
-
|
304 |
-
def _apply_rope(self, query: torch.Tensor, key: torch.Tensor):
|
305 |
-
time_dim = _get_attention_time_dimension(self.memory_efficient)
|
306 |
-
# Apply rope embeddings to query and key tensors.
|
307 |
-
assert self.rope is not None
|
308 |
-
if 'past_keys' in self._streaming_state:
|
309 |
-
past_keys_offset = self._streaming_state['past_keys'].shape[1]
|
310 |
-
else:
|
311 |
-
past_keys_offset = 0
|
312 |
-
if 'offset' in self._streaming_state:
|
313 |
-
past_context_offset = int(self._streaming_state['offset'].item())
|
314 |
-
else:
|
315 |
-
past_context_offset = 0
|
316 |
-
streaming_offset = past_context_offset + past_keys_offset
|
317 |
-
return self.rope.rotate_qk(query, key, start=streaming_offset, time_dim=time_dim)
|
318 |
|
319 |
-
|
320 |
-
|
321 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
322 |
assert not is_causal, ("New param added in torch 2.0.1 not supported, "
|
323 |
"use the causal args in the constructor.")
|
324 |
-
|
325 |
-
time_dim =
|
326 |
if time_dim == 2:
|
327 |
layout = "b h t d"
|
328 |
else:
|
329 |
layout = "b t h d"
|
330 |
dtype = query.dtype
|
331 |
-
|
332 |
-
assert self.causal or self.cross_attention, \
|
333 |
-
"Streaming only available for causal or cross attention"
|
334 |
|
335 |
custom_attn_mask = attn_mask is not None
|
336 |
|
337 |
-
if self.causal:
|
338 |
-
assert attn_mask is None
|
339 |
-
# At the moment we specialize only for the self-attention case.
|
340 |
-
assert query.shape[1] == key.shape[1], "Causal only for same length query / key / value"
|
341 |
-
assert value.shape[1] == key.shape[1], "Causal only for same length query / key / value"
|
342 |
-
attn_mask = self._get_mask(query.shape[1], query.device, query.dtype)
|
343 |
-
|
344 |
if self.custom:
|
345 |
# custom implementation
|
346 |
assert need_weights is False
|
347 |
assert key_padding_mask is None
|
348 |
if self.cross_attention:
|
349 |
-
#
|
350 |
-
|
|
|
351 |
dim = self.in_proj_weight.shape[0] // 3
|
352 |
if self.in_proj_bias is None:
|
353 |
bias_q, bias_k, bias_v = None, None, None
|
@@ -356,14 +182,23 @@ class StreamingMultiheadAttention(StreamingModule):
|
|
356 |
bias_k = self.in_proj_bias[dim: 2 * dim]
|
357 |
bias_v = self.in_proj_bias[2 * dim:]
|
358 |
q = nn.functional.linear(query, self.in_proj_weight[:dim], bias_q)
|
|
|
359 |
# todo: when streaming, we could actually save k, v and check the shape actually match.
|
360 |
k = nn.functional.linear(key, self.in_proj_weight[dim: 2 * dim], bias_k)
|
361 |
v = nn.functional.linear(value, self.in_proj_weight[2 * dim:], bias_v)
|
362 |
if self.qk_layer_norm is True:
|
363 |
q = self.q_layer_norm(q)
|
364 |
k = self.k_layer_norm(k)
|
|
|
365 |
q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
|
|
|
366 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
367 |
if not _is_profiled():
|
368 |
# profiling breaks that propertysomehow.
|
369 |
assert query is key, "specialized implementation"
|
@@ -374,8 +209,13 @@ class StreamingMultiheadAttention(StreamingModule):
|
|
374 |
bound_layout = "b h p t d"
|
375 |
else:
|
376 |
bound_layout = "b t p h d"
|
|
|
377 |
packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
|
|
|
|
|
|
|
378 |
q, k, v = ops.unbind(packed, dim=2)
|
|
|
379 |
else:
|
380 |
embed_dim = self.embed_dim
|
381 |
per_head_dim = (embed_dim // self.num_heads)
|
@@ -395,12 +235,12 @@ class StreamingMultiheadAttention(StreamingModule):
|
|
395 |
q = self.q_layer_norm(q)
|
396 |
k = self.k_layer_norm(k)
|
397 |
q, k = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k]]
|
398 |
-
|
399 |
-
|
400 |
-
k, v = self._complete_kv(k, v)
|
401 |
if self.kv_repeat > 1:
|
402 |
-
|
403 |
-
|
|
|
404 |
if self.attention_as_float32:
|
405 |
q, k, v = [x.float() for x in [q, k, v]]
|
406 |
if self.memory_efficient:
|
@@ -429,11 +269,8 @@ class StreamingMultiheadAttention(StreamingModule):
|
|
429 |
q = q / q.shape[-1] ** 0.5
|
430 |
key_layout = layout.replace('t', 'k')
|
431 |
query_layout = layout
|
432 |
-
|
433 |
-
|
434 |
-
pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
|
435 |
-
else:
|
436 |
-
pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
|
437 |
if attn_mask is not None:
|
438 |
pre_w = pre_w + attn_mask
|
439 |
w = torch.softmax(pre_w, dim=-1)
|
@@ -444,58 +281,24 @@ class StreamingMultiheadAttention(StreamingModule):
|
|
444 |
x = rearrange(x, f"{layout} -> b t (h d)", h=self.num_heads)
|
445 |
x = self.out_proj(x)
|
446 |
else:
|
447 |
-
|
448 |
-
if self.attention_as_float32:
|
449 |
-
query, key, value = [x.float() for x in [query, key, value]]
|
450 |
-
x, _ = self.mha(
|
451 |
-
query, key, value, key_padding_mask,
|
452 |
-
need_weights, attn_mask, average_attn_weights)
|
453 |
-
x = x.to(dtype)
|
454 |
|
455 |
return x, None
|
456 |
|
457 |
|
458 |
class StreamingTransformerLayer(nn.TransformerEncoderLayer):
|
459 |
-
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
d_model (int): Dimension of the data.
|
465 |
-
num_heads (int): Number of heads.
|
466 |
-
dim_feedforward (int): Intermediate dimension of FF module.
|
467 |
-
dropout (float): Dropout both for MHA and FF.
|
468 |
-
bias_ff (bool): Use bias for FF.
|
469 |
-
bias_attn (bool): Use bias for MHA.
|
470 |
-
causal (bool): Causal mask applied automatically.
|
471 |
-
past_context (int, optional): Receptive field for the causal mask, infinite if None.
|
472 |
-
custom (bool): Use custom MHA implementation, for testing / benchmarking.
|
473 |
-
memory_efficient (bool): Use xformers based memory efficient attention.
|
474 |
-
attention_as_float32 (bool): Perform the attention as float32
|
475 |
-
(especially important with memory_efficient as autocast won't do this automatically).
|
476 |
-
qk_layer_norm (bool): Layer normalization applied to queries and keys before dot product in attention.
|
477 |
-
qk_layer_norm_cross (bool): Same for the cross attention.
|
478 |
-
cross_attention (bool): If True, expect to get secondary input for cross-attention.
|
479 |
-
Cross attention will use the default MHA, as it typically won't require
|
480 |
-
special treatment.
|
481 |
-
layer_scale (float, optional): If not None, LayerScale will be used with
|
482 |
-
the given value as initial scale.
|
483 |
-
rope (`RotaryEmbedding`, optional): Rope embedding to use.
|
484 |
-
attention_dropout (float, optional): If not None, separate the value of the dimension dropout
|
485 |
-
in FFN and of the attention dropout.
|
486 |
-
kv_repeat (int): If > 1, will repeat keys and queries multiple times (need to divide num_heads).
|
487 |
-
This will lead to faster decoding time on A100 or other GPUs with tensorcore.
|
488 |
-
device (torch.device, optional): Device on which to initialize.
|
489 |
-
dtype (torch.dtype, optional): dtype to use.
|
490 |
-
**kwargs: See `nn.TransformerEncoderLayer`.
|
491 |
-
"""
|
492 |
-
def __init__(self, d_model: int, num_heads: int, dim_feedforward: int = 2048, dropout: float = 0.1,
|
493 |
bias_ff: bool = True, bias_attn: bool = True, causal: bool = False,
|
494 |
past_context: tp.Optional[int] = None, custom: bool = False,
|
495 |
memory_efficient: bool = False, attention_as_float32: bool = False,
|
496 |
qk_layer_norm: bool = False, qk_layer_norm_cross: bool = False,
|
497 |
-
cross_attention: bool = False,
|
498 |
-
|
|
|
499 |
kv_repeat: int = 1, norm: str = 'layer_norm', device=None, dtype=None, **kwargs):
|
500 |
super().__init__(d_model, num_heads, dim_feedforward, dropout,
|
501 |
device=device, dtype=dtype, batch_first=True, **kwargs)
|
@@ -511,22 +314,17 @@ class StreamingTransformerLayer(nn.TransformerEncoderLayer):
|
|
511 |
'attention_as_float32': attention_as_float32,
|
512 |
}
|
513 |
self.self_attn: StreamingMultiheadAttention = StreamingMultiheadAttention(
|
514 |
-
causal=causal, past_context=past_context,
|
|
|
|
|
515 |
kv_repeat=kv_repeat, **attn_kwargs, **factory_kwargs) # type: ignore
|
516 |
# Redefine feedforward layers to expose bias parameter
|
517 |
self.linear1 = nn.Linear(d_model, dim_feedforward, bias=bias_ff, **factory_kwargs)
|
518 |
self.linear2 = nn.Linear(dim_feedforward, d_model, bias=bias_ff, **factory_kwargs)
|
519 |
|
520 |
-
|
521 |
-
self.layer_scale_2: nn.Module
|
522 |
-
if layer_scale is None:
|
523 |
-
self.layer_scale_1 = nn.Identity()
|
524 |
-
self.layer_scale_2 = nn.Identity()
|
525 |
-
else:
|
526 |
-
self.layer_scale_1 = LayerScale(d_model, layer_scale, **factory_kwargs)
|
527 |
-
self.layer_scale_2 = LayerScale(d_model, layer_scale, **factory_kwargs)
|
528 |
|
529 |
-
self.cross_attention
|
530 |
if cross_attention:
|
531 |
self.cross_attention = StreamingMultiheadAttention(
|
532 |
cross_attention=True, qk_layer_norm=qk_layer_norm_cross,
|
@@ -535,98 +333,69 @@ class StreamingTransformerLayer(nn.TransformerEncoderLayer):
|
|
535 |
self.dropout_cross = nn.Dropout(dropout)
|
536 |
# eps value matching that used in PyTorch reference implementation.
|
537 |
self.norm_cross = nn.LayerNorm(d_model, eps=1e-5, **factory_kwargs)
|
538 |
-
|
539 |
-
if layer_scale is None:
|
540 |
-
self.layer_scale_cross = nn.Identity()
|
541 |
-
else:
|
542 |
-
self.layer_scale_cross = LayerScale(d_model, layer_scale, **factory_kwargs)
|
543 |
self.norm1 = create_norm_fn(norm, d_model, **factory_kwargs) # type: ignore
|
544 |
self.norm2 = create_norm_fn(norm, d_model, **factory_kwargs) # type: ignore
|
545 |
|
546 |
-
def _cross_attention_block(self,
|
547 |
-
|
548 |
-
|
|
|
549 |
# queries are from src, keys and values from cross_attention_src.
|
550 |
x = self.cross_attention(
|
551 |
src, cross_attention_src, cross_attention_src, need_weights=False)[0]
|
552 |
return self.dropout_cross(x) # type: ignore
|
553 |
|
554 |
-
def forward(self,
|
555 |
-
|
556 |
-
|
557 |
-
|
558 |
-
|
559 |
-
|
560 |
-
|
561 |
x = src
|
562 |
if self.norm_first:
|
563 |
-
|
564 |
-
|
|
|
|
|
|
|
|
|
565 |
if cross_attention_src is not None:
|
566 |
-
x = x + self.
|
567 |
-
|
568 |
-
|
569 |
-
|
|
|
|
|
|
|
|
|
570 |
else:
|
571 |
-
|
572 |
-
|
573 |
-
if cross_attention_src is not None:
|
574 |
-
x = self.norm_cross(
|
575 |
-
x + self.layer_scale_cross(
|
576 |
-
self._cross_attention_block(src, cross_attention_src)))
|
577 |
-
x = self.norm2(x + self.layer_scale_2(self._ff_block(x)))
|
578 |
return x
|
579 |
|
580 |
|
581 |
-
class StreamingTransformer(
|
582 |
-
|
583 |
-
|
584 |
-
Args:
|
585 |
-
d_model (int): Dimension of the data.
|
586 |
-
num_heads (int): Number of heads.
|
587 |
-
dim_feedforward (int): Intermediate dimension of FF module.
|
588 |
-
dropout (float): Dropout both for MHA and FF.
|
589 |
-
bias_ff (bool): Use bias for FF.
|
590 |
-
bias_attn (bool): Use bias for MHA.
|
591 |
-
causal (bool): Causal mask applied automatically.
|
592 |
-
past_context (int, optional): Receptive field for the causal mask, infinite if None.
|
593 |
-
custom (bool): Use custom MHA implementation, for testing / benchmarking.
|
594 |
-
memory_efficient (bool): Use xformers based memory efficient attention.
|
595 |
-
attention_as_float32 (bool): Perform the attention as float32
|
596 |
-
(especially important with memory_efficient as autocast won't do this automatically).
|
597 |
-
cross_attention (bool): If True, expect to get secondary input for cross-attention.
|
598 |
-
layer_scale (float, optional): If not None, LayerScale will be used
|
599 |
-
with the given value as initial scale.
|
600 |
-
positional_embedding (str): Positional embedding strategy (sin, rope, or sin_rope).
|
601 |
-
max_period (float): Maximum period of the time embedding.
|
602 |
-
positional_scale (float): Scale of positional embedding, set to 0 to deactivate.
|
603 |
-
xpos (bool): Apply xpos exponential decay to positional embedding (rope only).
|
604 |
-
lr (float, optional): learning rate override through the `make_optim_group` API.
|
605 |
-
weight_decay (float, optional): Weight_decay override through the `make_optim_group` API.
|
606 |
-
layer_class: (subclass of `StreamingTransformerLayer): class to use
|
607 |
-
to initialize the layers, allowing further customization outside of AudioCraft.
|
608 |
-
checkpointing (str): Checkpointing strategy to reduce memory usage.
|
609 |
-
No checkpointing if set to 'none'. Per layer checkpointing using PyTorch
|
610 |
-
if set to 'torch' (entire layer checkpointed, i.e. linears are evaluated twice,
|
611 |
-
minimal memory usage, but maximal runtime). Finally, `xformers_default` provide
|
612 |
-
a policy for opting-out some operations of the checkpointing like
|
613 |
-
linear layers and attention, providing a middle ground between speed and memory.
|
614 |
-
device (torch.device, optional): Device on which to initialize.
|
615 |
-
dtype (torch.dtype, optional): dtype to use.
|
616 |
-
**kwargs: See `nn.TransformerEncoderLayer`.
|
617 |
-
"""
|
618 |
def __init__(self, d_model: int, num_heads: int, num_layers: int, dim_feedforward: int = 2048,
|
619 |
dropout: float = 0.1, bias_ff: bool = True, bias_attn: bool = True,
|
620 |
causal: bool = False, past_context: tp.Optional[int] = None,
|
621 |
custom: bool = False, memory_efficient: bool = False, attention_as_float32: bool = False,
|
622 |
-
cross_attention: bool = False,
|
623 |
positional_embedding: str = 'sin', max_period: float = 10_000, positional_scale: float = 1.,
|
624 |
-
xpos
|
625 |
-
|
626 |
-
|
|
|
|
|
|
|
|
|
|
|
627 |
super().__init__()
|
628 |
assert d_model % num_heads == 0
|
629 |
-
|
630 |
self.positional_embedding = positional_embedding
|
631 |
self.max_period = max_period
|
632 |
self.positional_scale = positional_scale
|
@@ -634,12 +403,6 @@ class StreamingTransformer(StreamingModule):
|
|
634 |
self.lr = lr
|
635 |
|
636 |
assert positional_embedding in ['sin', 'rope', 'sin_rope']
|
637 |
-
self.rope: tp.Optional[RotaryEmbedding] = None
|
638 |
-
if self.positional_embedding in ['rope', 'sin_rope']:
|
639 |
-
assert _is_custom(custom, memory_efficient)
|
640 |
-
self.rope = RotaryEmbedding(d_model // num_heads, max_period=max_period,
|
641 |
-
xpos=xpos, scale=positional_scale, device=device)
|
642 |
-
|
643 |
self.checkpointing = checkpointing
|
644 |
|
645 |
assert checkpointing in ['none', 'torch', 'xformers_default', 'xformers_mm']
|
@@ -654,7 +417,8 @@ class StreamingTransformer(StreamingModule):
|
|
654 |
dropout=dropout, bias_ff=bias_ff, bias_attn=bias_attn,
|
655 |
causal=causal, past_context=past_context, custom=custom,
|
656 |
memory_efficient=memory_efficient, attention_as_float32=attention_as_float32,
|
657 |
-
cross_attention=cross_attention,
|
|
|
658 |
device=device, dtype=dtype, **kwargs))
|
659 |
|
660 |
if self.checkpointing != 'none':
|
@@ -663,58 +427,37 @@ class StreamingTransformer(StreamingModule):
|
|
663 |
# backward hook inside of FSDP...
|
664 |
layer._magma_checkpointed = True # type: ignore
|
665 |
|
666 |
-
|
667 |
-
method = self.checkpointing
|
668 |
-
if method == 'none':
|
669 |
-
return layer(*args, **kwargs)
|
670 |
-
elif method == 'torch':
|
671 |
-
return torch_checkpoint(layer, *args, use_reentrant=False, **kwargs)
|
672 |
-
elif method.startswith('xformers'):
|
673 |
-
from xformers.checkpoint_fairinternal import checkpoint, _get_default_policy
|
674 |
-
if method == 'xformers_default':
|
675 |
-
# those operations will be saved, and not recomputed.
|
676 |
-
# According to Francisco we can get smarter policies but this is a good start.
|
677 |
-
allow_list = [
|
678 |
-
"xformers.efficient_attention_forward_cutlass.default",
|
679 |
-
"xformers_flash.flash_fwd.default",
|
680 |
-
"aten.addmm.default",
|
681 |
-
"aten.mm.default",
|
682 |
-
]
|
683 |
-
elif method == 'xformers_mm':
|
684 |
-
# those operations will be saved, and not recomputed.
|
685 |
-
# According to Francisco we can get smarter policies but this is a good start.
|
686 |
-
allow_list = [
|
687 |
-
"aten.addmm.default",
|
688 |
-
"aten.mm.default",
|
689 |
-
]
|
690 |
-
else:
|
691 |
-
raise ValueError(f"xformers checkpointing xformers policy {method} is not known.")
|
692 |
-
policy_fn = _get_default_policy(allow_list)
|
693 |
-
return checkpoint(layer, *args, policy_fn=policy_fn, **kwargs)
|
694 |
-
else:
|
695 |
-
raise ValueError(f"Checkpointing method {method} is unknown.")
|
696 |
|
697 |
def forward(self, x: torch.Tensor, *args, **kwargs):
|
698 |
-
|
|
|
699 |
B, T, C = x.shape
|
700 |
|
701 |
-
|
702 |
-
|
703 |
-
else:
|
704 |
-
offsets = torch.zeros(B, dtype=torch.long, device=x.device)
|
705 |
|
706 |
-
if self.positional_embedding in ['sin',
|
|
|
707 |
positions = torch.arange(T, device=x.device).view(1, -1, 1)
|
708 |
-
|
709 |
pos_emb = create_sin_embedding(positions, C, max_period=self.max_period, dtype=x.dtype)
|
710 |
x = x + self.positional_scale * pos_emb
|
711 |
-
|
712 |
-
for
|
713 |
-
|
714 |
-
|
715 |
-
|
716 |
-
|
717 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
718 |
return x
|
719 |
|
720 |
def make_optim_group(self):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import typing as tp
|
|
|
2 |
from einops import rearrange
|
3 |
import torch
|
4 |
import torch.nn as nn
|
5 |
from torch.nn import functional as F
|
|
|
6 |
from xformers import ops
|
7 |
|
|
|
|
|
|
|
8 |
_efficient_attention_backend: str = 'torch'
|
9 |
|
10 |
|
|
|
15 |
_efficient_attention_backend = backend
|
16 |
|
17 |
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
|
20 |
+
|
21 |
+
def _is_profiled():
|
22 |
# Return true if we are currently running with a xformers profiler activated.
|
23 |
try:
|
24 |
from xformers.profiler import profiler
|
|
|
27 |
return profiler._Profiler._CURRENT_PROFILER is not None
|
28 |
|
29 |
|
30 |
+
def create_norm_fn(norm_type, dim, **kwargs):
|
|
|
31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
if norm_type == 'layer_norm':
|
33 |
return nn.LayerNorm(dim, eps=1e-5, **kwargs)
|
34 |
else:
|
|
|
54 |
adim = torch.arange(half_dim, device=positions.device, dtype=dtype).view(1, 1, -1)
|
55 |
max_period_tensor = torch.full([], max_period, device=positions.device, dtype=dtype) # avoid sync point
|
56 |
phase = positions / (max_period_tensor ** (adim / (half_dim - 1)))
|
57 |
+
# print('==============CONCAT 3 ============'
|
58 |
+
return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
|
59 |
|
60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
|
62 |
|
|
|
|
|
|
|
63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
|
65 |
|
|
|
|
|
66 |
|
67 |
+
class StreamingMultiheadAttention(nn.Module):
|
68 |
+
|
69 |
+
def __init__(self,
|
70 |
+
embed_dim,
|
71 |
+
num_heads,
|
72 |
+
dropout=0.0, bias: bool = True,
|
73 |
causal: bool = False, past_context: tp.Optional[int] = None, custom: bool = False,
|
74 |
memory_efficient: bool = False, attention_as_float32: bool = False,
|
75 |
+
cross_attention: bool = False,
|
76 |
+
qk_layer_norm: bool = False, kv_repeat: int = 1,
|
77 |
device=None, dtype=None):
|
78 |
super().__init__()
|
79 |
factory_kwargs = {'device': device, 'dtype': dtype}
|
|
|
85 |
self.past_context = past_context
|
86 |
self.memory_efficient = memory_efficient
|
87 |
self.attention_as_float32 = attention_as_float32
|
88 |
+
|
89 |
self.cross_attention = cross_attention
|
90 |
+
|
91 |
self.num_heads = num_heads
|
92 |
self.dropout = dropout
|
93 |
self.kv_repeat = kv_repeat
|
94 |
if cross_attention:
|
95 |
assert not causal, "Causal cannot work with cross attention."
|
96 |
+
|
97 |
|
98 |
if memory_efficient:
|
99 |
_verify_xformers_memory_efficient_compat()
|
|
|
138 |
state_dict[prefix + "mha." + key] = state_dict.pop(prefix + key)
|
139 |
super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
|
140 |
|
141 |
|
142 |
+
|
143 |
+
|
144 |
+
|
145 |
+
|
146 |
+
|
147 |
+
def forward(self,
|
148 |
+
query,
|
149 |
+
key,
|
150 |
+
value,
|
151 |
+
key_padding_mask=None,
|
152 |
+
need_weights=False,
|
153 |
+
attn_mask=None,
|
154 |
+
is_causal=False):
|
155 |
+
|
156 |
assert not is_causal, ("New param added in torch 2.0.1 not supported, "
|
157 |
"use the causal args in the constructor.")
|
158 |
+
# print(f'{query.shape=} {key.shape=} {value.shape=} MHA')
|
159 |
+
time_dim = 2
|
160 |
if time_dim == 2:
|
161 |
layout = "b h t d"
|
162 |
else:
|
163 |
layout = "b t h d"
|
164 |
dtype = query.dtype
|
165 |
+
|
|
|
|
|
166 |
|
167 |
custom_attn_mask = attn_mask is not None
|
168 |
|
169 |
if self.custom:
|
170 |
# custom implementation
|
171 |
assert need_weights is False
|
172 |
assert key_padding_mask is None
|
173 |
if self.cross_attention:
|
174 |
+
# print('\n\n\n\nCROSS\n\n\n\n')
|
175 |
+
|
176 |
+
|
177 |
dim = self.in_proj_weight.shape[0] // 3
|
178 |
if self.in_proj_bias is None:
|
179 |
bias_q, bias_k, bias_v = None, None, None
|
|
|
182 |
bias_k = self.in_proj_bias[dim: 2 * dim]
|
183 |
bias_v = self.in_proj_bias[2 * dim:]
|
184 |
q = nn.functional.linear(query, self.in_proj_weight[:dim], bias_q)
|
185 |
+
# print(f'{q.shape=} TRANSF FORW who concatenates?')
|
186 |
# todo: when streaming, we could actually save k, v and check the shape actually match.
|
187 |
k = nn.functional.linear(key, self.in_proj_weight[dim: 2 * dim], bias_k)
|
188 |
v = nn.functional.linear(value, self.in_proj_weight[2 * dim:], bias_v)
|
189 |
if self.qk_layer_norm is True:
|
190 |
q = self.q_layer_norm(q)
|
191 |
k = self.k_layer_norm(k)
|
192 |
+
|
193 |
q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
|
194 |
+
# print(f'{q.shape=} {k.shape=} {v.shape=} after rearrange')
|
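Before the self-attention branch below, note that this projection split is what settles the title question: in_proj_weight is cut into thirds, q is projected from query (the decoder stream), and k and v are both projected from key/value, which for cross-attention is the conditioning tensor. A hedged sketch with a hypothetical helper name:

import torch.nn.functional as F

def split_qkv_proj(query, key, value, in_proj_weight, in_proj_bias=None):
    # One packed weight of shape [3*C, C]; rows [:C] make q, [C:2C] make k, [2C:] make v.
    dim = in_proj_weight.shape[0] // 3
    b = (None, None, None) if in_proj_bias is None else in_proj_bias.split(dim)
    q = F.linear(query, in_proj_weight[:dim], b[0])
    k = F.linear(key, in_proj_weight[dim:2 * dim], b[1])
    v = F.linear(value, in_proj_weight[2 * dim:], b[2])
    return q, k, v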
195 |
else:
|
196 |
+
# print('\n\n\n\nSELF\n\n\n\n')
|
197 |
+
#
|
198 |
+
# 47x Transformers selfattn followed by crossattn
|
199 |
+
#
|
200 |
+
# is self-attn over the history (previous keys) or only over the last token?
|
201 |
+
|
202 |
if not _is_profiled():
|
203 |
# profiling breaks that property somehow.
|
204 |
assert query is key, "specialized implementation"
|
|
|
209 |
bound_layout = "b h p t d"
|
210 |
else:
|
211 |
bound_layout = "b t p h d"
|
212 |
+
|
213 |
packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
|
214 |
+
|
215 |
+
|
216 |
+
# print(f'{query.shape=} before unbind') # [2, 1, 4 , 2048] already bs=2
|
217 |
q, k, v = ops.unbind(packed, dim=2)
|
218 |
+
# print(f'{q.shape=} {v.shape=} @L331 transformer.py') # packed is bs=2
|
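A hedged restatement of the packed self-attention path above (hypothetical helper, assuming query is key is value): one matmul against the full in_proj_weight produces q, k and v at once, which are then unbound along the p axis.

import torch.nn.functional as F
from einops import rearrange

def packed_qkv(x, in_proj_weight, num_heads):
    projected = F.linear(x, in_proj_weight)                              # [B, T, 3*C]
    packed = rearrange(projected, "b t (p h d) -> b h p t d", p=3, h=num_heads)
    q, k, v = packed.unbind(dim=2)                                       # each [B, H, T, D]
    return q, k, v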
219 |
else:
|
220 |
embed_dim = self.embed_dim
|
221 |
per_head_dim = (embed_dim // self.num_heads)
|
|
|
235 |
q = self.q_layer_norm(q)
|
236 |
k = self.k_layer_norm(k)
|
237 |
q, k = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k]]
|
238 |
+
|
239 |
+
|
|
|
240 |
if self.kv_repeat > 1:
|
241 |
+
#
|
242 |
+
print('Expand repeat 2')
|
243 |
+
|
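With the default kv_repeat=1 this branch is skipped, so the print above never fires in this run. If kv_repeat were larger, the usual approach is to repeat the smaller set of k/v heads across the query heads, GQA-style; a sketch with a hypothetical helper, not the repo's exact code:

def expand_kv(k, v, kv_repeat: int):
    # k, v: [B, H_kv, T, D] -> [B, H_kv * kv_repeat, T, D]
    if kv_repeat == 1:
        return k, v
    return k.repeat_interleave(kv_repeat, dim=1), v.repeat_interleave(kv_repeat, dim=1)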
244 |
if self.attention_as_float32:
|
245 |
q, k, v = [x.float() for x in [q, k, v]]
|
246 |
if self.memory_efficient:
|
|
|
269 |
q = q / q.shape[-1] ** 0.5
|
270 |
key_layout = layout.replace('t', 'k')
|
271 |
query_layout = layout
|
272 |
+
|
273 |
+
pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
|
|
|
|
|
|
|
274 |
if attn_mask is not None:
|
275 |
pre_w = pre_w + attn_mask
|
276 |
w = torch.softmax(pre_w, dim=-1)
|
|
|
281 |
x = rearrange(x, f"{layout} -> b t (h d)", h=self.num_heads)
|
282 |
x = self.out_proj(x)
|
283 |
else:
|
284 |
+
raise NotImplementedError
|
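A hedged sketch of the fallback attention math used when memory_efficient is off: scale q, build the [b, h, t, k] score matrix with an einsum like the one above, add the additive mask, softmax over k, then contract with v:

import torch

def plain_attention(q, k, v, attn_mask=None):
    # q: [B, H, T_q, D]; k, v: [B, H, T_k, D]
    q = q / q.shape[-1] ** 0.5
    pre_w = torch.einsum("bhtd,bhkd->bhtk", q, k)
    if attn_mask is not None:
        pre_w = pre_w + attn_mask
    w = torch.softmax(pre_w, dim=-1)
    return torch.einsum("bhtk,bhkd->bhtd", w, v)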
285 |
|
286 |
return x, None
|
287 |
|
288 |
|
289 |
class StreamingTransformerLayer(nn.TransformerEncoderLayer):
|
290 |
+
def __init__(self,
|
291 |
+
d_model,
|
292 |
+
num_heads,
|
293 |
+
dim_feedforward=2048,
|
294 |
+
dropout=0.1,
|
295 |
bias_ff: bool = True, bias_attn: bool = True, causal: bool = False,
|
296 |
past_context: tp.Optional[int] = None, custom: bool = False,
|
297 |
memory_efficient: bool = False, attention_as_float32: bool = False,
|
298 |
qk_layer_norm: bool = False, qk_layer_norm_cross: bool = False,
|
299 |
+
cross_attention: bool = False,
|
300 |
+
# rope=None,
|
301 |
+
attention_dropout: tp.Optional[float] = None,
|
302 |
kv_repeat: int = 1, norm: str = 'layer_norm', device=None, dtype=None, **kwargs):
|
303 |
super().__init__(d_model, num_heads, dim_feedforward, dropout,
|
304 |
device=device, dtype=dtype, batch_first=True, **kwargs)
|
|
|
314 |
'attention_as_float32': attention_as_float32,
|
315 |
}
|
316 |
self.self_attn: StreamingMultiheadAttention = StreamingMultiheadAttention(
|
317 |
+
causal=causal, past_context=past_context,
|
318 |
+
# rope=rope,
|
319 |
+
qk_layer_norm=qk_layer_norm,
|
320 |
kv_repeat=kv_repeat, **attn_kwargs, **factory_kwargs) # type: ignore
|
321 |
# Redefine feedforward layers to expose bias parameter
|
322 |
self.linear1 = nn.Linear(d_model, dim_feedforward, bias=bias_ff, **factory_kwargs)
|
323 |
self.linear2 = nn.Linear(dim_feedforward, d_model, bias=bias_ff, **factory_kwargs)
|
324 |
|
325 |
+
|
|
|
326 |
|
327 |
+
self.cross_attention = None # default
|
328 |
if cross_attention:
|
329 |
self.cross_attention = StreamingMultiheadAttention(
|
330 |
cross_attention=True, qk_layer_norm=qk_layer_norm_cross,
|
|
|
333 |
self.dropout_cross = nn.Dropout(dropout)
|
334 |
# eps value matching that used in PyTorch reference implementation.
|
335 |
self.norm_cross = nn.LayerNorm(d_model, eps=1e-5, **factory_kwargs)
|
336 |
+
|
|
|
|
|
|
|
|
|
337 |
self.norm1 = create_norm_fn(norm, d_model, **factory_kwargs) # type: ignore
|
338 |
self.norm2 = create_norm_fn(norm, d_model, **factory_kwargs) # type: ignore
|
339 |
|
340 |
+
def _cross_attention_block(self,
|
341 |
+
src,
|
342 |
+
cross_attention_src):
|
343 |
+
|
344 |
# queries are from src, keys and values from cross_attention_src.
|
345 |
x = self.cross_attention(
|
346 |
src, cross_attention_src, cross_attention_src, need_weights=False)[0]
|
347 |
return self.dropout_cross(x) # type: ignore
|
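This block is the answer to the title question: the first argument (the query) is src, the decoder stream, while cross_attention_src is passed twice, as key and as value. A minimal equivalent with plain nn.MultiheadAttention, dimensions taken from the debug prints below and an arbitrary head count:

import torch
import torch.nn as nn

mha = nn.MultiheadAttention(embed_dim=1536, num_heads=8, batch_first=True)
src = torch.randn(2, 2, 1536)                  # decoder tokens      [B, T_q, C]
cross_attention_src = torch.randn(2, 4, 1536)  # text conditioning   [B, T_k, C]
out, _ = mha(src, cross_attention_src, cross_attention_src, need_weights=False)
print(out.shape)  # torch.Size([2, 2, 1536]), i.e. same length as the queries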
348 |
|
349 |
+
def forward(self,
|
350 |
+
src,
|
351 |
+
src_mask=None,
|
352 |
+
src_key_padding_mask=None, # key = value = looooong; I think I pass them inverted
|
353 |
+
cross_attention_src=None):
|
354 |
+
|
355 |
+
|
356 |
x = src
|
357 |
if self.norm_first:
|
358 |
+
# print('selfattn', x.shape, src_mask, src_key_padding_mask)
|
359 |
+
x = x + self._sa_block(self.norm1(x),
|
360 |
+
src_mask, #None
|
361 |
+
src_key_padding_mask # None
|
362 |
+
) # Internal nn
|
363 |
+
# print('crossattn', x.shape, cross_attention_src.shape)
|
364 |
if cross_attention_src is not None:
|
365 |
+
x = x + self._cross_attention_block(
|
366 |
+
self.norm_cross(x),
|
367 |
+
cross_attention_src)
|
368 |
+
# selfattn torch.Size([2, 2, 1536]) None None NO 4D TOKEN!
|
369 |
+
# crossattn torch.Size([2, 2, 1536]) torch.Size([2, 4, 1536])
|
370 |
+
else:
|
371 |
+
raise NotImplementedError # all layers have a self & cross?
|
372 |
+
x = x + self._ff_block(self.norm2(x))
|
373 |
else:
|
374 |
+
print('NLAST')
|
375 |
+
# print('NT', x.shape) # [1,2 ,1536]
|
376 |
return x
|
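Restated as a hedged pure-function sketch, the norm_first path above is the standard pre-norm ordering, each sub-block behind a residual connection:

def prenorm_layer(x, cond, sa_block, ca_block, ff_block, norm1, norm_cross, norm2):
    x = x + sa_block(norm1(x))              # self-attention over the decoder tokens only
    x = x + ca_block(norm_cross(x), cond)   # queries from x, keys/values from cond
    x = x + ff_block(norm2(x))              # position-wise feed-forward
    return x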
377 |
|
378 |
|
379 |
+
class StreamingTransformer(nn.Module):
|
380 |
+
'''layer_class=<class 'audiocraft.transformer.StreamingTransformerLayer'> StrTrnsf'''
|
381 |
+
|
382 |
def __init__(self, d_model: int, num_heads: int, num_layers: int, dim_feedforward: int = 2048,
|
383 |
dropout: float = 0.1, bias_ff: bool = True, bias_attn: bool = True,
|
384 |
causal: bool = False, past_context: tp.Optional[int] = None,
|
385 |
custom: bool = False, memory_efficient: bool = False, attention_as_float32: bool = False,
|
386 |
+
cross_attention: bool = False,
|
387 |
positional_embedding: str = 'sin', max_period: float = 10_000, positional_scale: float = 1.,
|
388 |
+
xpos=False,
|
389 |
+
lr=None,
|
390 |
+
weight_decay=None,
|
391 |
+
layer_class=StreamingTransformerLayer,
|
392 |
+
checkpointing='none',
|
393 |
+
device=None,
|
394 |
+
dtype=None,
|
395 |
+
**kwargs):
|
396 |
super().__init__()
|
397 |
assert d_model % num_heads == 0
|
398 |
+
|
399 |
self.positional_embedding = positional_embedding
|
400 |
self.max_period = max_period
|
401 |
self.positional_scale = positional_scale
|
|
|
403 |
self.lr = lr
|
404 |
|
405 |
assert positional_embedding in ['sin', 'rope', 'sin_rope']
|
406 |
self.checkpointing = checkpointing
|
407 |
|
408 |
assert checkpointing in ['none', 'torch', 'xformers_default', 'xformers_mm']
|
|
|
417 |
dropout=dropout, bias_ff=bias_ff, bias_attn=bias_attn,
|
418 |
causal=causal, past_context=past_context, custom=custom,
|
419 |
memory_efficient=memory_efficient, attention_as_float32=attention_as_float32,
|
420 |
+
cross_attention=cross_attention,
|
421 |
+
# rope=self.rope,
|
422 |
device=device, dtype=dtype, **kwargs))
|
423 |
|
424 |
if self.checkpointing != 'none':
|
|
|
427 |
# backward hook inside of FSDP...
|
428 |
layer._magma_checkpointed = True # type: ignore
|
429 |
|
430 |
+
|
431 |
|
432 |
def forward(self, x: torch.Tensor, *args, **kwargs):
|
433 |
+
# print(f'{x.shape=} StreamingTransf') # [1, 1, 1536] Always no batch==2 here
|
434 |
+
# why is this called with time-len = 1? Shouldn't it be called with context?
|
435 |
B, T, C = x.shape
|
436 |
|
437 |
+
|
438 |
+
|
|
|
|
|
439 |
|
440 |
+
if self.positional_embedding in ['sin',
|
441 |
+
'sin_rope']:
|
442 |
positions = torch.arange(T, device=x.device).view(1, -1, 1)
|
443 |
+
|
444 |
pos_emb = create_sin_embedding(positions, C, max_period=self.max_period, dtype=x.dtype)
|
445 |
x = x + self.positional_scale * pos_emb
|
446 |
+
# UNTIL HERE BATCH=1
|
447 |
+
for _, lay in enumerate(self.layers):
|
448 |
+
# if _ < 2:
|
449 |
+
# L=0 [1,1,1536]
|
450 |
+
# L=1 [2,1,1536]
|
451 |
+
|
452 |
+
print(f'L={_} {args=} {kwargs["cross_attention_src"].shape=} {x.shape=} StreamTransf ForLoop') # [2, 1, 1536] BATCH=2
|
453 |
+
# x = self._apply_layer(layer, x, *args, **kwargs)
|
454 |
+
# x = lay(x, **kwargs)
|
455 |
+
x = lay(x,
|
456 |
+
cross_attention_src=kwargs["cross_attention_src"],
|
457 |
+
src_mask=kwargs['src_mask'])
|
458 |
+
# concat old token to query? no, not here; that happens in lm generate
|
459 |
+
print('OUT OF Tall', x.shape) # [1,2,1536] # why does this get filled with the sequence 1,2...
|
460 |
+
# should be 1 query
|
461 |
return x
|
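On the question in the comments about T growing 1, 2, ...: without a streaming KV cache the generation loop re-feeds the whole token prefix at every step, so this forward sees a longer sequence on each call. A hedged sketch of that outer loop (hypothetical step_fn, not the repo's lm.generate):

import torch

def naive_generate(step_fn, first_token: torch.Tensor, max_len: int) -> torch.Tensor:
    seq = first_token                       # [B, 1]
    for _ in range(max_len - 1):
        logits = step_fn(seq)               # forward over the full prefix, T = seq.shape[1]
        next_tok = logits[:, -1:].argmax(dim=-1)
        seq = torch.cat([seq, next_tok], dim=1)
    return seq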
462 |
|
463 |
def make_optim_group(self):
|
demo.py
CHANGED
@@ -7,7 +7,7 @@ print('\n\n\n\n___________________')
|
|
7 |
txt = 'dogs in street'
|
8 |
|
9 |
sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
|
10 |
-
sound_generator.set_generation_params(duration
|
11 |
|
12 |
x = sound_generator.generate([txt])[0].detach().cpu().numpy()[0, :]
|
13 |
x /= np.abs(x).max() + 1e-7
|
|
|
7 |
txt = 'dogs in street'
|
8 |
|
9 |
sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
|
10 |
+
sound_generator.set_generation_params(duration=1.24) # why is the generation so long, at 14 seconds?
|
11 |
|
12 |
x = sound_generator.generate([txt])[0].detach().cpu().numpy()[0, :]
|
13 |
x /= np.abs(x).max() + 1e-7
|
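Not part of the diff: one hedged way to inspect the normalized result, assuming the soundfile package is installed and that sound_generator.sample_rate exposes the codec sample rate:

import soundfile as sf
sf.write('dogs_in_street.wav', x, sound_generator.sample_rate)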