revert pattern preserves 4

- audiocraft/builders.py +6 -18
- audiocraft/conditioners.py +1 -31
- audiocraft/lm.py +86 -142
- demo.py +5 -3
audiocraft/builders.py CHANGED

@@ -11,7 +11,6 @@ from .lm import LMModel
 from .seanet import SEANetDecoder
 from .codebooks_patterns import DelayedPatternProvider
 from .conditioners import (
-    ConditionFuser,
     ConditioningProvider,
     T5Conditioner,
     ConditioningAttributes
@@ -78,11 +77,9 @@ class AudioGen(nn.Module):
             ConditioningAttributes(text={'description': d}) for d in descriptions]
         gen_tokens = self.lm.generate(
             conditions=attributes,
-            max_gen_len=int(self.duration * self.frame_rate)) #[
-        x = self.compression_model.decode(gen_tokens, None) #[
-
-        x = x.reshape(1, n_draw * n_time_samples) # linearise n_draw
-        print('______________\nGENTOk 5', gen_tokens)
+            max_gen_len=int(self.duration * self.frame_rate)) # [bs, 4, 37 * self.lm.n_draw]
+        x = self.compression_model.decode(gen_tokens, None) #[bs, 1, 11840]
+        print('______________\nGENTOk 5', gen_tokens.shape)
         print('GENAUD 5', x.sum())
         return x
@@ -147,13 +144,13 @@ class AudioGen(nn.Module):
         attribute_dropout = dict_from_config(getattr(cfg, 'attribute_dropout'))
         cls_free_guidance = dict_from_config(getattr(cfg, 'classifier_free_guidance'))
         cfg_prob, cfg_coef = cls_free_guidance['training_dropout'], cls_free_guidance['inference_coef']
-
+
         condition_provider = self.get_conditioner_provider(kwargs["dim"], cfg
                                                            ).to(self.device)


-        if len(fuser.fuse2cond['cross']) > 0: # enforce cross-att programmatically
-
+        # if len(fuser.fuse2cond['cross']) > 0: # enforce cross-att programmatically
+        kwargs['cross_attention'] = True
         if codebooks_pattern_cfg.modeling is None:
             print('Q MODELING\n=\n=><')
             assert q_modeling is not None, \
@@ -166,7 +163,6 @@ class AudioGen(nn.Module):
         return LMModel(
             pattern_provider=pattern_provider,
             condition_provider=condition_provider,
-            fuser=fuser,
             cfg_dropout=cfg_prob,
             cfg_coef=cfg_coef,
             attribute_dropout=attribute_dropout,
@@ -202,14 +198,6 @@ class AudioGen(nn.Module):
         return ConditioningProvider(conditioners)


-    def get_condition_fuser(self, cfg):
-        """Instantiate a condition fuser object."""
-        fuser_cfg = getattr(cfg, 'fuser')
-        fuser_methods = ['sum', 'cross', 'prepend', 'input_interpolate']
-        fuse2cond = {k: fuser_cfg[k] for k in fuser_methods}
-        kwargs = {k: v for k, v in fuser_cfg.items() if k not in fuser_methods}
-        fuser = ConditionFuser(fuse2cond=fuse2cond, **kwargs)
-        return fuser


     def get_codebooks_pattern_provider(self, n_q, cfg):
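Note on the shape comments in the second hunk: the token count comes from duration * frame_rate, and the decoded sample count from the compression model's hop size. A quick check of that arithmetic, assuming a 50 Hz frame rate and a 320-sample hop for the 16 kHz EnCodec-style codec (both values are assumptions, not stated in this commit):

    # duration=.74 is the value demo.py now passes; frame_rate=50 and hop=320 are assumed
    duration, frame_rate, hop = 0.74, 50, 320
    max_gen_len = int(duration * frame_rate)  # 37 token frames, as in '37 * self.lm.n_draw'
    n_samples = max_gen_len * hop             # 11840 samples, as in '[bs, 1, 11840]'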
audiocraft/conditioners.py CHANGED

@@ -4,7 +4,6 @@ import logging
 import random
 import typing as tp
 import warnings
-import soundfile
 from transformers import T5EncoderModel, T5Tokenizer # type: ignore
 import torch
 from torch import nn
@@ -243,33 +242,4 @@ class ConditioningProvider(nn.Module):
         for text in texts:
             for condition in self.text_conditions:
                 out[condition].append(text[condition])
-        return out
-
-
-
-
-
-
-class ConditionFuser(nn.Module):
-
-    FUSING_METHODS = ["sum", "prepend", "cross", "input_interpolate"]
-
-    def __init__(self, fuse2cond: tp.Dict[str, tp.List[str]], cross_attention_pos_emb: bool = False,
-                 cross_attention_pos_emb_scale: float = 1.0):
-        super().__init__()
-        assert all(
-            [k in self.FUSING_METHODS for k in fuse2cond.keys()]
-        ), f"Got invalid fuse method, allowed methods: {self.FUSING_METHODS}"
-        self.cross_attention_pos_emb = cross_attention_pos_emb
-        self.cross_attention_pos_emb_scale = cross_attention_pos_emb_scale
-        self.fuse2cond: tp.Dict[str, tp.List[str]] = fuse2cond
-        self.cond2fuse: tp.Dict[str, str] = {}
-        for fuse_method, conditions in fuse2cond.items():
-            for condition in conditions:
-                self.cond2fuse[condition] = fuse_method
-
-    def forward(
-        self,
-        input,
-        conditions):
-        return input, conditions['description'][0] #cross_attention_output
+        return out
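Note: the deleted ConditionFuser had already degenerated into a pass-through; its forward ignored every fusing method and returned the input unchanged plus the T5 'description' embedding as the cross-attention source. That is why lm.py can now index the provider output directly. A minimal sketch of the equivalence (batch size, text length, and the 1536 width follow the comments in lm.py; the tensors here are dummies):

    import torch

    condition_tensors = {'description': [torch.randn(2, 7, 1536)]}  # [bs, textlen, 1536]
    input_ = torch.randn(2, 37, 1536)

    # old: input_, cross = self.fuser(input_, condition_tensors)
    # new: take the cross-attention source straight from the provider output
    cross_attention_src = condition_tensors['description'][0]
    assert cross_attention_src.shape == (2, 7, 1536)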
audiocraft/lm.py CHANGED

@@ -10,31 +10,7 @@ from functools import partial
 from torch import nn
 from audiocraft.activations import get_activation_fn
 
-def sample_top_k(p, k=1, n_draw=None):
-    """
-    p probabs 2048 ?
-    num_draw : how many tokens to sample (for duplicate elongation)
-    """
 
-    p = torch.softmax(p, dim=-1) # p/temp
-
-
-
-    top_k_value, i250 = torch.topk(p, k, dim=-1) # probs: [1, 4, 2048]
-    # print('\n_____TOPK________\n', top_k_value.shape, top_k_value[0, 0, :10], '\n___________END_TOPK____________\n')
-    min_value_top_k = top_k_value[..., [-1]] #
-    p *= (p >= min_value_top_k).float()
-    p.div_(p.sum(dim=-1, keepdim=True))
-    # -- next_token = multinomial(probs, num_samples=num_draw)
-
-    # RESHAPED into bs, 4, 250
-    p_ = p.reshape(-1, p.shape[-1])
-
-
-    out = torch.multinomial(p_,
-                            num_samples=n_draw,
-                            replacement=False) # [4, num_draw]
-    return out.transpose(0, 1)[:, :, None] # [num_draw, 4, 1]
 
 
 
@@ -160,21 +136,26 @@ class LMModel(nn.Module):
     def __init__(self,
                  pattern_provider,
                  condition_provider,
-
-
-
-
-
-
-
+                 n_q: int = 8,
+                 card: int = 1024,
+                 dim: int = 128,
+                 num_heads: int = 8,
+                 hidden_scale: int = 4,
+                 norm: str = 'layer_norm',
+                 norm_first: bool = False,
+                 emb_lr: tp.Optional[float] = None,
+                 bias_proj: bool = True,
+                 weight_init: tp.Optional[str] = None,
+                 depthwise_init: tp.Optional[str] = None,
+                 zero_bias_init: bool = False, cfg_dropout: float = 0,
+                 cfg_coef: float = 1.0,
+                 two_step_cfg: bool = False,
                  **kwargs):
         super().__init__()
         self.cfg_coef = cfg_coef
-
-        self.n_draw = 1
         self.condition_provider = condition_provider
-        self.fuser = fuser
         self.card = card # 2048 ?
+        self.n_draw = 8 # replicate so many times the generation of each text in batch
         embed_dim = self.card + 1
         self.n_q = n_q
         self.dim = dim
@@ -251,37 +232,39 @@ class LMModel(nn.Module):
     @property
     def special_token_id(self) -> int:
         return self.card
+
+    def sample_top_k(self, p, k=249):
+        bs, _, _, hidden = p.shape # logits [3, 4, 1, 2048]
 
-
-
-
+        p = torch.softmax(p, dim=3)
+        top_k_value, i250 = torch.topk(p, k, dim=3) # [3, 4, 1, k]
+        min_value_top_k = top_k_value[:, :, :, -1:]
+        p *= (p >= min_value_top_k).float() # zero low probs
+        p.div_(p.sum(dim=-1, keepdim=True)) # renormalise on non-zero probs
+
+
+        # BRING THE nq = 4 IN BATCH
+        p = p.reshape(bs * self.n_q, hidden)
+        out = torch.multinomial(p, # p=[bs,2048], out=[bs, num_samples]
+                                num_samples=self.n_draw,
+                                replacement=False) # [bs*4, self.n_draw]
+        return out.reshape(bs, self.n_q, self.n_draw).transpose(1, 2) # [bs, self.n_draw, 4]
 
     def forward(self,
                 sequence,
                 condition_tensors=None,
                 token_count=None):
-
-        input_ = sum([self.emb[k](sequence[:, k]) for k in range(
-        # input_, cross_attention_input = self.fuser(input_, condition_tensors)
-        cross_attention_input = condition_tensors['description'][0]
-
-        # print(f'{input_.shape=}')
+
+        input_ = sum([self.emb[k](sequence[:, k]) for k in range(self.n_q)])
         out = self.transformer(input_,
-                               cross_attention_src=
+                               cross_attention_src=condition_tensors['description'][0],
                                token_count=token_count)
         if self.out_norm:
             out = self.out_norm(out)
-        # K = 2 because of llm producing 2 tokens?
-        # so only 2 x sel.flinear() of 4 are used ?
-        # WHy torch.stack is in dim=1
-        logits = torch.stack([self.linears[k](out) for k in range(K)], dim=1) # [B, K, S, card]
-        # print(f'{input_.shape=} {out.shape=} {cross_attention_input.shape=} {logits.shape=} FUSER LLM')
-        # remove the prefix from the model outputs
-        # if len(self.fuser.fuse2cond['prepend']) > 0:
-        #     logits = logits[:, :, -S:]
-        #     print('==========================================PRESFIX')
 
-
+        logits = torch.stack([self.linears[k](out) for k in range(self.n_q)], dim=1)
+
+        return logits # [bs, 4, 1, 2048]
 
 
     # GENERATE class revert_codebook_patterns()
@@ -289,7 +272,6 @@ class LMModel(nn.Module):
     def generate(self,
                  prompt = None,
                  conditions = [],
-                 num_samples = 1, # N next token
                  max_gen_len=256):
 
         print(f'{prompt=} {conditions=}')
@@ -299,7 +281,8 @@ class LMModel(nn.Module):
 
 
         tokenized = self.condition_provider.tokenize(conditions)
-
+
+        # print(f'TOKENIZ, {tokenized.keys()=}, {tokenized=}') # 'description'
         # TOKENIZ {'description': {'input_ids': tensor([[3887, 16, 2815, 1],
         #         [3887, 16, 2815, 1]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1],
         #         [1, 1, 1, 1]], device='cuda:0')}}
@@ -307,105 +290,66 @@ class LMModel(nn.Module):
         cfg_conditions = self.condition_provider(tokenized)
 
 
-
-
-
-
-
-
-
-
-
-        pattern = self.pattern_provider.get_pattern(max_gen_len) # duplicate sequence
-        # this token is used as default value for codes that are not generated yet ?
-        unknown_token = -1
-
-
-        gen_codes = torch.full((B, K, max_gen_len), unknown_token, dtype=torch.long, device=device)
-
-        gen_codes[..., :start_offset] = prompt # place 0
-
-        _gen_sequence, _, mask = pattern.build_pattern_sequence(gen_codes, self.special_token_id)
+        # print(f'CFGcon, {cfg_conditions.keys()=}, {cfg_conditions["description"][0].shape=}')
+        # USE THIS ATTENTION MASK IF NOT SAME LEN;
+        bs, _7, _1536 = cfg_conditions['description'][0].shape # [bs, textlen, 1536]
+        pattern = self.pattern_provider.get_pattern(max_gen_len)
+        gen_codes = torch.full((bs,
+                                self.n_q,
+                                max_gen_len), -1, dtype=torch.long, device=device)
 
+        gen_sequence, _, mask = pattern.build_pattern_sequence(gen_codes, self.special_token_id)
+        _, _, audiodur = gen_sequence.shape # bs, 4, 7=audiodur
 
+        # print(gen_sequence.shape, mask.shape, 'F') # mask has no batch = [4,audio_duration]
+        # print(f'{mask=}')
+        #
+        # torch.Size([3, 4, 7]) torch.Size([4, 7]) F
+        # mask=tensor([[False, True, True, True, False, False, False],
+        #              [False, False, True, True, True, False, False],
+        #              [False, False, False, True, True, True, False],
+        #              [False, False, False, False, True, True, True]], device='cuda:0')
 
+        mask = mask[None, None, :, :].repeat(bs, self.n_draw, 1, 1) # [bs, n_draw, 4, audio duration]
+        gen_sequence = gen_sequence[:, None, :, :].repeat(1, self.n_draw, 1, 1) # bs,n_draw,4,dur
 
 
 
-
-        # print(mask.shape, mask.sum(), 'MSK LM')
-        # torch.Size([4, 39]) tensor(140, device='cuda:0') MSK LM ? Fully 1 normal no special token
-        # --\
-
-        # list - Elongation for take-5 next tokens - n_draw 5 tokens at each time-step
-        # append them at end of sequence
-        duplicate_draw = [
-            _gen_sequence[:, :, 0:1].repeat(self.n_draw, 1, 1)
-        ]
-
-
-        for offset in range(1, _gen_sequence.shape[2]):
-
-
+        for offset in range(1, audiodur):
 
-
-
-
-            logits = self.forward(_gen_sequence[:, :, offset-1:offset], # bs/n_draw, 4, 1
+            # pass only 0-th draw in forward
+            logits = self.forward(gen_sequence[:, 0, :, offset-1:offset],
                                   condition_tensors=cfg_conditions,
-                                  token_count=offset)
-
-            # print(f'BEF {logits.shape=} BEF utils.SampleTop5') # AGREES 4 BEF logits.shape=torch.Size([1, 4, 1, 2048]) BEF utils.SampleTop5
-            next_token = sample_top_k(logits, n_draw=self.n_draw) # [1,4,2048] logits
+                                  token_count=offset) # [bs, 4, 1, 2048]
 
 
-
-            _gen_sequence[:, :, offset] = next_token[0, :, 0] # next_token=[1,4,6] gen_seq=[1, 4, 39]
-
-            duplicate_draw.append(next_token)
+            next_token = self.sample_top_k(logits) # [bs, n_draw, 4]
 
-
+            # MASK is not full 1---- HAS 4 x audioduration PATTERN
+            m = mask[:, :, :, offset]
+            next_token[~m] = self.special_token_id
+            gen_sequence[:, :, :, offset] = torch.where(
+                gen_sequence[:, :, :, offset] == -1, #unknown_token,
+                next_token,
+                gen_sequence[:, :, :, offset]
            )
 
 
-
-
-        #
-
-
-
-
-
-
-
-        print(
-
-        #
-        out_codes, _, _ = pattern.revert_pattern_sequence(gen_sequence,
-                                                          special_token=unknown_token)
-
-
-        # set(out_codes.unique().tolist()) - set(gen_sequence.unique().tolist()) # set()
-
-        # UNIQUE are the SAME ---------------?> is it rearrange
-
-
-
-        # ARE SOME PARTS IGNORED OR RE-ARRANGED
-
-        # print(f'{unknown_token=} {gen_sequence.shape=} {out_codes.shape=}')
-        # -> unknown tokn = -1 or 2048
-        # unknown_token=-1
-
-        print(f' <=> CODES {out_codes.shape=} {out_codes.min()} {out_codes.max()}\n') # ARRIVES here also if special
-
-        # unknown_token=-1 gen_sequence.shape=torch.Size([1, 4, 39]) out_codes.shape=torch.Size([1, 4, 35])
-        # <=> CODES out_codes.shape=torch.Size([1, 4, 35]) 30 2024
-
-
-
-        # Clean Transformer MHA k_history v_history
+        # 1. reshape n_draw as bs * n_draw
+        # 2. invert all short-sequences
+        # 3. reshape bs * n_draw -> bs, n_draw * audiodur ELONGATION
+        out_codes, _, _ = pattern.revert_pattern_sequence(
+            gen_sequence.reshape(bs * self.n_draw, 4, audiodur), # [3,8,4,7]
+            special_token=-1)
+        # print(f'{gen_sequence.shape=} {out_codes.shape=} Ha') # REVERT PATTERN REDUCES DURATION?
+        _, _, new_len = out_codes.shape # 4 IS PRESERVED AFTER REVERT!
+        out_codes = out_codes.reshape(bs, self.n_draw, 4, new_len)
+        out_codes = out_codes.transpose(1, 2).reshape(bs, 4, self.n_draw * new_len)
+        print(out_codes.shape, 'o')
+
+        # Clear Transformer k/v history (Different history is kept by 48x selfattn)
         for lay in self.transformer.layers:
             lay.self_attn.k_history = None
             lay.self_attn.v_history = None
-
-        return out_codes
+
+        return out_codes
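Two notes on the new generate loop. First, the mask printed in the comments is the staircase of the delayed codebook pattern: with 4 codebooks, codebook k only carries a real token k steps after codebook 0, which is why invalid positions are overwritten with special_token_id before being committed to gen_sequence. Second, the new sample_top_k method folds the 4 codebooks into the batch dimension so that a single multinomial call draws n_draw candidates per codebook. A self-contained sketch of both, using the sizes from the printed shapes (bs=3, n_draw=8, card=2048); the mask formula is a reconstruction, assuming one leading special step and 3 real frames over a padded duration of 7:

    import torch

    # 1) the [4, 7] staircase mask from the comments: codebook k is valid at steps k+1 .. k+3
    n_q, dur = 4, 7
    steps = torch.arange(dur)[None, :]                     # [1, 7]
    delay = torch.arange(n_q)[:, None]                     # [4, 1]
    mask = (steps > delay) & (steps <= delay + dur - n_q)  # True = real token, False = special

    # 2) batched top-k sampling, mirroring the new LMModel.sample_top_k
    def sample_top_k(p, n_q=4, n_draw=8, k=249):
        bs, _, _, hidden = p.shape                     # logits [bs, n_q, 1, card]
        p = torch.softmax(p, dim=3)
        top_k_value, _ = torch.topk(p, k, dim=3)       # [bs, n_q, 1, k]
        p *= (p >= top_k_value[:, :, :, -1:]).float()  # zero probs below the k-th largest
        p.div_(p.sum(dim=-1, keepdim=True))            # renormalise the survivors
        p = p.reshape(bs * n_q, hidden)                # bring the n_q codebooks into the batch
        out = torch.multinomial(p, num_samples=n_draw, replacement=False)
        return out.reshape(bs, n_q, n_draw).transpose(1, 2)  # [bs, n_draw, n_q]

    print(mask.int())                                       # matches the tensor printed above
    print(sample_top_k(torch.randn(3, 4, 1, 2048)).shape)   # torch.Size([3, 8, 4])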
demo.py CHANGED

@@ -1,11 +1,13 @@
 import audiofile
 import numpy as np
 from audiocraft import AudioGen
-
+text_list = ['dogs barging in the street', 'people po']
 
-sound_generator = AudioGen(duration=.
+sound_generator = AudioGen(duration=.74,
                            device='cuda:0').to('cuda:0').eval()
-x = sound_generator.generate(
+x = sound_generator.generate(text_list) # [bs, 1, 7680]
+# print('demo', x.shape)
+x = x[0, :, :].detach().cpu().numpy()
 x /= np.abs(x).max() + 1e-7
 
 audiofile.write('del_seane.wav', x, 16000)
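Note: generate() returns the n_draw variants concatenated along the time axis (the 'linearise n_draw' comment removed from builders.py), so del_seane.wav contains all draws back to back. A hypothetical post-processing step to split them apart, assuming the decoded length divides evenly by n_draw and ignoring codec edge effects at the seams:

    n_draw = 8                     # must match LMModel.n_draw in this commit
    draws = x.reshape(n_draw, -1)  # one row per sampled variant
    for i, d in enumerate(draws):
        audiofile.write(f'del_seane_{i}.wav', d, 16000)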