from collections import defaultdict from dataclasses import dataclass, field import logging import random import typing as tp import warnings from transformers import T5EncoderModel, T5Tokenizer # type: ignore import torch from torch import nn logger = logging.getLogger(__name__) TextCondition = tp.Optional[str] # a text condition can be a string or None (if doesn't exist) ConditionType = tp.Tuple[torch.Tensor, torch.Tensor] # condition, mask class JointEmbedCondition(tp.NamedTuple): wav: torch.Tensor text: tp.List[tp.Optional[str]] length: torch.Tensor sample_rate: tp.List[int] path: tp.List[tp.Optional[str]] = [] seek_time: tp.List[tp.Optional[float]] = [] @dataclass class ConditioningAttributes: text: tp.Dict[str, tp.Optional[str]] = field(default_factory=dict) wav: tp.Dict[str, tp.Optional[str]] = field(default_factory=dict) joint_embed: tp.Dict[str, JointEmbedCondition] = field(default_factory=dict) def __getitem__(self, item): return getattr(self, item) @property def text_attributes(self): return self.text.keys() @property def wav_attributes(self): return self.wav.keys() @property def joint_embed_attributes(self): return self.joint_embed.keys() @property def attributes(self): return { "text": self.text_attributes, "wav": self.wav_attributes, "joint_embed": self.joint_embed_attributes, } def to_flat_dict(self): return { **{f"text.{k}": v for k, v in self.text.items()}, **{f"wav.{k}": v for k, v in self.wav.items()}, **{f"joint_embed.{k}": v for k, v in self.joint_embed.items()} } @classmethod def from_flat_dict(cls, x): out = cls() for k, v in x.items(): kind, att = k.split(".") out[kind][att] = v return out class Tokenizer: """Base tokenizer implementation (in case we want to introduce more advances tokenizers in the future). """ def __call__(self, texts: tp.List[tp.Optional[str]]) -> tp.Tuple[torch.Tensor, torch.Tensor]: raise NotImplementedError() class T5Conditioner(nn.Module): MODELS = ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b", "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl"] MODELS_DIMS = { "t5-small": 512, "t5-base": 768, "t5-large": 1024, "t5-3b": 1024, "t5-11b": 1024, "google/flan-t5-small": 512, "google/flan-t5-base": 768, "google/flan-t5-large": 1024, "google/flan-t5-3b": 1024, "google/flan-t5-11b": 1024, } def __init__(self, name: str, output_dim: int, device: str, word_dropout: float = 0., normalize_text: bool = False, finetune=False): print(f'{finetune=}') assert name in self.MODELS, f"Unrecognized t5 model name (should in {self.MODELS})" super().__init__() self.dim = self.MODELS_DIMS[name] self.output_dim = output_dim self.output_proj = nn.Linear(self.dim, output_dim) self.device = device self.name = name self.word_dropout = word_dropout # Let's disable logging temporarily because T5 will vomit some errors otherwise. # thanks https://gist.github.com/simon-weber/7853144 previous_level = logging.root.manager.disable logging.disable(logging.ERROR) with warnings.catch_warnings(): warnings.simplefilter("ignore") try: self.t5_tokenizer = T5Tokenizer.from_pretrained(name) t5 = T5EncoderModel.from_pretrained(name).eval() #.train(mode=finetune) finally: logging.disable(previous_level) if finetune: self.t5 = t5 else: # this makes sure that the t5 models is not part # of the saved checkpoint self.__dict__['t5'] = t5.to(device) self.normalize_text = normalize_text if normalize_text: self.text_normalizer = WhiteSpaceTokenizer(1, lemma=True, stopwords=True) def tokenize(self, x: tp.List[tp.Optional[str]]) -> tp.Dict[str, torch.Tensor]: # if current sample doesn't have a certain attribute, replace with empty string entries: tp.List[str] = [xi if xi is not None else "" for xi in x] if self.normalize_text: _, _, entries = self.text_normalizer(entries, return_text=True) if self.word_dropout > 0. and self.training: new_entries = [] for entry in entries: words = [word for word in entry.split(" ") if random.random() >= self.word_dropout] new_entries.append(" ".join(words)) entries = new_entries empty_idx = torch.LongTensor([i for i, xi in enumerate(entries) if xi == ""]) inputs = self.t5_tokenizer(entries, return_tensors='pt', padding=True).to(self.device) mask = inputs['attention_mask'] mask[empty_idx, :] = 0 # zero-out index where the input is non-existant return inputs def forward(self, inputs): mask = inputs['attention_mask'] with torch.no_grad(): embeds = self.t5(**inputs).last_hidden_state embeds = self.output_proj(embeds.to(self.output_proj.weight)) embeds = (embeds * mask.unsqueeze(-1)) # T5 torch.Size([2, 4, 1536]) dict_keys(['input_ids', 'attention_mask']) print(f'{embeds.dtype=}') # inputs["input_ids"].shape=torch.Size([2, 4]) return embeds, mask class ConditioningProvider(nn.Module): def __init__(self, conditioners): super().__init__() self.conditioners = nn.ModuleDict(conditioners) @property def text_conditions(self): return [k for k, v in self.conditioners.items() if isinstance(v, T5Conditioner)] def tokenize(self, inputs: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.Any]: output = {} text = self._collate_text(inputs) # wavs = self._collate_wavs(inputs) # joint_embeds = self._collate_joint_embeds(inputs) # assert set(text.keys() | wavs.keys() | joint_embeds.keys()).issubset(set(self.conditioners.keys())), ( # f"Got an unexpected attribute! Expected {self.conditioners.keys()}, ", # f"got {text.keys(), wavs.keys(), joint_embeds.keys()}" # ) for attribute, batch in text.items(): #, joint_embeds.items()): output[attribute] = self.conditioners[attribute].tokenize(batch) print(f'COndProvToknz {output=}\n==') return output def forward(self, tokenized: tp.Dict[str, tp.Any]) -> tp.Dict[str, ConditionType]: """Compute pairs of `(embedding, mask)` using the configured conditioners and the tokenized representations. The output is for example: { "genre": (torch.Tensor([B, 1, D_genre]), torch.Tensor([B, 1])), "description": (torch.Tensor([B, T_desc, D_desc]), torch.Tensor([B, T_desc])), ... } Args: tokenized (dict): Dict of tokenized representations as returned by `tokenize()`. """ output = {} for attribute, inputs in tokenized.items(): condition, mask = self.conditioners[attribute](inputs) output[attribute] = (condition, mask) return output def _collate_text(self, samples): """Given a list of ConditioningAttributes objects, compile a dictionary where the keys are the attributes and the values are the aggregated input per attribute. For example: Input: [ ConditioningAttributes(text={"genre": "Rock", "description": "A rock song with a guitar solo"}, wav=...), ConditioningAttributes(text={"genre": "Hip-hop", "description": "A hip-hop verse"}, wav=...), ] Output: { "genre": ["Rock", "Hip-hop"], "description": ["A rock song with a guitar solo", "A hip-hop verse"] } Args: samples (list of ConditioningAttributes): List of ConditioningAttributes samples. Returns: dict[str, list[str, optional]]: A dictionary mapping an attribute name to text batch. """ out: tp.Dict[str, tp.List[tp.Optional[str]]] = defaultdict(list) texts = [x.text for x in samples] for text in texts: for condition in self.text_conditions: out[condition].append(text[condition]) return out