from typing import Callable, Dict, List, Union

from TTS.tts.utils.text import cleaners
from TTS.tts.utils.text.characters import Graphemes, IPAPhonemes
from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name
from TTS.tts.utils.text.phonemizers.multi_phonemizer import MultiPhonemizer
from TTS.utils.generic_utils import get_import_path, import_class
class TTSTokenizer:
    """🐸TTS tokenizer to convert input characters to token IDs and back.

    Token IDs for OOV chars are discarded but those are stored in `self.not_found_characters` for later.

    Args:
        use_phonemes (bool):
            Whether to use phonemes instead of characters. Defaults to False.

        characters (Characters):
            A Characters object to use for character-to-ID and ID-to-character mappings.

        text_cleaner (callable):
            A function to pre-process the text before tokenization and phonemization. Defaults to None.

        phonemizer (Phonemizer):
            A phonemizer object or a dict that maps language codes to phonemizer objects. Defaults to None.

    Example:

        >>> from TTS.tts.utils.text.tokenizer import TTSTokenizer
        >>> tokenizer = TTSTokenizer(use_phonemes=False, characters=Graphemes())
        >>> text = "Hello world!"
        >>> ids = tokenizer.text_to_ids(text)
        >>> text_hat = tokenizer.ids_to_text(ids)
        >>> assert text == text_hat
    """

    def __init__(
        self,
        use_phonemes=False,
        text_cleaner: Callable = None,
        characters: "BaseCharacters" = None,
        phonemizer: Union["Phonemizer", Dict] = None,
        add_blank: bool = False,
        use_eos_bos=False,
    ):
        self.text_cleaner = text_cleaner
        self.use_phonemes = use_phonemes
        self.add_blank = add_blank
        self.use_eos_bos = use_eos_bos
        # Goes through the property setter below, which also computes
        # `pad_id` and `blank_id` from the character set.
        self.characters = characters
        self.not_found_characters = []
        self.phonemizer = phonemizer

    @property
    def characters(self):
        return self._characters

    @characters.setter
    def characters(self, new_characters):
        # Keep `pad_id`/`blank_id` in sync whenever the character set changes.
        self._characters = new_characters
        self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None
        self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None

    def encode(self, text: str) -> List[int]:
        """Encodes a string of text as a sequence of IDs."""
        token_ids = []
        for char in text:
            try:
                idx = self.characters.char_to_id(char)
                token_ids.append(idx)
            except KeyError:
                # discard but store not found characters
                if char not in self.not_found_characters:
                    self.not_found_characters.append(char)
                    print(text)
                    print(f" [!] Character {repr(char)} not found in the vocabulary. Discarding it.")
        return token_ids

    def decode(self, token_ids: List[int]) -> str:
        """Decodes a sequence of IDs to a string of text."""
        text = ""
        for token_id in token_ids:
            text += self.characters.id_to_char(token_id)
        return text

    def text_to_ids(self, text: str, language: str = None) -> List[int]:  # pylint: disable=unused-argument
        """Converts a string of text to a sequence of token IDs.

        Args:
            text(str):
                The text to convert to token IDs.

            language(str):
                The language code of the text. Defaults to None.

        TODO:
            - Add support for language-specific processing.

        1. Text normalization
        2. Phonemization (if use_phonemes is True)
        3. Add blank char between characters
        4. Add BOS and EOS characters
        5. Text to token IDs
        """
        # TODO: text cleaner should pick the right routine based on the language
        if self.text_cleaner is not None:
            text = self.text_cleaner(text)
        if self.use_phonemes:
            text = self.phonemizer.phonemize(text, separator="", language=language)
        text = self.encode(text)
        if self.add_blank:
            text = self.intersperse_blank_char(text, True)
        if self.use_eos_bos:
            text = self.pad_with_bos_eos(text)
        return text

    def ids_to_text(self, id_sequence: List[int]) -> str:
        """Converts a sequence of token IDs to a string of text."""
        return self.decode(id_sequence)

    def pad_with_bos_eos(self, char_sequence: List[str]):
        """Pads a sequence with the special BOS and EOS characters."""
        return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id]

    def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False):
        """Intersperses the blank character between characters in a sequence.

        Use the ```blank``` character if defined else use the ```pad``` character.
        """
        char_to_use = self.characters.blank_id if use_blank_char else self.characters.pad
        # Pre-fill with the blank token, then drop the original IDs into the
        # odd slots: [blank, c0, blank, c1, ..., blank].
        result = [char_to_use] * (len(char_sequence) * 2 + 1)
        result[1::2] = char_sequence
        return result

    def print_logs(self, level: int = 0):
        """Prints the tokenizer configuration (and any OOV chars seen so far) for debugging."""
        indent = "\t" * level
        print(f"{indent}| > add_blank: {self.add_blank}")
        print(f"{indent}| > use_eos_bos: {self.use_eos_bos}")
        print(f"{indent}| > use_phonemes: {self.use_phonemes}")
        if self.use_phonemes:
            print(f"{indent}| > phonemizer:")
            self.phonemizer.print_logs(level + 1)
        if len(self.not_found_characters) > 0:
            print(f"{indent}| > {len(self.not_found_characters)} not found characters:")
            for char in self.not_found_characters:
                print(f"{indent}| > {char}")

    @staticmethod
    def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
        """Init Tokenizer object from config

        Args:
            config (Coqpit): Coqpit model config.
            characters (BaseCharacters): Defines the model character set. If not set, use the default options based on
                the config values. Defaults to None.
        """
        # init cleaners
        text_cleaner = None
        if isinstance(config.text_cleaner, (str, list)):
            text_cleaner = getattr(cleaners, config.text_cleaner)

        # init characters
        if characters is None:
            # set characters based on defined characters class
            if config.characters and config.characters.characters_class:
                CharactersClass = import_class(config.characters.characters_class)
                characters, new_config = CharactersClass.init_from_config(config)
            # set characters based on config
            else:
                if config.use_phonemes:
                    # init phoneme set
                    characters, new_config = IPAPhonemes().init_from_config(config)
                else:
                    # init character set
                    characters, new_config = Graphemes().init_from_config(config)
        else:
            characters, new_config = characters.init_from_config(config)

        # set characters class
        new_config.characters.characters_class = get_import_path(characters)

        # init phonemizer
        phonemizer = None
        if config.use_phonemes:
            if "phonemizer" in config and config.phonemizer == "multi_phonemizer":
                lang_to_phonemizer_name = {}
                for dataset in config.datasets:
                    if dataset.language != "":
                        lang_to_phonemizer_name[dataset.language] = dataset.phonemizer
                    else:
                        raise ValueError("Multi phonemizer requires language to be set for each dataset.")
                phonemizer = MultiPhonemizer(lang_to_phonemizer_name)
            else:
                phonemizer_kwargs = {"language": config.phoneme_language}
                if "phonemizer" in config and config.phonemizer:
                    phonemizer = get_phonemizer_by_name(config.phonemizer, **phonemizer_kwargs)
                else:
                    try:
                        phonemizer = get_phonemizer_by_name(
                            DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs
                        )
                        new_config.phonemizer = phonemizer.name()
                    except KeyError as e:
                        raise ValueError(
                            f"""No phonemizer found for language {config.phoneme_language}.
                            You may need to install a third party library for this language."""
                        ) from e

        return (
            TTSTokenizer(
                config.use_phonemes, text_cleaner, characters, phonemizer, config.add_blank, config.enable_eos_bos_chars
            ),
            new_config,
        )