"""Tokenization class for VITS.""" |
|
|
|
import json |
|
import os |
|
import re |
|
from typing import Any, Dict, List, Optional, Tuple, Union |
|
|
|
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.utils import get_file_from_repo, is_phonemizer_available, logging
|
|
|
|
|
if is_phonemizer_available(): |
|
import phonemizer |
|
|
|
|
|
logger = logging.get_logger(__name__) |
|
|
|
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
}
|
|
|
def is_symbol(ch):
    """Return `True` if `ch` is an ASCII punctuation character."""
    return ch in "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
|
|
|
class BertVits2Tokenizer(PreTrainedTokenizer): |
|
""" |
|
    Construct a Bert-VITS2 tokenizer, which converts input text into phoneme, tone, and language-id sequences.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer
    to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The padding token. It is also used as the blank token interspersed between other tokens when
            `add_blank` is `True`.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. Characters and phonemes that are not in the vocabulary are mapped to this token.
        space_token (`str`, *optional*):
            The token used to represent a space character.
        languages (`List[str]`, *optional*):
            The list of languages supported by the model. The index of a language in this list is used as its
            language id.
        add_blank (`bool`, *optional*, defaults to `True`):
            Whether to insert a blank (pad) token between the other tokens.
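
    Example (an illustrative sketch of the intended usage; the checkpoint path below is a placeholder, not a real
    repository):

    ```python
    >>> # Load a converted Bert-VITS2 checkpoint (placeholder path) with the tokenizer class defined in this file.
    >>> tokenizer = BertVits2Tokenizer.from_pretrained("path/to/bert-vits2-checkpoint")

    >>> # Convert Chinese text into a phoneme string plus tone and language-id sequences, then encode the phonemes.
    >>> phones, tones, lang_ids, word2ph = tokenizer.convert_g2p("你好", language="zh", add_special_tokens=True)
    >>> inputs = tokenizer(phones)
    ```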
|
""" |
|
|
|
vocab_files_names = VOCAB_FILES_NAMES |
|
    model_input_names = ["input_ids", "attention_mask"]
|
|
|
def __init__( |
|
self, |
|
vocab_file, |
|
pad_token="<pad>", |
|
unk_token="<unk>", |
|
space_token=None, |
|
languages=None, |
|
add_blank=True, |
|
**kwargs, |
|
) -> None: |
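        # Load the vocabulary before calling the parent constructor, since the base class may already need it
        # (for example to resolve the ids of the special tokens passed above).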
|
with open(vocab_file, encoding="utf-8") as vocab_handle: |
|
self.encoder = json.load(vocab_handle) |
|
|
|
self.decoder = {v: k for k, v in self.encoder.items()} |
|
self.languages = languages |
|
self.add_blank = add_blank |
|
|
|
super().__init__( |
|
pad_token=pad_token, |
|
unk_token=unk_token, |
|
space_token=space_token, |
|
languages=languages, |
|
add_blank=add_blank, |
|
**kwargs, |
|
) |
|
|
|
@property |
|
def vocab_size(self): |
|
return len(self.encoder) |
|
|
|
def get_vocab(self): |
|
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} |
|
vocab.update(self.added_tokens_encoder) |
|
return vocab |
|
|
|
def zh_g2p(self, text: str) -> Tuple[str, List[int], List[int]]: |
|
"""Converts a string of Chinese text into a list of phonemes and tones.""" |
|
from pypinyin import lazy_pinyin, Style |
|
|
|
g2p_file = get_file_from_repo(self.name_or_path, "zh_g2p.json", subfolder="data") |
|
|
|
with open(g2p_file, encoding="utf-8") as f: |
|
g2p = json.load(f) |
|
|
|
phones = [] |
|
tones = [] |
|
word2ph = [] |
|
|
|
initials = lazy_pinyin(text, neutral_tone_with_five=True, style=Style.INITIALS, tone_sandhi=True) |
|
finals = lazy_pinyin(text, neutral_tone_with_five=True, style=Style.FINALS_TONE3, tone_sandhi=True) |
|
|
|
for initial, final in zip(initials, finals): |
|
tone = 0 |
|
if final[-1].isdigit(): |
|
pinyin = initial + final[:-1] |
|
tone = int(final[-1]) |
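                # Normalize the pinyin spelling: syllables with an initial use the contracted finals
                # (uei -> ui, iou -> iu, uen -> un), while bare finals are rewritten with their full
                # y-/w- forms (e.g. i -> yi, u -> wu) so they match the entries of the g2p table.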
|
if initial: |
|
pinyin = re.sub(r"uei$", "ui", pinyin) |
|
pinyin = re.sub(r"iou$", "iu", pinyin) |
|
pinyin = re.sub(r"uen$", "un", pinyin) |
|
else: |
|
pinyin = re.sub(r"^ing$", "ying", pinyin) |
|
pinyin = re.sub(r"^i$", "yi", pinyin) |
|
pinyin = re.sub(r"^in$", "yin", pinyin) |
|
pinyin = re.sub(r"^u$", "wu", pinyin) |
|
pinyin = re.sub(r"^v", "yu", pinyin) |
|
pinyin = re.sub(r"^e", "e", pinyin) |
|
pinyin = re.sub(r"^i", "y", pinyin) |
|
pinyin = re.sub(r"^u", "w", pinyin) |
|
else: |
|
pinyin = initial + final |
|
                # Punctuation and other non-pinyin characters come back from pypinyin unchanged, so the
                # "initial" and "final" are identical; keep the raw character as a single toneless phone.
                if initial == final:
|
tone = 0 |
|
phone = [initial] |
|
else: |
|
phone = g2p.get(pinyin, [self.unk_token]) |
|
if phone[0] == self.unk_token: |
|
tone = 0 |
|
phone = [self.unk_token] |
|
tones += [tone] * len(phone) |
|
phones += phone |
|
if initial != 'SP': |
|
word2ph.append(len(phone)) |
|
else: |
|
word2ph[-1] += 1 |
|
|
|
phones = "<|SEP|>".join(phones) |
|
return phones, tones, word2ph |
|
|
|
|
|
    def convert_g2p(
        self, text: str, language: str, add_special_tokens: bool
    ) -> Tuple[str, List[int], List[int], List[int]]:
        """Converts text into a phoneme string plus per-phoneme tone, language-id, and word-to-phoneme sequences."""
|
if not is_phonemizer_available(): |
|
raise ImportError("Phonemizer is not available. Please install it using `pip install phonemizer`.") |
|
|
|
if language.startswith("zh"): |
|
phones, tones, word2ph = self.zh_g2p(text) |
|
else: |
|
raise ValueError(f"Language '{language}' not supported by VITS.") |
|
|
|
lang_ids = [self.languages.index(language)] * len(tones) |
|
|
|
if self.add_blank: |
|
tones = self._add_blank(tones, 0) |
|
lang_ids = self._add_blank(lang_ids, 0) |
|
|
|
            # Each phone now occupies two positions (itself plus the interspersed blank), so double the
            # per-word phone counts; the extra leading blank is attributed to the first word.
            for i in range(len(word2ph)):
                word2ph[i] = word2ph[i] * 2
            word2ph[0] += 1
|
|
|
if add_special_tokens: |
|
            # The special tokens added at the start and end of the sequence are mapped to zero phonemes.
            word2ph = [0] + word2ph + [0]
|
|
|
return phones, tones, lang_ids, word2ph |
|
|
|
def _add_blank(self, sequence: List[Union[str, int]], blank: Union[str, int]) -> List[Union[str, int]]: |
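        """
        Intersperse `blank` between the elements of `sequence`, e.g. `["a", "b"]` becomes
        `[blank, "a", blank, "b", blank]`.
        """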
|
interspersed = [blank] * (len(sequence) * 2 + 1) |
|
interspersed[1::2] = sequence |
|
return interspersed |
|
|
|
def _tokenize(self, text: str) -> List[str]: |
|
"""Tokenize a string by inserting the `<pad>` token at the boundary between adjacent characters.""" |
|
tokens = [] |
|
|
|
        if "<|SEP|>" in text:
            tokens = text.split("<|SEP|>")
|
else: |
|
i = 0 |
|
while i < len(text): |
|
found = False |
|
                # Greedily match the longest vocabulary entry (at most two characters) starting at position `i`.
                for j in range(min(len(text), i + 2), i, -1):
|
subtext = text[i:j] |
|
if subtext in self.encoder: |
|
tokens.append(subtext) |
|
i = j |
|
found = True |
|
break |
|
if not found: |
|
tokens.append(self.unk_token) |
|
i += 1 |
|
|
|
if self.add_blank: |
|
tokens = self._add_blank(tokens, self.pad_token) |
|
|
|
return tokens |
|
|
|
def convert_tokens_to_string(self, tokens: List[str]) -> str: |
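        """Join tokens back into a string, dropping the interspersed blank tokens when `add_blank` is set."""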
|
if self.add_blank and len(tokens) > 1: |
|
tokens = tokens[1::2] |
|
return "".join(tokens) |
|
|
|
def _convert_token_to_id(self, token): |
|
"""Converts a token (str) in an id using the vocab.""" |
|
return self.encoder.get(token, self.encoder.get(self.unk_token)) |
|
|
|
def _convert_id_to_token(self, index): |
|
"""Converts an index (integer) in a token (str) using the vocab.""" |
|
return self.decoder.get(index) |
|
|
|
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Union[Tuple[str], None]: |
|
if not os.path.isdir(save_directory): |
|
logger.error(f"Vocabulary path ({save_directory}) should be a directory") |
|
return |
|
|
|
vocab_file = os.path.join( |
|
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] |
|
) |
|
|
|
with open(vocab_file, "w", encoding="utf-8") as f: |
|
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") |
|
|
|
return (vocab_file,) |
|
|