# Voice BPE tokenizer for Tortoise TTS.
import os
import torch
from tokenizers import Tokenizer
from TTS.tts.utils.text.cleaners import english_cleaners
# Path to the bundled Tortoise tokenizer vocabulary (JSON for the HuggingFace
# `tokenizers` library), resolved relative to this file's real (symlink-free)
# location so it works regardless of the current working directory.
DEFAULT_VOCAB_FILE = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "../../utils/assets/tortoise/tokenizer.json"
)
class VoiceBpeTokenizer:
    """BPE tokenizer for Tortoise TTS, backed by a HuggingFace
    ``tokenizers.Tokenizer`` vocabulary.

    Text is normalized with ``english_cleaners`` and spaces are rewritten to
    the literal ``[SPACE]`` token before encoding; :meth:`decode` reverses
    that mapping.
    """

    def __init__(self, vocab_file=DEFAULT_VOCAB_FILE, vocab_str=None):
        """Build the tokenizer from a vocabulary JSON file or JSON string.

        If both arguments are supplied, ``vocab_str`` wins because it is
        applied last. With both ``None``, ``self.tokenizer`` stays ``None``.
        """
        self.tokenizer = None
        if vocab_file is not None:
            self.tokenizer = Tokenizer.from_file(vocab_file)
        if vocab_str is not None:
            self.tokenizer = Tokenizer.from_str(vocab_str)

    def preprocess_text(self, txt):
        """Run the English cleaning pipeline over raw input text."""
        return english_cleaners(txt)

    def encode(self, txt):
        """Clean ``txt``, mark spaces as ``[SPACE]``, and return token ids."""
        cleaned = self.preprocess_text(txt).replace(" ", "[SPACE]")
        return self.tokenizer.encode(cleaned).ids

    def decode(self, seq):
        """Turn a sequence of token ids (list or ``torch.Tensor``) back into
        text, restoring real spaces from ``[SPACE]`` and stripping the
        ``[STOP]`` and ``[UNK]`` markers."""
        if isinstance(seq, torch.Tensor):
            seq = seq.cpu().numpy()
        decoded = self.tokenizer.decode(seq, skip_special_tokens=False)
        # Any literal spaces the BPE decoder emits between tokens are
        # artifacts; genuine spaces only come back via "[SPACE]" tokens.
        return (
            decoded.replace(" ", "")
            .replace("[SPACE]", " ")
            .replace("[STOP]", "")
            .replace("[UNK]", "")
        )