# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from diffrhythm.g2p.g2p import cleaners
from tokenizers import Tokenizer
from diffrhythm.g2p.g2p.text_tokenizers import TextTokenizer
import LangSegment
import json
import re
class PhonemeBpeTokenizer:
    """Convert text to a phoneme string and map phonemes to vocab token ids.

    Grapheme-to-phoneme conversion is delegated to per-language
    ``TextTokenizer`` backends; the phoneme -> id mapping comes from a JSON
    vocabulary file (its ``"vocab"`` entry).
    """

    def __init__(self, vacab_path="./diffrhythm/g2p/g2p/vocab.json"):
        # NOTE(review): parameter name "vacab_path" (sic) is kept as-is for
        # backward compatibility with existing callers.
        # ISO language code -> espeak-style backend identifier.
        self.lang2backend = {
            "zh": "cmn",
            "ja": "ja",
            "en": "en-us",
            "fr": "fr-fr",
            "ko": "ko",
            "de": "de",
        }
        self.text_tokenizers = {}
        self.int_text_tokenizers()
        # Explicit encoding: the vocab contains phoneme symbols that are not
        # guaranteed ASCII, so do not rely on the platform default encoding.
        with open(vacab_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        self.vocab = data["vocab"]
        LangSegment.setfilters(["en", "zh", "ja", "ko", "fr", "de"])

    def int_text_tokenizers(self):
        """Instantiate one ``TextTokenizer`` per supported language.

        (Method name "int_..." — presumably "init" — kept for compatibility.)
        """
        for lang, backend in self.lang2backend.items():
            self.text_tokenizers[lang] = TextTokenizer(language=backend)

    def tokenize(self, text, sentence, language):
        """Return ``(phonemes, phoneme_tokens)`` for *text*.

        When *language* is ``"auto"``, the text is first segmented by language
        and the per-segment phoneme strings are joined with the ``"|_|"``
        separator; otherwise the whole text is cleaned as one segment.
        """
        # 1. convert text to a phoneme string
        if language == "auto":
            seglist = LangSegment.getTexts(text)
            phonemes = "|_|".join(
                self._clean_text(
                    seg["text"], sentence, seg["lang"], ["cjekfd_cleaners"]
                )
                for seg in seglist
            )
        else:
            phonemes = self._clean_text(text, sentence, language, ["cjekfd_cleaners"])

        # 2. map the phoneme string to vocab token ids
        phoneme_tokens = self.phoneme2token(phonemes)
        return phonemes, phoneme_tokens

    def _clean_text(self, text, sentence, language, cleaner_names):
        """Apply each named cleaner from the ``cleaners`` module in order.

        Raises ``Exception`` for an unknown cleaner name.  The original code
        used ``getattr(cleaners, name)`` with no default, which raised
        ``AttributeError`` before the explicit check could run; the ``None``
        default makes the intended error path reachable.
        """
        for name in cleaner_names:
            cleaner = getattr(cleaners, name, None)
            if not cleaner:
                raise Exception("Unknown cleaner: %s" % name)
            text = cleaner(text, sentence, language, self.text_tokenizers)
        return text

    def phoneme2token(self, phonemes):
        """Map a phoneme string (or list of strings) to lists of token ids.

        Each string may carry a trailing tab-separated annotation, which is
        dropped; phonemes within a string are separated by ``"|"``.  Phonemes
        absent from the vocab are silently skipped (as in the original).
        """
        if isinstance(phonemes, list):
            tokens = []
            for phone in phonemes:
                phone = phone.split("\t")[0]
                parts = phone.split("|")
                tokens.append([self.vocab[p] for p in parts if p in self.vocab])
            return tokens
        phonemes = phonemes.split("\t")[0]
        parts = phonemes.split("|")
        return [self.vocab[p] for p in parts if p in self.vocab]