Spaces:
Running
on
Zero
Running
on
Zero
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import re | |
import os | |
from typing import List, Pattern, Union | |
from phonemizer.utils import list2str, str2list | |
from phonemizer.backend import EspeakBackend | |
from phonemizer.backend.espeak.language_switch import LanguageSwitch | |
from phonemizer.backend.espeak.words_mismatch import WordMismatch | |
from phonemizer.punctuation import Punctuation | |
from phonemizer.separator import Separator | |
class TextTokenizer:
    """Normalize text and phonemize it with phonemizer's ``EspeakBackend``.

    Calling an instance normalizes punctuation and whitespace, runs the
    espeak backend, and then surrounds every preserved punctuation mark with
    ``|`` so that each mark is delimited like a phone in the output.
    """

    # Full-width CJK punctuation -> ASCII equivalents, applied in one pass.
    # The full-width code points are written as escapes on purpose: if the
    # source file is ever passed through Unicode compatibility normalization,
    # literal full-width characters collapse to their ASCII look-alikes and
    # the replacements silently turn into no-ops.
    _CJK_PUNCTUATION = str.maketrans(
        {
            "\uff0c": ",",  # FULLWIDTH COMMA
            "\u3002": ".",  # IDEOGRAPHIC FULL STOP
            "\uff01": "!",  # FULLWIDTH EXCLAMATION MARK
            "\uff1f": "?",  # FULLWIDTH QUESTION MARK
            "\uff1b": ";",  # FULLWIDTH SEMICOLON
            "\uff1a": ":",  # FULLWIDTH COLON
            "\u3001": ",",  # IDEOGRAPHIC COMMA
            "‘": "'",
            "’": "'",
            "⋯": "…",
        }
    )

    # Multi-character ellipsis spellings that are folded into a single "…".
    # "..." is handled after the single-character pass so that a run of
    # ideographic full stops (already mapped to ".") is folded as well.
    _ELLIPSIS_VARIANTS = ("···", "・・・", "...")

    # Patterns used by __call__, compiled once instead of on every line.
    _UNSUPPORTED_CHARS = re.compile(r"[^\w\s_,\.\?!;:\'…]")
    _SPACE_AROUND_MARK = re.compile(r"\s*([,\.\?!;:\'…])\s*")
    _WHITESPACE_RUN = re.compile(r"\s+")
    _PUNCTUATION_MARK = re.compile(r"([,\.\?!;:\'…])")
    _PIPE_RUN = re.compile(r"\|+")

    def __init__(
        self,
        language="en-us",
        backend="espeak",
        separator=Separator(word="|_|", syllable="-", phone="|"),
        preserve_punctuation=True,
        with_stress: bool = False,
        tie: Union[bool, str] = False,
        language_switch: LanguageSwitch = "remove-flags",
        words_mismatch: WordMismatch = "ignore",
    ) -> None:
        """Create the espeak backend used for phonemization.

        Args:
            language: Language code passed to ``EspeakBackend``.
            backend: Accepted for interface compatibility only; it is not
                read, an ``EspeakBackend`` is always created.
            separator: Separator handed to ``backend.phonemize`` on each call.
            preserve_punctuation: Forwarded to ``EspeakBackend``.
            with_stress: Forwarded to ``EspeakBackend``.
            tie: Forwarded to ``EspeakBackend``.
            language_switch: Forwarded to ``EspeakBackend``.
            words_mismatch: Forwarded to ``EspeakBackend``.
        """
        # The same set of marks is kept by the normalization regexes in
        # __call__, so the backend and the pre/post-processing agree.
        self.preserve_punctuation_marks = ",.?!;:'…"
        self.backend = EspeakBackend(
            language,
            punctuation_marks=self.preserve_punctuation_marks,
            preserve_punctuation=preserve_punctuation,
            with_stress=with_stress,
            tie=tie,
            language_switch=language_switch,
            words_mismatch=words_mismatch,
        )
        self.separator = separator

    def convert_chinese_punctuation(self, text: str) -> str:
        """Return ``text`` with Chinese punctuation mapped to the preserved set.

        Full-width and ideographic marks become their ASCII counterparts,
        curly single quotes become ``'``, and the ellipsis spellings
        (``⋯``, three middle dots, three ASCII dots) become ``…``.
        """
        text = text.translate(self._CJK_PUNCTUATION)
        for variant in self._ELLIPSIS_VARIANTS:
            text = text.replace(variant, "…")
        return text

    def _normalize_line(self, line: str) -> str:
        """Clean one input line before it is handed to the backend."""
        line = self.convert_chinese_punctuation(line.strip())
        # Drop everything that is not a word character, whitespace, or one of
        # the preserved punctuation marks.
        line = self._UNSUPPORTED_CHARS.sub("", line)
        # Remove whitespace on either side of a punctuation mark.
        line = self._SPACE_AROUND_MARK.sub(r"\1", line)
        # Collapse remaining whitespace runs to a single space.
        return self._WHITESPACE_RUN.sub(" ", line)

    def _isolate_punctuation(self, phonemized: str) -> str:
        """Delimit each punctuation mark with ``|`` in phonemized output."""
        phonemized = self._PUNCTUATION_MARK.sub(r"|\1|", phonemized)
        # Wrapping can produce doubled delimiters next to existing ones.
        phonemized = self._PIPE_RUN.sub("|", phonemized)
        return phonemized.rstrip("|")

    def __call__(self, text, strip=True) -> Union[str, List[str]]:
        """Phonemize ``text``.

        Args:
            text: A string or a list of strings. Input is converted to a list
                of lines with phonemizer's ``str2list`` before processing.
            strip: Forwarded to ``backend.phonemize``.

        Returns:
            A single string when ``text`` is a string (the backend output
            joined with phonemizer's ``list2str``), otherwise a list with one
            phonemized string per input item.
        """
        normalized_text = [self._normalize_line(line) for line in str2list(text)]
        phonemized = self.backend.phonemize(
            normalized_text, separator=self.separator, strip=strip, njobs=1
        )
        if isinstance(text, str):
            return self._isolate_punctuation(list2str(phonemized))
        return [self._isolate_punctuation(item) for item in phonemized]