# Source: conex / espnet2/text/phoneme_tokenizer.py (initial commit ad16788 by tobiasc, 6.65 kB)
from pathlib import Path
from typing import Iterable
from typing import List
from typing import Optional
from typing import Union
import g2p_en
from typeguard import check_argument_types
from espnet2.text.abs_tokenizer import AbsTokenizer
def split_by_space(text) -> List[str]:
    """Split ``text`` on single space characters.

    The separator is passed explicitly, so consecutive spaces produce
    empty-string tokens and an empty input yields ``[""]``.

    Args:
        text: The string to split.

    Returns:
        The list of pieces between space characters.
    """
    separator = " "
    pieces = text.split(separator)
    return pieces
def pyopenjtalk_g2p(text) -> List[str]:
    """Convert ``text`` into a list of phonemes using ``pyopenjtalk.g2p``.

    ``pyopenjtalk`` is imported inside the function, so the package is only
    required when this converter is actually called.

    Args:
        text: Input text handed to ``pyopenjtalk.g2p``.

    Returns:
        The g2p output split on single spaces.
    """
    import pyopenjtalk

    # With kana=False the result is handled as one space-separated string.
    return pyopenjtalk.g2p(text, kana=False).split(" ")
def pyopenjtalk_g2p_accent(text) -> List[str]:
    """Convert ``text`` into phonemes interleaved with two accent fields.

    Each label in ``pyopenjtalk.run_frontend(text)[1]`` is scanned with a
    regular expression that captures three pieces:

    1. the text between a ``-`` and the following ``+``,
    2. the run of digits/hyphens right after ``/A:``,
    3. one digit after the first ``_`` that follows ``/F:``.

    Labels that do not yield exactly one match are skipped. For the others the
    pieces are appended in the order (1), (3), (2).

    Args:
        text: Input text handed to ``pyopenjtalk.run_frontend``.

    Returns:
        Flat list of captured strings, three per accepted label.
    """
    import re

    import pyopenjtalk

    # NOTE(review): the last group captures a single digit only; if that
    # field can hold multi-digit values they are truncated -- confirm.
    pattern = re.compile(r"\-(.*?)\+.*?\/A:([0-9\-]+).*?\/F:.*?_([0-9])")

    phones = []
    for labels in pyopenjtalk.run_frontend(text)[1]:
        matches = pattern.findall(labels)
        if len(matches) != 1:
            continue
        phoneme, a_field, f_field = matches[0]
        phones.extend([phoneme, f_field, a_field])
    return phones
def pyopenjtalk_g2p_accent_with_pause(text) -> List[str]:
    """Convert ``text`` into phonemes with accent fields, keeping pauses.

    Works on the labels in ``pyopenjtalk.run_frontend(text)[1]``. A label
    whose segment between the first ``-`` and the next ``+`` equals ``"pau"``
    contributes the single token ``"pau"``. Every other label is scanned with
    a regular expression capturing:

    1. the text between a ``-`` and the following ``+``,
    2. the run of digits/hyphens right after ``/A:``,
    3. one digit after the first ``_`` that follows ``/F:``.

    Labels with exactly one match contribute the pieces in the order
    (1), (3), (2); other labels are skipped.

    Args:
        text: Input text handed to ``pyopenjtalk.run_frontend``.

    Returns:
        Flat list of ``"pau"`` tokens and captured strings.
    """
    import re

    import pyopenjtalk

    # NOTE(review): the last group captures a single digit only; if that
    # field can hold multi-digit values they are truncated -- confirm.
    pattern = re.compile(r"\-(.*?)\+.*?\/A:([0-9\-]+).*?\/F:.*?_([0-9])")

    phones = []
    for labels in pyopenjtalk.run_frontend(text)[1]:
        current = labels.split("-")[1].split("+")[0]
        if current == "pau":
            phones.append("pau")
            continue
        matches = pattern.findall(labels)
        if len(matches) != 1:
            continue
        phoneme, a_field, f_field = matches[0]
        phones.extend([phoneme, f_field, a_field])
    return phones
def pyopenjtalk_g2p_kana(text) -> List[str]:
    """Convert ``text`` with ``pyopenjtalk.g2p(kana=True)`` and split it.

    Args:
        text: Input text handed to ``pyopenjtalk.g2p``.

    Returns:
        The elements obtained by iterating over the g2p result (one entry per
        character when the result is a string).
    """
    import pyopenjtalk

    return list(pyopenjtalk.g2p(text, kana=True))
def pypinyin_g2p(text) -> List[str]:
    """Convert ``text`` to pinyin tokens using ``pypinyin`` (``Style.TONE3``).

    ``pypinyin.pinyin`` yields one sequence per item; only the first element
    of each sequence is kept.

    Args:
        text: Input text handed to ``pypinyin.pinyin``.

    Returns:
        List with the first element of every item returned by ``pinyin``.
    """
    from pypinyin import Style
    from pypinyin import pinyin

    phones = []
    for candidates in pinyin(text, style=Style.TONE3):
        phones.append(candidates[0])
    return phones
def pypinyin_g2p_phone(text) -> List[str]:
    """Convert ``text`` to pinyin and split each syllable into two parts.

    For the first element of every item returned by ``pypinyin.pinyin``
    (``Style.TONE3``), ``get_initials`` and ``get_finals`` are called with
    ``strict=True``. Their results are emitted in that order, skipping any
    result of length zero.

    Args:
        text: Input text handed to ``pypinyin.pinyin``.

    Returns:
        Flat list of the non-empty initial/final strings.
    """
    from pypinyin import Style
    from pypinyin import pinyin
    from pypinyin.style._utils import get_finals
    from pypinyin.style._utils import get_initials

    phones = []
    for candidates in pinyin(text, style=Style.TONE3):
        syllable = candidates[0]
        initial = get_initials(syllable, strict=True)
        final = get_finals(syllable, strict=True)
        for part in (initial, final):
            if len(part) != 0:
                phones.append(part)
    return phones
class G2p_en:
    """Wrapper used in place of ``g2p_en.G2p``.

    ``g2p_en.G2p`` isn't picklable, so it can't be copied to other processes
    via the multiprocessing module. As a workaround, ``g2p_en.G2p`` is only
    instantiated the first time this object is called.

    Args:
        no_space: If true, the ``" "`` entries (word separators) are removed
            from the converter output.
    """

    def __init__(self, no_space: bool = False):
        # The real converter is created lazily in __call__.
        self.g2p = None
        self.no_space = no_space

    def __call__(self, text) -> List[str]:
        """Run the wrapped converter on ``text`` and return its phones."""
        if self.g2p is None:
            self.g2p = g2p_en.G2p()
        phones = self.g2p(text)
        if not self.no_space:
            return phones
        # Drop the spaces that represent word separators.
        return [phone for phone in phones if phone != " "]
class Phonemizer:
    """Phonemizer module for various languages.

    This is a wrapper around https://github.com/bootphon/phonemizer.
    Different g2p modules can be defined by passing options for phonemizer.

    See available options:
        https://github.com/bootphon/phonemizer/blob/master/phonemizer/phonemize.py#L32

    Args:
        word_separator: Passed to ``Separator`` as ``word``.
        syllable_separator: Passed to ``Separator`` as ``syllable``.
        **phonemize_kwargs: Forwarded unchanged to ``phonemize`` on each call.
    """

    def __init__(
        self,
        word_separator: Optional[str] = None,
        syllable_separator: Optional[str] = None,
        **phonemize_kwargs,
    ):
        # delayed import
        from phonemizer import phonemize
        from phonemizer.separator import Separator

        self.phonemize_kwargs = phonemize_kwargs
        # Phones are always separated by a single space so that the output
        # can be split on whitespace in __call__.
        self.separator = Separator(
            phone=" ", word=word_separator, syllable=syllable_separator
        )
        self.phonemize = phonemize

    def __call__(self, text) -> List[str]:
        """Phonemize ``text`` and return the whitespace-separated pieces."""
        phonemized = self.phonemize(
            text, separator=self.separator, **self.phonemize_kwargs
        )
        return phonemized.split()
class PhonemeTokenizer(AbsTokenizer):
    """Tokenizer that turns text into phoneme tokens via a selectable g2p.

    Args:
        g2p_type: Name of the grapheme-to-phoneme backend. ``None`` selects
            plain splitting on spaces; any unsupported name raises
            ``NotImplementedError``.
        non_linguistic_symbols: ``None`` for no symbols, a ``Path``/``str``
            naming a UTF-8 file with one symbol per line, or an iterable of
            symbol strings.
        space_symbol: Stored on the instance and shown in ``repr``.
        remove_non_linguistic_symbols: If true, ``text2tokens`` strips the
            non-linguistic symbols from the text before running g2p.

    Raises:
        NotImplementedError: If ``g2p_type`` is not one of the supported
            names.
    """

    def __init__(
        self,
        g2p_type: Union[None, str],
        # Optional[...] is spelled out because the default is None and the
        # runtime argument check should not depend on implicit-Optional
        # handling of ``= None`` defaults.
        non_linguistic_symbols: Optional[Union[Path, str, Iterable[str]]] = None,
        space_symbol: str = "<space>",
        remove_non_linguistic_symbols: bool = False,
    ):
        assert check_argument_types()
        if g2p_type is None:
            self.g2p = split_by_space
        elif g2p_type == "g2p_en":
            self.g2p = G2p_en(no_space=False)
        elif g2p_type == "g2p_en_no_space":
            self.g2p = G2p_en(no_space=True)
        elif g2p_type == "pyopenjtalk":
            self.g2p = pyopenjtalk_g2p
        elif g2p_type == "pyopenjtalk_kana":
            self.g2p = pyopenjtalk_g2p_kana
        elif g2p_type == "pyopenjtalk_accent":
            self.g2p = pyopenjtalk_g2p_accent
        elif g2p_type == "pyopenjtalk_accent_with_pause":
            self.g2p = pyopenjtalk_g2p_accent_with_pause
        elif g2p_type == "pypinyin_g2p":
            self.g2p = pypinyin_g2p
        elif g2p_type == "pypinyin_g2p_phone":
            self.g2p = pypinyin_g2p_phone
        elif g2p_type == "espeak_ng_arabic":
            self.g2p = Phonemizer(language="ar", backend="espeak", with_stress=True)
        else:
            raise NotImplementedError(f"Not supported: g2p_type={g2p_type}")

        self.g2p_type = g2p_type
        self.space_symbol = space_symbol
        if non_linguistic_symbols is None:
            self.non_linguistic_symbols = set()
        elif isinstance(non_linguistic_symbols, (Path, str)):
            # A str is interpreted as a file path, not as a single symbol.
            non_linguistic_symbols = Path(non_linguistic_symbols)
            with non_linguistic_symbols.open("r", encoding="utf-8") as f:
                self.non_linguistic_symbols = set(line.rstrip() for line in f)
        else:
            self.non_linguistic_symbols = set(non_linguistic_symbols)
        self.remove_non_linguistic_symbols = remove_non_linguistic_symbols

    def __repr__(self):
        return (
            f"{self.__class__.__name__}("
            f'g2p_type="{self.g2p_type}", '
            f'space_symbol="{self.space_symbol}", '
            f'non_linguistic_symbols="{self.non_linguistic_symbols}"'
            f")"
        )

    def text2tokens(self, line: str) -> List[str]:
        """Convert ``line`` into phoneme tokens.

        The line is scanned left to right. Wherever a non-linguistic symbol
        starts, it is either kept verbatim or dropped (when
        ``remove_non_linguistic_symbols`` is set); all other characters are
        kept. The resulting text is then passed to the configured g2p.

        Args:
            line: Input text.

        Returns:
            The tokens produced by the g2p callable.
        """
        # Empty symbols are skipped: "" is a prefix of every string and would
        # never consume input, so the scan below could not terminate (this
        # happens e.g. with a blank line in the symbol file).
        # Longer symbols are tried first so that overlapping symbols resolve
        # deterministically instead of depending on set iteration order.
        symbols = sorted(
            (sym for sym in self.non_linguistic_symbols if sym),
            key=len,
            reverse=True,
        )

        kept = []
        pos = 0
        end = len(line)
        while pos < end:
            for sym in symbols:
                if line.startswith(sym, pos):
                    if not self.remove_non_linguistic_symbols:
                        kept.append(sym)
                    pos += len(sym)
                    break
            else:
                # No symbol starts here: keep the single character.
                kept.append(line[pos])
                pos += 1

        return self.g2p("".join(kept))

    def tokens2text(self, tokens: Iterable[str]) -> str:
        """Concatenate ``tokens`` into one string.

        The phoneme conversion is not invertible, so this does not recover
        the original text.
        """
        # phoneme type is not invertible
        return "".join(tokens)