lmzjms's picture
Upload 1162 files
0b32ad6 verified
"""
Basic G2P
Authors:
* Heng-Jui Chang 2022
"""
import logging
from collections import defaultdict
from typing import Dict, List, Tuple
from s3prl.util.download import _urls_to_filepaths
# Lexicon files used when ``G2P`` is constructed without an explicit
# ``file_list``; they are resolved to local paths via ``_urls_to_filepaths``.
# NOTE(review): judging by the file names these look like LibriSpeech
# lexicons -- confirm against the hosting dataset.
DEFAULT_LEXICON_URL = [
"https://huggingface.co/datasets/s3prl/g2p/raw/main/lexicon/librispeech-lexicon-200k-g2p.txt",
"https://huggingface.co/datasets/s3prl/g2p/raw/main/lexicon/librispeech-lexicon-allothers-g2p.txt",
]
# Public API of this module: only ``G2P`` is exported by a star-import.
__all__ = ["G2P"]
def parse_lexicon(line: str) -> Tuple[str, List[str]]:
    """Split one lexicon line into its word and phoneme sequence.

    Args:
        line (str): A lexicon entry: the word followed by its phonemes,
            separated by any whitespace (spaces and/or tabs).

    Returns:
        Tuple[str, List[str]]: The word and its (possibly empty) list of
        phonemes.

    Raises:
        ValueError: If the line is empty or contains only whitespace.
    """
    # ``str.split()`` without a separator splits on any run of whitespace,
    # tabs included, so no explicit tab-to-space normalization is required.
    fields = line.split()
    if not fields:
        raise ValueError("Cannot parse an empty lexicon line.")
    word, *phonemes = fields
    return word, phonemes
def read_lexicon_files(file_list: List[str]) -> Dict[str, List[str]]:
    """Build a word-to-phonemes mapping from one or more lexicon files.

    Each non-blank line is parsed with ``parse_lexicon``. When a word appears
    with several phoneme sequences (within or across files), every candidate
    is logged and the first one encountered is kept.

    Args:
        file_list (List[str]): Paths of the lexicon files, read in order.

    Returns:
        Dict[str, List[str]]: Mapping from each word to its first phoneme
        sequence.
    """
    candidates = defaultdict(list)
    for file in file_list:
        # Explicit encoding so parsing does not depend on the platform locale.
        with open(file, "r", encoding="utf-8") as fp:
            for raw_line in fp:
                line = raw_line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) carry no entry.
                    continue
                word, phonemes = parse_lexicon(line)
                candidates[word].append(phonemes)

    w2p = {}
    for word, phonemes_all in candidates.items():
        if len(phonemes_all) > 1:
            logging.info(
                "%d phoneme sequences found for %s.", len(phonemes_all), word
            )
            for i, phonemes in enumerate(phonemes_all):
                logging.info("%d. %s", i, phonemes)
        # First-seen pronunciation wins, which keeps the result deterministic.
        w2p[word] = phonemes_all[0]
    logging.info("Taking the first phoneme sequences for a deterministic behavior.")
    return w2p
class G2P:
    """Grapheme-to-phoneme conversion by lexicon lookup.

    Args:
        file_list (List[str], optional): List of lexicon files. When None,
            the files listed in ``DEFAULT_LEXICON_URL`` are resolved through
            ``_urls_to_filepaths`` and used instead. Defaults to None.
        allow_unk (bool): If False, raise an error when a word can not be
            recognized by this basic G2P; if True, such words are encoded
            as ``<UNK>``. Defaults to False.
    """

    def __init__(self, file_list: List[str] = None, allow_unk: bool = False):
        self.allow_unk = allow_unk
        if file_list is None:
            file_list = _urls_to_filepaths(*DEFAULT_LEXICON_URL)
        # Mapping from word to its list of phonemes.
        self.word2phone = read_lexicon_files(file_list)

    def encode(self, text: str) -> str:
        """Convert a grapheme-based sentence to phonemes.

        The sentence is upper-cased and split on whitespace; each word is
        replaced by its phoneme sequence from the lexicon.

        Args:
            text (str): Sentence

        Returns:
            str: Phonemized sentence, phonemes separated by single spaces.

        Raises:
            AssertionError: If a word is missing from the lexicon and
                ``allow_unk`` is False.
        """
        phonemes = []
        # ``split()`` (no separator) ignores repeated/leading/trailing
        # whitespace, so no empty pseudo-words are looked up.
        for word in text.upper().split():
            phones = self.word2phone.get(word)
            if phones is None:
                if not self.allow_unk:
                    # Raised explicitly (rather than via ``assert``) so the
                    # check survives ``python -O`` while keeping the same
                    # exception type for existing callers.
                    raise AssertionError(
                        f"Word {word!r} is not in the lexicon; "
                        "set allow_unk=True to encode unknown words as <UNK>."
                    )
                phones = ["<UNK>"]
            phonemes.extend(phones)
        return " ".join(phonemes)