lmzjms
/

wavlm-large

Model card Files Files and versions Community

wavlm-large / s3prl_s3prl_main /s3prl /dataio /encoder /g2p.py

lmzjms's picture

Upload 1162 files

0b32ad6 verified 9 months ago

history blame contribute delete

2.37 kB

	"""
	Basic G2P

	Authors:
	* Heng-Jui Chang 2022
	"""

	import logging
	from collections import defaultdict
	from typing import Dict, List, Tuple

	from s3prl.util.download import _urls_to_filepaths

	# URLs of the lexicon files that G2P loads when it is constructed without
	# an explicit `file_list`.
	DEFAULT_LEXICON_URL = [
	"https://huggingface.co/datasets/s3prl/g2p/raw/main/lexicon/librispeech-lexicon-200k-g2p.txt",
	"https://huggingface.co/datasets/s3prl/g2p/raw/main/lexicon/librispeech-lexicon-allothers-g2p.txt",
	]

	# Public API of this module: only the G2P class is exported by `import *`.
	__all__ = ["G2P"]


	def parse_lexicon(line: str) -> Tuple[str, List[str]]:
	    """Split one lexicon entry into its word and its phoneme sequence.

	    Args:
	        line (str): A lexicon line: the word followed by its phonemes,
	            separated by any whitespace (spaces and/or tabs).

	    Returns:
	        Tuple[str, List[str]]: The word and the list of its phonemes. The
	        list is empty when the line holds only a word.

	    Raises:
	        ValueError: If the line is empty or contains only whitespace.
	    """
	    # str.split() without arguments already treats tabs like spaces, so no
	    # separate tab normalisation is required. (The previous
	    # `line.replace("\t", " ")` discarded its result and had no effect.)
	    fields = line.split()
	    if not fields:
	        raise ValueError("Cannot parse an empty lexicon line.")
	    word, *phonemes = fields
	    return word, phonemes


	def read_lexicon_files(file_list: List[str]) -> Dict[str, List[str]]:
	    """Build a word-to-phonemes mapping from one or more lexicon files.

	    Every non-blank line of every file is parsed with `parse_lexicon`. When a
	    word appears more than once (within a file or across files), all of its
	    phoneme sequences are logged and the first one encountered is kept.

	    Args:
	        file_list (List[str]): Paths of the lexicon files, read in order.

	    Returns:
	        Dict[str, List[str]]: Mapping from each word to its phoneme sequence.
	    """
	    w2p_dict = defaultdict(list)
	    for file in file_list:
	        # Explicit encoding so decoding does not depend on the platform locale.
	        with open(file, "r", encoding="utf-8") as fp:
	            for raw_line in fp:
	                line = raw_line.strip()
	                if not line:
	                    # Blank lines (e.g. a trailing empty line) carry no
	                    # entry and would otherwise make parsing fail.
	                    continue
	                word, phonemes = parse_lexicon(line)
	                w2p_dict[word].append(phonemes)

	    w2p = {}
	    for word, phonemes_all in w2p_dict.items():
	        if len(phonemes_all) > 1:
	            logging.info(
	                "%d phoneme sequences found for %s.", len(phonemes_all), word
	            )
	            for i, phonemes in enumerate(phonemes_all):
	                logging.info("%d. %s", i, phonemes)
	        # Insertion order is file order, so index 0 is the first occurrence.
	        w2p[word] = phonemes_all[0]
	    logging.info("Taking the first phoneme sequences for a deterministic behavior.")

	    return w2p


	class G2P:
	    """Grapheme-to-phoneme

	    A lexicon-lookup G2P: each word of the input is upper-cased and replaced
	    by the phoneme sequence stored for it in the loaded lexicon.

	    Args:
	        file_list (List[str], optional): List of lexicon files. Defaults to None,
	            in which case the files listed in ``DEFAULT_LEXICON_URL`` are used.
	        allow_unk (bool): If false, raise Error when a word can not be recognized by this basic G2P.
	            If true, such a word is encoded as ``<UNK>``.
	    """

	    def __init__(self, file_list: List[str] = None, allow_unk: bool = False):
	        self.allow_unk = allow_unk

	        if file_list is None:
	            # Resolve the default lexicon URLs to file paths via the
	            # project's helper.
	            file_list = _urls_to_filepaths(*DEFAULT_LEXICON_URL)
	        self.word2phone = read_lexicon_files(file_list)

	    def encode(self, text: str) -> str:
	        """Converts grapheme-based sentences to phonemes

	        Args:
	            text (str): Sentence; words are separated by whitespace and are
	                matched against the lexicon case-insensitively (upper-cased).

	        Returns:
	            str: Phonemized sentence, phonemes joined by single spaces.
	            Empty when the sentence contains no words.

	        Raises:
	            AssertionError: If a word is not in the lexicon and
	                ``allow_unk`` is false.
	        """
	        phonemes = []
	        # split() without arguments ignores leading/trailing whitespace and
	        # collapses repeated separators, so no empty "words" are produced.
	        for word in text.upper().split():
	            word_phonemes = self.word2phone.get(word)
	            if word_phonemes is None:
	                if not self.allow_unk:
	                    # Raised explicitly rather than through an `assert`
	                    # statement so the check is not stripped under
	                    # `python -O`; the exception type is kept for callers.
	                    raise AssertionError(
	                        f"Word {word!r} is not in the lexicon and allow_unk is False."
	                    )
	                word_phonemes = ["<UNK>"]
	            phonemes.extend(word_phonemes)

	        return " ".join(phonemes)