# (Web-viewer residue, not part of the module; kept as comments so the file parses.)
# ing0's picture
# chinese_lexicon
# 566e204
# raw
# history blame
# 18.8 kB
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import re
import jieba
import cn2an
from pypinyin import lazy_pinyin, BOPOMOFO
from typing import List
from diffrhythm.g2p.g2p.chinese_model_g2p import BertPolyPredict
from diffrhythm.g2p.utils.front_utils import *
import os
from huggingface_hub import hf_hub_download
# from g2pw import G2PWConverter
# Blank-insertion level, {0:"none",1:"char", 2:"word"}
BLANK_LEVEL = 0

# Earlier converter, kept for reference only:
# conv = G2PWConverter(style='pinyin', enable_non_tradional_chinese=True)


def _require_path(path, message):
    """Print ``message`` (formatted with ``path``) and exit if ``path`` is missing."""
    if not os.path.exists(path):
        print(message.format(path))
        exit()


resource_path = r"./diffrhythm/g2p"

# Polyphonic-character class dictionary; must exist before the lexicon is built.
poly_all_class_path = os.path.join(
    resource_path, "sources", "g2p_chinese_model", "polychar.txt"
)
_require_path(
    poly_all_class_path,
    "Incorrect path for polyphonic character class dictionary: {}, please check...",
)
poly_dict = generate_poly_lexicon(poly_all_class_path)

# Set up G2PW model parameters
g2pw_poly_model_path = os.path.join(resource_path, "sources", "g2p_chinese_model")
_require_path(
    g2pw_poly_model_path,
    "Incorrect path for g2pw polyphonic character model: {}, please check...",
)

json_file_path = os.path.join(
    resource_path, "sources", "g2p_chinese_model", "polydict.json"
)
_require_path(
    json_file_path,
    "Incorrect path for g2pw id to pinyin dictionary: {}, please check...",
)

jsonr_file_path = os.path.join(
    resource_path, "sources", "g2p_chinese_model", "polydict_r.json"
)
_require_path(
    jsonr_file_path,
    "Incorrect path for g2pw pinyin to id dictionary: {}, please check...",
)

# Polyphonic-character predictor used by chinese_to_bopomofo.
g2pw_poly_predict = BertPolyPredict(
    g2pw_poly_model_path, jsonr_file_path, json_file_path
)
"""
Text cleaning
"""
# (compiled Latin-letter pattern, bopomofo replacement) pairs, in a..z order.
# Each letter is compiled case-insensitively.
_latin_to_bopomofo = [
    (re.compile(letter, re.IGNORECASE), bopomofo)
    for letter, bopomofo in {
        "a": "ㄟˉ",
        "b": "ㄅㄧˋ",
        "c": "ㄙㄧˉ",
        "d": "ㄉㄧˋ",
        "e": "ㄧˋ",
        "f": "ㄝˊㄈㄨˋ",
        "g": "ㄐㄧˋ",
        "h": "ㄝˇㄑㄩˋ",
        "i": "ㄞˋ",
        "j": "ㄐㄟˋ",
        "k": "ㄎㄟˋ",
        "l": "ㄝˊㄛˋ",
        "m": "ㄝˊㄇㄨˋ",
        "n": "ㄣˉ",
        "o": "ㄡˉ",
        "p": "ㄆㄧˉ",
        "q": "ㄎㄧㄡˉ",
        "r": "ㄚˋ",
        "s": "ㄝˊㄙˋ",
        "t": "ㄊㄧˋ",
        "u": "ㄧㄡˉ",
        "v": "ㄨㄧˉ",
        "w": "ㄉㄚˋㄅㄨˋㄌㄧㄡˋ",
        "x": "ㄝˉㄎㄨˋㄙˋ",
        "y": "ㄨㄞˋ",
        "z": "ㄗㄟˋ",
    }.items()
]
# (compiled bopomofo pattern, IPA replacement) pairs.
# Order matters: two-symbol finals come before the single symbols they contain,
# and the tone marks come last.
_bopomofo_to_ipa = [
    (re.compile(bopomofo), ipa)
    for bopomofo, ipa in {
        "ㄅㄛ": "p⁼wo",
        "ㄆㄛ": "pʰwo",
        "ㄇㄛ": "mwo",
        "ㄈㄛ": "fwo",
        "ㄧㄢ": "|jɛn",
        "ㄩㄢ": "|ɥæn",
        "ㄧㄣ": "|in",
        "ㄩㄣ": "|ɥn",
        "ㄧㄥ": "|iŋ",
        "ㄨㄥ": "|ʊŋ",
        "ㄩㄥ": "|jʊŋ",
        # Add
        "ㄧㄚ": "|ia",
        "ㄧㄝ": "|iɛ",
        "ㄧㄠ": "|iɑʊ",
        "ㄧㄡ": "|ioʊ",
        "ㄧㄤ": "|iɑŋ",
        "ㄨㄚ": "|ua",
        "ㄨㄛ": "|uo",
        "ㄨㄞ": "|uaɪ",
        "ㄨㄟ": "|ueɪ",
        "ㄨㄢ": "|uan",
        "ㄨㄣ": "|uən",
        "ㄨㄤ": "|uɑŋ",
        "ㄩㄝ": "|ɥɛ",
        # End
        "ㄅ": "p⁼",
        "ㄆ": "pʰ",
        "ㄇ": "m",
        "ㄈ": "f",
        "ㄉ": "t⁼",
        "ㄊ": "tʰ",
        "ㄋ": "n",
        "ㄌ": "l",
        "ㄍ": "k⁼",
        "ㄎ": "kʰ",
        "ㄏ": "x",
        "ㄐ": "tʃ⁼",
        "ㄑ": "tʃʰ",
        "ㄒ": "ʃ",
        "ㄓ": "ts`⁼",
        "ㄔ": "ts`ʰ",
        "ㄕ": "s`",
        "ㄖ": "ɹ`",
        "ㄗ": "ts⁼",
        "ㄘ": "tsʰ",
        "ㄙ": "|s",
        "ㄚ": "|a",
        "ㄛ": "|o",
        "ㄜ": "|ə",
        "ㄝ": "|ɛ",
        "ㄞ": "|aɪ",
        "ㄟ": "|eɪ",
        "ㄠ": "|ɑʊ",
        "ㄡ": "|oʊ",
        "ㄢ": "|an",
        "ㄣ": "|ən",
        "ㄤ": "|ɑŋ",
        "ㄥ": "|əŋ",
        "ㄦ": "əɹ",
        "ㄧ": "|i",
        "ㄨ": "|u",
        "ㄩ": "|ɥ",
        "ˉ": "→|",
        "ˊ": "↑|",
        "ˇ": "↓↑|",
        "ˋ": "↓|",
        "˙": "|",
    }.items()
]
# Words ending in "儿" that er_sandhi leaves untouched.
must_not_er_words = {"女儿", "老儿", "男儿", "少儿", "小儿"}


def _load_tsv_pairs(path):
    """Yield ``(first, second)`` column pairs from a UTF-8, tab-separated file.

    Lines that are empty after stripping are skipped, so a trailing newline
    does not abort the import. Any other line that does not hold exactly two
    tab-separated fields raises ``ValueError``.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(path, "r", encoding="utf-8") as fread:
        for line in fread:
            line = line.strip()
            if not line:
                continue
            first, second = line.split("\t")
            yield first, second


# Word/character -> space-separated, tone-numbered pinyin.
chinese_lexicon_path = hf_hub_download(
    repo_id="ASLP-lab/DiffRhythm",
    filename="diffrhythm/g2p/sources/chinese_lexicon.txt",
    repo_type="space"
)
word_pinyin_dict = dict(_load_tsv_pairs(chinese_lexicon_path))

# Toneless pinyin syllable -> bopomofo.
pinyin_2_bopomofo_dict = dict(
    _load_tsv_pairs(r"./diffrhythm/g2p/sources/pinyin_2_bpmf.txt")
)

# Pinyin tone digit -> bopomofo tone mark (first tone is left unmarked).
tone_dict = {
    "0": "˙",
    "5": "˙",
    "1": "",
    "2": "ˊ",
    "3": "ˇ",
    "4": "ˋ",
}

# Bopomofo symbol -> pinyin fragment; the file stores the pair in the
# opposite order (value first, key second), hence the swap.
bopomofos2pinyin_dict = {
    k: v for v, k in _load_tsv_pairs(r"./diffrhythm/g2p/sources/bpmf_2_pinyin.txt")
}
def _normalize_pinyin_syllable(pinyin):
    """Turn a raw symbol-by-symbol transliteration into a spelled pinyin syllable.

    ``pinyin`` is the non-empty concatenation of per-symbol fragments,
    optionally ending in a tone digit. A missing tone digit defaults to "1".
    The spelling rules are applied in a fixed order because later rules see
    the output of earlier ones.
    """
    if pinyin[-1] not in "01234":
        pinyin += "1"
    body, tone = pinyin[:-1], pinyin[-1]

    # Bare initials / finals that need a vowel or a glide to stand alone.
    body = {
        "ve": "yve",
        "sh": "shi",
        "s": "si",
        "c": "ci",
        "i": "yi",
        "iou": "you",
        "ien": "yin",
    }.get(body, body)

    # Contracted finals after an initial: -iou -> -iu, -uei -> -ui, -uen -> -un.
    if body.endswith("iou"):
        body = body[:-3] + "iu"

    if body == "uei":
        body = "wei"
    elif body.endswith("uei"):
        body = body[:-3] + "ui"

    if body == "uen":
        body = "wen"
    elif body.endswith("uen"):
        # The original tested for "uei" here, so "-uen" was never contracted.
        body = body[:-3] + "un"

    if body == "van":
        body = "yuan"
    elif body.endswith("van"):
        body = body[:-3] + "uan"

    if body.endswith("ueng"):
        body = body[:-4] + "ong"
    if body == "veng":
        body = "yong"
    if body.endswith("veng"):
        body = body[:-4] + "iong"

    # Remaining stand-alone forms.
    body = {
        "ieng": "ying",
        "u": "wu",
        "v": "yv",
        "ing": "ying",
        "z": "zi",
        "zh": "zhi",
    }.get(body, body)

    # Zero-initial syllables are written with w- / y-.
    if body.startswith("u"):
        body = "w" + body[1:]
    if body.startswith("i"):
        body = "y" + body[1:]

    return body.replace("ien", "in") + tone


def bpmf_to_pinyin(text, symbol_map=None):
    """Convert "|"-separated bopomofo syllables to space-separated pinyin.

    Args:
        text: Bopomofo string whose syllables are separated by "|".
        symbol_map: Optional mapping from a single bopomofo symbol (or tone
            mark) to its pinyin fragment. Defaults to the module-level
            ``bopomofos2pinyin_dict``.

    Returns:
        The pinyin syllables joined by single spaces. Characters missing from
        the mapping are ignored, and segments that map to nothing are dropped.
    """
    if symbol_map is None:
        symbol_map = bopomofos2pinyin_dict
    pinyin_list = []
    for info in text.split("|"):
        pinyin = "".join(symbol_map[c] for c in info if c in symbol_map)
        if not pinyin:
            continue
        pinyin_list.append(_normalize_pinyin_syllable(pinyin))
    return " ".join(pinyin_list)
# Convert numbers to Chinese pronunciation
def number_to_chinese(text):
    """Return ``text`` after ``cn2an.transform`` in ``"an2cn"`` mode.

    The whole string is handed to cn2an in one call, replacing the earlier
    per-number regex approach.
    """
    return cn2an.transform(text, "an2cn")
# Single-character punctuation folded to the form the filter below keeps.
# The full-width forms are written as escapes so they cannot be silently
# converted to ASCII again (which had turned these replacements into no-ops).
_PUNCT_TRANSLATION = str.maketrans(
    {
        "\uff0c": ",",  # full-width comma
        "。": ".",
        "\uff01": "!",  # full-width exclamation mark
        "\uff1f": "?",  # full-width question mark
        "\uff1b": ";",  # full-width semicolon
        "\uff1a": ":",  # full-width colon
        "、": ",",
        "‘": "'",
        "’": "'",
        "⋯": "…",
    }
)


def normalization(text):
    """Normalize punctuation and strip everything but Chinese text and punctuation.

    Steps, in order:
      1. Fold full-width / CJK punctuation to ASCII and the various ellipsis
         spellings to "…".
      2. Remove all whitespace.
      3. Drop every character that is not a CJK ideograph (U+4E00-U+9FFF),
         "_", or one of ``, . ? ! ; : ' …``.

    Args:
        text: Input string.

    Returns:
        The normalized string (possibly empty).
    """
    text = text.translate(_PUNCT_TRANSLATION)
    # Multi-character ellipsis spellings. These run after the single-character
    # pass so that "。。。" (now "...") is folded as well.
    for dots in ("···", "・・・", "..."):
        text = text.replace(dots, "…")
    text = re.sub(r"\s+", "", text)
    text = re.sub(r"[^\u4e00-\u9fff\s_,\.\?!;:\'…]", "", text)
    text = re.sub(r"\s*([,\.\?!;:\'…])\s*", r"\1", text)
    return text
def change_tone(bopomofo: str, tone: str) -> str:
    """Return ``bopomofo`` carrying ``tone`` as its tone mark.

    A trailing ``˙ˊˇˋ`` mark is replaced; otherwise ``tone`` is appended.
    ``bopomofo`` must be non-empty, since its last character is inspected.
    """
    stem = bopomofo[:-1] if bopomofo[-1] in "˙ˊˇˋ" else bopomofo
    return stem + tone
def er_sandhi(word: str, bopomofos: List[str]) -> List[str]:
    """Give the final syllable of an "…儿" word the neutral tone mark (˙).

    Applies only to words longer than one character that end in "儿" and are
    not listed in ``must_not_er_words``. ``bopomofos`` is modified in place
    and also returned.
    """
    is_erhua = len(word) > 1 and word[-1] == "儿" and word not in must_not_er_words
    if not is_erhua:
        return bopomofos
    bopomofos[-1] = change_tone(bopomofos[-1], "˙")
    return bopomofos
def bu_sandhi(word: str, bopomofos: List[str]) -> List[str]:
    """Apply tone sandhi for "不" within one word.

    * Words made only of "不", and the word "不字", are left unchanged.
    * In a three-character "X不Y" word whose middle syllable is "ㄅㄨ" plus a
      tone mark, that mark becomes the neutral tone (˙).
    * Otherwise every "不" followed by a fourth-tone (ˋ) syllable has its last
      character replaced with the second-tone mark (ˊ).

    ``bopomofos`` is modified in place and also returned.
    """
    if set(word) == {"不"} or word == "不字":
        return bopomofos

    if len(word) == 3 and word[1] == "不" and bopomofos[1][:-1] == "ㄅㄨ":
        bopomofos[1] = bopomofos[1][:-1] + "˙"
        return bopomofos

    for idx, char in enumerate(word):
        nxt = idx + 1
        if char != "不" or nxt >= len(word) or nxt >= len(bopomofos):
            continue
        following = bopomofos[nxt]
        if len(following) > 0 and following[-1] == "ˋ":
            bopomofos[idx] = bopomofos[idx][:-1] + "ˊ"
    return bopomofos
def yi_sandhi(word: str, bopomofos: List[str]) -> List[str]:
    """Apply tone sandhi for "一" within one word.

    Rules, first match wins:
      1. The word contains "一" together with another numeric character:
         a leading "一" not followed by a digit character takes ˊ when the
         next syllable ends in ˋ or ˙, otherwise ˋ. Every other "一" loses
         its tone mark (unmarked is first tone, as in ``tone_dict["1"]``).
      2. "A一A" (three characters, same first and last): "一" takes ˙.
      3. Words starting with "第一": "一" loses its tone mark.
      4. Words starting with "一月" / "一日" / "一号": "一" loses its tone mark.
      5. Otherwise each "一" with a following character takes ˊ before a
         fourth-tone syllable, else ˋ unless the next character is
         punctuation.

    ``bopomofos`` is modified in place and also returned.
    """
    punc = ":,;。?!“”‘’':,;.?!()(){}【】[]-~`、 "
    digit_chars = ("零", "一", "二", "三", "四", "五", "六", "七", "八", "九", "十")

    if "一" in word and any(ch.isnumeric() for ch in word if ch != "一"):
        for i, ch in enumerate(word):
            if i == 0 and ch == "一" and len(word) > 1 and word[1] not in digit_chars:
                # Guard the syllable that is actually indexed. The original
                # checked len(bopomofos[0]) and then read bopomofos[1][-1].
                next_is_falling_or_neutral = (
                    len(bopomofos) > 1
                    and len(bopomofos[1]) > 0
                    and bopomofos[1][-1] in ("ˋ", "˙")
                )
                new_tone = "ˊ" if next_is_falling_or_neutral else "ˋ"
                bopomofos[0] = change_tone(bopomofos[0], new_tone)
            elif ch == "一":
                bopomofos[i] = change_tone(bopomofos[i], "")
        return bopomofos

    if len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
        bopomofos[1] = change_tone(bopomofos[1], "˙")
    elif word.startswith("第一"):
        bopomofos[1] = change_tone(bopomofos[1], "")
    elif word.startswith(("一月", "一日", "一号")):
        bopomofos[0] = change_tone(bopomofos[0], "")
    else:
        for i, char in enumerate(word):
            if char != "一" or i + 1 >= len(word):
                continue
            next_is_falling = (
                len(bopomofos) > i + 1
                and len(bopomofos[i + 1]) > 0
                and bopomofos[i + 1][-1] == "ˋ"
            )
            if next_is_falling:
                bopomofos[i] = change_tone(bopomofos[i], "ˊ")
            elif word[i + 1] not in punc:
                bopomofos[i] = change_tone(bopomofos[i], "ˋ")
    return bopomofos
def merge_bu(seg: List) -> List:
    """Attach each standalone "不" to the word that follows it.

    Every input character is kept. A "不" at the very end of ``seg`` has
    nothing to attach to and is emitted on its own; previously it was dropped,
    which lost a syllable and shifted the caller's character offsets.
    Consecutive "不" tokens are joined (``["不", "不", "好"]`` becomes
    ``["不不", "好"]``).

    Args:
        seg: Segmented words.

    Returns:
        A new list of words; ``seg`` itself is not modified.
    """
    new_seg = []
    last_word = ""
    for word in seg:
        if last_word == "不":
            word = last_word + word
        if word != "不":
            new_seg.append(word)
        last_word = word
    if last_word == "不":
        # Trailing "不" with no following word.
        new_seg.append(last_word)
    return new_seg
def merge_er(seg: List) -> List:
    """Append each standalone "儿" to the word before it.

    A "儿" in first position has no predecessor and stays a separate word.
    Returns a new list; ``seg`` is not modified.
    """
    merged = []
    for word in seg:
        # ``merged`` is non-empty exactly when this is not the first word.
        if merged and word == "儿":
            merged[-1] += word
            continue
        merged.append(word)
    return merged
def merge_yi(seg: List) -> List:
    """Merge "一" with its neighbours so sandhi rules see it in context.

    Two passes:
      1. Reduplication: "A", "一", "A" is folded into the single word "A一A".
      2. Any remaining standalone "一" is prefixed to the word after it.

    Every input character appears exactly once in the output.

    Args:
        seg: Segmented words.

    Returns:
        A new list of words; ``seg`` itself is not modified.
    """
    # Pass 1. The output is addressed through folded[-1]; the original indexed
    # it with the *input* position, which only worked for the first "A一A".
    folded = []
    i = 0
    total = len(seg)
    while i < total:
        word = seg[i]
        if (
            folded
            and word == "一"
            and 0 < i < total - 1
            and seg[i - 1] == seg[i + 1]
        ):
            folded[-1] = folded[-1] + "一" + seg[i + 1]
            i += 2  # the repeated word has been consumed as well
        else:
            folded.append(word)
            i += 1

    # Pass 2.
    merged = []
    for word in folded:
        if merged and merged[-1] == "一":
            merged[-1] = merged[-1] + word
        else:
            merged.append(word)
    return merged
# Word Segmentation, and convert Chinese pronunciation to pinyin (bopomofo)
def _syllable_to_bopomofo(py):
    """Map one tone-numbered pinyin syllable to bopomofo plus its tone mark.

    ``py[:-1]`` is looked up in ``pinyin_2_bopomofo_dict`` and the trailing
    tone digit ``py[-1]`` in ``tone_dict``; a missing key raises ``KeyError``.
    """
    return pinyin_2_bopomofo_dict[py[:-1]] + tone_dict[py[-1]]


def chinese_to_bopomofo(text_short, sentence):
    """Segment ``text_short`` and return its "|"-separated bopomofo string.

    Per word (after jieba segmentation and the 一/不/儿 merges):
      * A word found in ``word_pinyin_dict`` and not in ``poly_dict`` is
        converted syllable by syllable; a syllable missing from the tables
        falls back to ``lazy_pinyin`` for the whole word.
      * Otherwise each character is resolved on its own: through the
        polyphone predictor if it is in ``poly_dict``, else through
        ``word_pinyin_dict``, else kept verbatim.
      * Third-tone, 不, 一 and 儿 sandhi are applied.
      * Words without any CJK ideograph are emitted verbatim.
      * Syllables left without a tone mark get the first-tone mark "ˉ".

    Args:
        text_short: Normalized text to convert.
        sentence: Unused here; kept for interface compatibility.

    Returns:
        Bopomofo syllables (and verbatim non-Chinese tokens) joined by "|".
    """
    # bopomofos = conv(text_short)
    words = merge_er(merge_bu(merge_yi(jieba.lcut(text_short, cut_all=False))))
    text = ""
    # Offset of the current word inside text_short, used to tell the polyphone
    # predictor which character to disambiguate.
    char_index = 0
    for word in words:
        bopomofos = []
        if word in word_pinyin_dict and word not in poly_dict:
            for py in word_pinyin_dict[word].split(" "):
                if py[:-1] in pinyin_2_bopomofo_dict and py[-1] in tone_dict:
                    bopomofos.append(_syllable_to_bopomofo(py))
                else:
                    # NOTE(review): this adds the bopomofo of the *whole* word
                    # for each unknown syllable -- confirm that is intended.
                    bopomofos += lazy_pinyin(word, BOPOMOFO)
                if BLANK_LEVEL == 1:
                    bopomofos.append("_")
        else:
            for i, c in enumerate(word):
                if c in poly_dict:
                    poly_pinyin = g2pw_poly_predict.predict_process(
                        [text_short, char_index + i]
                    )[0]
                    # The first two and the last character of the prediction
                    # are dropped to obtain the tone-numbered syllable.
                    bopomofos.append(_syllable_to_bopomofo(poly_pinyin[2:-1]))
                elif c in word_pinyin_dict:
                    bopomofos.append(_syllable_to_bopomofo(word_pinyin_dict[c]))
                else:
                    bopomofos.append(c)
                if BLANK_LEVEL == 1:
                    bopomofos.append("_")
        if BLANK_LEVEL == 2:
            bopomofos.append("_")
        char_index += len(word)

        # Third-tone sandhi. For three third tones in a row the first two
        # become second tone. The mark is *replaced*, as in the two-syllable
        # rule below; the original appended "ˊ" after "ˇ", leaving two marks.
        if (
            len(word) == 3
            and bopomofos[0][-1] == "ˇ"
            and bopomofos[1][-1] == "ˇ"
            and bopomofos[-1][-1] == "ˇ"
        ):
            bopomofos[0] = bopomofos[0][:-1] + "ˊ"
            bopomofos[1] = bopomofos[1][:-1] + "ˊ"
        if len(word) == 2 and bopomofos[0][-1] == "ˇ" and bopomofos[-1][-1] == "ˇ":
            bopomofos[0] = bopomofos[0][:-1] + "ˊ"

        bopomofos = bu_sandhi(word, bopomofos)
        bopomofos = yi_sandhi(word, bopomofos)
        bopomofos = er_sandhi(word, bopomofos)

        if not re.search("[\u4e00-\u9fff]", word):
            # Punctuation and other non-Chinese tokens pass through unchanged.
            text += "|" + word
            continue

        # A syllable that still ends in a bopomofo letter has no tone mark;
        # give it the explicit first-tone mark.
        bopomofos = [
            re.sub(r"([\u3105-\u3129])$", r"\1ˉ", syllable) for syllable in bopomofos
        ]
        if text != "":
            text += "|"
        text += "|".join(bopomofos)
    return text
# Convert latin pronunciation to pinyin (bopomofo)
def latin_to_bopomofo(text):
    """Replace every Latin letter in ``text`` using the ``_latin_to_bopomofo`` table.

    The table entries are applied one after another, in table order.
    """
    for pattern, bopomofo in _latin_to_bopomofo:
        text = pattern.sub(bopomofo, text)
    return text
# Convert pinyin (bopomofo) to IPA
def bopomofo_to_ipa(text):
    """Rewrite bopomofo symbols and tone marks in ``text`` as IPA.

    The ``_bopomofo_to_ipa`` entries are applied one after another, in table
    order.
    """
    for pattern, ipa in _bopomofo_to_ipa:
        text = pattern.sub(ipa, text)
    return text
def _chinese_to_ipa(text, sentence):
    """Run the full pipeline on one string: numbers, normalization, bopomofo, IPA.

    ``sentence`` is forwarded unchanged to ``chinese_to_bopomofo``. The result
    is a "|"-separated phoneme string with no leading or trailing "|".
    """
    cleaned = normalization(number_to_chinese(text.strip()))
    bopomofo = chinese_to_bopomofo(cleaned, sentence)
    # pinyin = bpmf_to_pinyin(bopomofo)
    ipa = bopomofo_to_ipa(latin_to_bopomofo(bopomofo))

    post_rules = (
        # Insert "ɹ" after s`/ɹ` (with optional ⁼/ʰ) when a tone arrow, a
        # space or the end of the string follows.
        ("([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)", r"\1ɹ\2"),
        # Same for plain s (with optional ⁼/ʰ).
        ("([s][⁼ʰ]?)([→↓↑ ]+|$)", r"\1ɹ\2"),
        # Drop a leading "|" and every character outside the allowed set.
        (r"^\||[^\w\s_,\.\?!;:\'…\|→↓↑⁼ʰ`]", ""),
        # Put punctuation between separators.
        (r"([,\.\?!;:\'…])", r"|\1|"),
        # Collapse runs of separators.
        (r"\|+", "|"),
    )
    for pattern, replacement in post_rules:
        ipa = re.sub(pattern, replacement, ipa)
    return ipa.rstrip("|")
# Convert Chinese to IPA
def chinese_to_ipa(text, sentence, text_tokenizer):
    """Convert Chinese text, or an iterable of texts, to IPA phoneme strings.

    Args:
        text: A string, or an iterable of strings.
        sentence: Forwarded unchanged to ``_chinese_to_ipa``.
        text_tokenizer: Unused here; kept for interface compatibility.

    Returns:
        One phoneme string for a ``str`` input (``str`` subclasses included),
        otherwise a list with one phoneme string per item.
    """
    # phonemes = text_tokenizer(text.strip())
    if isinstance(text, str):
        return _chinese_to_ipa(text, sentence)
    return [_chinese_to_ipa(t, sentence) for t in text]