|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import List |
|
from typing import Tuple |
|
|
|
import jieba |
|
from pypinyin import lazy_pinyin |
|
from pypinyin import Style |
|
|
|
|
|
class ToneSandhi:
    """Rule-based Mandarin tone sandhi over per-character finals.

    Every ``finals`` argument is a list with one string per character of the
    word, each ending in a tone digit ("1"-"4", or "5" for the neutral tone),
    i.e. the shape produced by ``lazy_pinyin(..., style=Style.FINALS_TONE3,
    neutral_tone_with_five=True)``.  The sandhi methods rewrite that trailing
    digit according to the "不", "一", neutral-tone and third-tone rules.

    The ``_merge_*`` methods work on a segmentation, a sequence of
    ``(word, pos)`` pairs, and glue neighbouring entries together so that the
    per-word sandhi rules see the units they are meant to apply to.

    NOTE: "neural" in the attribute/method names is a historical misspelling
    of "neutral"; the names are kept because callers depend on them.
    """

    def __init__(self):
        # Words whose last syllable is always read with the neutral tone.
        self.must_neural_tone_words = {
            '麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝',
            '难为', '队伍', '阔气', '闺女', '门道', '锄头', '铺盖', '铃铛', '铁匠', '钥匙', '里脊',
            '里头', '部分', '那么', '道士', '造化', '迷糊', '连累', '这么', '这个', '运气', '过去',
            '软和', '转悠', '踏实', '跳蚤', '跟头', '趔趄', '财主', '豆腐', '讲究', '记性', '记号',
            '认识', '规矩', '见识', '裁缝', '补丁', '衣裳', '衣服', '衙门', '街坊', '行李', '行当',
            '蛤蟆', '蘑菇', '薄荷', '葫芦', '葡萄', '萝卜', '荸荠', '苗条', '苗头', '苍蝇', '芝麻',
            '舒服', '舒坦', '舌头', '自在', '膏药', '脾气', '脑袋', '脊梁', '能耐', '胳膊', '胭脂',
            '胡萝', '胡琴', '胡同', '聪明', '耽误', '耽搁', '耷拉', '耳朵', '老爷', '老实', '老婆',
            '老头', '老太', '翻腾', '罗嗦', '罐头', '编辑', '结实', '红火', '累赘', '糨糊', '糊涂',
            '精神', '粮食', '簸箕', '篱笆', '算计', '算盘', '答应', '笤帚', '笑语', '笑话', '窟窿',
            '窝囊', '窗户', '稳当', '稀罕', '称呼', '秧歌', '秀气', '秀才', '福气', '祖宗', '砚台',
            '码头', '石榴', '石头', '石匠', '知识', '眼睛', '眯缝', '眨巴', '眉毛', '相声', '盘算',
            '白净', '痢疾', '痛快', '疟疾', '疙瘩', '疏忽', '畜生', '生意', '甘蔗', '琵琶', '琢磨',
            '琉璃', '玻璃', '玫瑰', '玄乎', '狐狸', '状元', '特务', '牲口', '牙碜', '牌楼', '爽快',
            '爱人', '热闹', '烧饼', '烟筒', '烂糊', '点心', '炊帚', '灯笼', '火候', '漂亮', '滑溜',
            '溜达', '温和', '清楚', '消息', '浪头', '活泼', '比方', '正经', '欺负', '模糊', '槟榔',
            '棺材', '棒槌', '棉花', '核桃', '栅栏', '柴火', '架势', '枕头', '枇杷', '机灵', '本事',
            '木头', '木匠', '朋友', '月饼', '月亮', '暖和', '明白', '时候', '新鲜', '故事', '收拾',
            '收成', '提防', '挖苦', '挑剔', '指甲', '指头', '拾掇', '拳头', '拨弄', '招牌', '招呼',
            '抬举', '护士', '折腾', '扫帚', '打量', '打算', '打点', '打扮', '打听', '打发', '扎实',
            '扁担', '戒指', '懒得', '意识', '意思', '情形', '悟性', '怪物', '思量', '怎么', '念头',
            '念叨', '快活', '忙活', '志气', '心思', '得罪', '张罗', '弟兄', '开通', '应酬', '庄稼',
            '干事', '帮手', '帐篷', '希罕', '师父', '师傅', '巴结', '巴掌', '差事', '工夫', '岁数',
            '屁股', '尾巴', '少爷', '小气', '小伙', '将就', '对头', '对付', '寡妇', '家伙', '客气',
            '实在', '官司', '学问', '学生', '字号', '嫁妆', '媳妇', '媒人', '婆家', '娘家', '委屈',
            '姑娘', '姐夫', '妯娌', '妥当', '妖精', '奴才', '女婿', '头发', '太阳', '大爷', '大方',
            '大意', '大夫', '多少', '多么', '外甥', '壮实', '地道', '地方', '在乎', '困难', '嘴巴',
            '嘱咐', '嘟囔', '嘀咕', '喜欢', '喇嘛', '喇叭', '商量', '唾沫', '哑巴', '哈欠', '哆嗦',
            '咳嗽', '和尚', '告诉', '告示', '含糊', '吓唬', '后头', '名字', '名堂', '合同', '吆喝',
            '叫唤', '口袋', '厚道', '厉害', '千斤', '包袱', '包涵', '匀称', '勤快', '动静', '动弹',
            '功夫', '力气', '前头', '刺猬', '刺激', '别扭', '利落', '利索', '利害', '分析', '出息',
            '凑合', '凉快', '冷战', '冤枉', '冒失', '养活', '关系', '先生', '兄弟', '便宜', '使唤',
            '佩服', '作坊', '体面', '位置', '似的', '伙计', '休息', '什么', '人家', '亲戚', '亲家',
            '交情', '云彩', '事情', '买卖', '主意', '丫头', '丧气', '两口', '东西', '东家', '世故',
            '不由', '不在', '下水', '下巴', '上头', '上司', '丈夫', '丈人', '一辈', '那个', '菩萨',
            '父亲', '母亲', '咕噜', '邋遢', '费用', '冤家', '甜头', '介绍', '荒唐', '大人', '泥鳅',
            '幸福', '熟悉', '计划', '扑腾', '蜡烛', '姥爷', '照顾', '喉咙', '吉他', '弄堂', '蚂蚱',
            '凤凰', '拖沓', '寒碜', '糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱',
            '扫把', '惦记'
        }
        # Words exempt from the reduplication and "们/子" suffix rules.
        self.must_not_neural_tone_words = {
            "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人", "虎虎"
        }
        # Punctuation after which "一" keeps its citation tone.
        self.punc = ":,;。?!“”‘’':,;.?!"

    @staticmethod
    def _with_tone(final: str, tone: str) -> str:
        """Return ``final`` with its last character replaced by ``tone``."""
        return final[:-1] + tone

    def _neural_sandhi(self, word: str, pos: str,
                       finals: List[str]) -> List[str]:
        """Apply the neutral-tone (tone 5) rules to ``word``.

        Args:
            word: The word being processed.
            pos: Part-of-speech tag of the word (jieba-style tags such as
                "n", "v", "a", "r" are what the rules compare against).
            finals: One final per character; may be modified in place.

        Returns:
            The finals after the neutral-tone rules have been applied.
        """
        # Reduplicated characters in nouns/verbs/adjectives (e.g. 奶奶, 试试):
        # the repeated character becomes neutral.  ``pos[:1]`` rather than
        # ``pos[0]`` so an empty tag simply disables the rule.
        if pos[:1] in {"n", "v", "a"} and \
                word not in self.must_not_neural_tone_words:
            for j in range(1, len(word)):
                if word[j] == word[j - 1]:
                    finals[j] = self._with_tone(finals[j], "5")

        ge_idx = word.find("个")
        if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
            # Sentence-final particles.
            finals[-1] = self._with_tone(finals[-1], "5")
        elif len(word) >= 1 and word[-1] in "的地得":
            # Structural particles.
            finals[-1] = self._with_tone(finals[-1], "5")
        elif len(word) > 1 and word[-1] in "们子" and pos in {
                "r", "n"
        } and word not in self.must_not_neural_tone_words:
            # Pronoun/noun suffixes 们 and 子.
            finals[-1] = self._with_tone(finals[-1], "5")
        elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}:
            # Locative endings, e.g. 桌上, 地下, 家里.
            finals[-1] = self._with_tone(finals[-1], "5")
        elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开":
            # Directional complements, e.g. 上来, 下去.
            finals[-1] = self._with_tone(finals[-1], "5")
        elif (ge_idx >= 1 and
              (word[ge_idx - 1].isnumeric() or
               word[ge_idx - 1] in "几有两半多各整每做是")) or word == '个':
            # "个" used as a measure word.
            finals[ge_idx] = self._with_tone(finals[ge_idx], "5")
        else:
            if word in self.must_neural_tone_words or word[
                    -2:] in self.must_neural_tone_words:
                finals[-1] = self._with_tone(finals[-1], "5")

        # Re-check the lexicon on each half of the word, so a listed word that
        # is only part of ``word`` still gets its neutral last syllable.
        word_list = self._split_word(word)
        head_len = len(word_list[0])
        finals_list = [finals[:head_len], finals[head_len:]]
        for i, sub_word in enumerate(word_list):
            if sub_word in self.must_neural_tone_words or sub_word[
                    -2:] in self.must_neural_tone_words:
                finals_list[i][-1] = self._with_tone(finals_list[i][-1], "5")
        return sum(finals_list, [])

    def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]:
        """Apply the "不" rules; ``finals`` is modified in place and returned.

        * "X不Y" (three characters, "不" in the middle, e.g. 看不懂): "不" is
          neutral.
        * Otherwise "不" directly before a fourth-tone syllable becomes second
          tone (e.g. 不怕).
        """
        if len(word) == 3 and word[1] == "不":
            finals[1] = self._with_tone(finals[1], "5")
        else:
            # word[:-1]: a trailing "不" has no following syllable to look at.
            for i, char in enumerate(word[:-1]):
                # endswith() instead of [-1] so an empty final cannot raise.
                if char == "不" and finals[i + 1].endswith("4"):
                    finals[i] = self._with_tone(finals[i], "2")
        return finals

    def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
        """Apply the "一" rules; ``finals`` is modified in place and returned.

        * "一" inside a purely numeric word: unchanged.
        * "X一X" (e.g. 看一看): "一" is neutral.
        * Word starting with "第一": "一" is first tone.
        * Otherwise "一" before a fourth tone becomes second tone (一段), and
          before anything else that is not punctuation becomes fourth tone
          (一天).
        """
        if "一" in word and all(
                char.isnumeric() for char in word if char != "一"):
            return finals

        if len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
            finals[1] = self._with_tone(finals[1], "5")
        elif word.startswith("第一"):
            finals[1] = self._with_tone(finals[1], "1")
        else:
            # word[:-1]: a trailing "一" has no following syllable to look at.
            for i, char in enumerate(word[:-1]):
                if char != "一":
                    continue
                if finals[i + 1].endswith("4"):
                    finals[i] = self._with_tone(finals[i], "2")
                elif word[i + 1] not in self.punc:
                    # Followed by punctuation, "一" keeps its original tone.
                    finals[i] = self._with_tone(finals[i], "4")
        return finals

    def _split_word(self, word: str) -> List[str]:
        """Split ``word`` into two parts around its shortest jieba sub-word.

        ``jieba.cut_for_search`` is used to enumerate sub-words; the shortest
        one (first on ties) is taken as one part and the rest of ``word`` as
        the other.  The returned list always has exactly two elements, in
        textual order; the second may be empty.

        NOTE(review): when the shortest sub-word is not a prefix it is assumed
        to be a suffix; a sub-word found in the middle of ``word`` is not
        handled specially.
        """
        pieces = list(jieba.cut_for_search(word))
        if not pieces:
            # Nothing to split (e.g. empty input): avoid indexing an empty list.
            return [word, ""]

        # min() returns the first shortest element, which is what a stable
        # ascending sort by length would put at index 0.
        first_subword = min(pieces, key=len)
        if word.find(first_subword) == 0:
            return [first_subword, word[len(first_subword):]]
        return [word[:-len(first_subword)], first_subword]

    def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
        """Apply third-tone sandhi (3 + 3 -> 2 + 3) to words of 2-4 characters.

        Two-character words are modified in place; for longer words a new
        list may be returned, so callers must use the return value.
        """
        if len(word) == 2:
            if self._all_tone_three(finals):
                finals[0] = self._with_tone(finals[0], "2")
        elif len(word) == 3:
            head_len = len(self._split_word(word)[0])
            if self._all_tone_three(finals):
                if head_len == 2:
                    # disyllabic + monosyllabic, e.g. 蒙古/包
                    finals[0] = self._with_tone(finals[0], "2")
                    finals[1] = self._with_tone(finals[1], "2")
                elif head_len == 1:
                    # monosyllabic + disyllabic, e.g. 纸/老虎
                    finals[1] = self._with_tone(finals[1], "2")
            else:
                head, tail = finals[:head_len], finals[head_len:]
                for i, sub in enumerate((head, tail)):
                    if self._all_tone_three(sub) and len(sub) == 2:
                        # e.g. 所有/人
                        sub[0] = self._with_tone(sub[0], "2")
                    elif (i == 1 and not self._all_tone_three(sub) and
                          sub[0].endswith("3") and head[-1].endswith("3")):
                        # Third tone across the split point, e.g. 好/喜欢.
                        head[-1] = self._with_tone(head[-1], "2")
                finals = head + tail
        elif len(word) == 4:
            # Treat a four-character word as two two-character halves.
            halves = [finals[:2], finals[2:]]
            for half in halves:
                if self._all_tone_three(half):
                    half[0] = self._with_tone(half[0], "2")
            finals = halves[0] + halves[1]

        return finals

    def _all_tone_three(self, finals: List[str]) -> bool:
        """Return True when every final ends in "3" (vacuously for ``[]``)."""
        # endswith() is False for an empty string where x[-1] would raise.
        return all(x.endswith("3") for x in finals)

    def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        """Glue a standalone "不" onto the word that follows it.

        The merged entry takes the POS of the following word.  A "不" with
        nothing after it is kept as its own entry tagged ``'d'``.
        """
        new_seg = []
        last_word = ""
        for word, pos in seg:
            if last_word == "不":
                word = last_word + word
            if word != "不":
                new_seg.append((word, pos))
            last_word = word
        if last_word == "不":
            new_seg.append((last_word, 'd'))
        return new_seg

    def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        """Merge "一" with its neighbours.

        Pass 1 turns ``V 一 V`` (same word on both sides, first one tagged
        ``"v"``) into the single entry ``V一V``.  Pass 2 glues any remaining
        standalone "一" onto the word after it.

        The merge always targets the most recently emitted entry and the
        second verb is skipped exactly when a merge happened.  Consequently
        several ``V 一 V`` groups in one segmentation work, and no word is
        duplicated or dropped when the two verbs carry different POS tags.

        Returns:
            A list of ``[word, pos]`` lists (mutable pairs, not tuples).
        """
        new_seg = []
        skip_next = False
        last_index = len(seg) - 1
        for i, (word, pos) in enumerate(seg):
            if skip_next:
                # Second verb of a V一V group that was already merged.
                skip_next = False
                continue
            is_between_same_verb = (
                word == "一" and 0 < i < last_index and
                seg[i - 1][0] == seg[i + 1][0] and seg[i - 1][1] == "v" and
                # The previous word must still be its own entry; it is not
                # when it was itself consumed by an earlier merge.
                bool(new_seg) and new_seg[-1][0] == seg[i - 1][0])
            if is_between_same_verb:
                new_seg[-1][0] = new_seg[-1][0] + "一" + new_seg[-1][0]
                skip_next = True
            else:
                new_seg.append([word, pos])

        merged = []
        for word, pos in new_seg:
            if merged and merged[-1][0] == "一":
                merged[-1][0] = merged[-1][0] + word
            else:
                merged.append([word, pos])
        return merged

    def _merge_adjacent_third_tones(self, seg, should_merge):
        """Shared loop of the two continuous-third-tone merges.

        Args:
            seg: Sequence of ``(word, pos)`` pairs.
            should_merge: Callable ``(prev_finals, cur_finals) -> bool`` that
                decides, from the finals of two neighbouring words, whether
                they are candidates for merging.

        Returns:
            A list of ``[word, pos]`` lists.  A word is appended to its
            predecessor when ``should_merge`` agrees, the predecessor was not
            itself just merged, the predecessor is not a reduplication (that
            case is left for ``_neural_sandhi``), and the two words together
            are at most three characters long.
        """
        sub_finals_list = [
            lazy_pinyin(
                word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
            for (word, pos) in seg
        ]
        new_seg = []
        merge_last = [False] * len(seg)
        for i, (word, pos) in enumerate(seg):
            if (i > 0 and not merge_last[i - 1] and
                    should_merge(sub_finals_list[i - 1], sub_finals_list[i]) and
                    not self._is_reduplication(seg[i - 1][0]) and
                    len(seg[i - 1][0]) + len(word) <= 3):
                new_seg[-1][0] = new_seg[-1][0] + word
                merge_last[i] = True
            else:
                new_seg.append([word, pos])
        return new_seg

    def _merge_continuous_three_tones(
            self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        """Merge neighbouring words whose syllables are all third tone."""
        return self._merge_adjacent_third_tones(
            seg, lambda prev, cur: self._all_tone_three(prev) and
            self._all_tone_three(cur))

    def _is_reduplication(self, word: str) -> bool:
        """Return True for a two-character word made of one repeated character."""
        return len(word) == 2 and word[0] == word[1]

    def _merge_continuous_three_tones_2(
            self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        """Merge neighbours where a third tone ends one word and starts the next."""
        # bool(prev)/bool(cur) guard against words that produced no finals.
        return self._merge_adjacent_third_tones(
            seg, lambda prev, cur: bool(prev) and bool(cur) and
            prev[-1].endswith("3") and cur[0].endswith("3"))

    def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        """Append a standalone "儿" to the preceding word, unless that word is "#"."""
        new_seg = []
        for i, (word, pos) in enumerate(seg):
            if i > 0 and word == "儿" and seg[i - 1][0] != "#":
                new_seg[-1][0] = new_seg[-1][0] + word
            else:
                new_seg.append([word, pos])
        return new_seg

    def _merge_reduplication(
            self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        """Concatenate an entry onto the previous one when both words are equal."""
        new_seg = []
        for word, pos in seg:
            if new_seg and word == new_seg[-1][0]:
                new_seg[-1][0] = new_seg[-1][0] + word
            else:
                new_seg.append([word, pos])
        return new_seg

    def pre_merge_for_modify(
            self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        """Run every merge pass, in order, before per-word tone modification.

        Order: "不", "一", reduplication, both continuous-third-tone merges,
        then "儿".  The "一" merge is best effort: if it fails, the
        segmentation is passed on unchanged to the remaining passes.
        """
        seg = self._merge_bu(seg)
        try:
            seg = self._merge_yi(seg)
        except Exception:
            # Exception, not a bare except, so KeyboardInterrupt/SystemExit
            # still propagate.
            print("_merge_yi failed")
        seg = self._merge_reduplication(seg)
        seg = self._merge_continuous_three_tones(seg)
        seg = self._merge_continuous_three_tones_2(seg)
        seg = self._merge_er(seg)
        return seg

    def modified_tone(self, word: str, pos: str,
                      finals: List[str]) -> List[str]:
        """Apply all sandhi rules to one word and return its updated finals.

        The rules run in a fixed order: "不", "一", neutral tone, third tone.
        ``finals`` may be modified in place; always use the return value.
        """
        finals = self._bu_sandhi(word, finals)
        finals = self._yi_sandhi(word, finals)
        finals = self._neural_sandhi(word, pos, finals)
        finals = self._three_sandhi(word, finals)
        return finals
|
|