import json
import os
import re
import sys
from typing import List, Tuple

import tqdm

from diffrhythm.g2p.g2p import PhonemeBpeTokenizer
from diffrhythm.g2p.utils.g2p import phonemizer_g2p


def ph_g2p(text, language):
    # Phonemize a single-language string with the phonemizer backend.
    return phonemizer_g2p(text=text, language=language)


def g2p(text, sentence, language):
    # Convert one segment into (phoneme string, token ids); `sentence` is the
    # full line, giving the BPE tokenizer its surrounding context.
    return text_tokenizer.tokenize(text=text, sentence=sentence, language=language)


def is_chinese(char):
    # CJK Unified Ideographs block (common Chinese characters).
    return "\u4e00" <= char <= "\u9fa5"


def is_alphabet(char):
    # ASCII letters A-Z / a-z.
    return "\u0041" <= char <= "\u005a" or "\u0061" <= char <= "\u007a"


def is_other(char):
    # Anything that is neither Chinese nor an ASCII letter
    # (punctuation, digits, whitespace, other scripts).
    return not (is_chinese(char) or is_alphabet(char))


def get_segment(text: str) -> List[Tuple[str, str]]:
    # Split mixed-language text into (segment, language) runs, where the
    # language tag is "zh", "en", or "other". "other" characters (punctuation,
    # digits, spaces, non-zh/en scripts) are attached to the current run.
    segments = []
    types = []
    temp_seg = ""
    temp_lang = ""

    for ch in text:
        if is_chinese(ch):
            types.append("zh")
        elif is_alphabet(ch):
            types.append("en")
        else:
            types.append("other")

    assert len(types) == len(text)

    for i in range(len(types)):
        if i == 0:
            # The first character opens the initial run.
            temp_seg += text[i]
            temp_lang = types[i]
        elif temp_lang == "other":
            # A leading run of "other" characters adopts the language of the
            # first zh/en character that follows it.
            temp_seg += text[i]
            if types[i] != temp_lang:
                temp_lang = types[i]
        elif types[i] == temp_lang or types[i] == "other":
            # Same language as the current run, or punctuation glued onto it.
            temp_seg += text[i]
        else:
            # Language switch: close the current run and start a new one.
            segments.append((temp_seg, temp_lang))
            temp_seg = text[i]
            temp_lang = types[i]

    segments.append((temp_seg, temp_lang))
    return segments
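
# A traced example of the run-merging above: get_segment("你好,hello world")
# returns [("你好,", "zh"), ("hello world", "en")]. The comma attaches to the
# preceding Chinese run, and the space stays inside the English run.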


def chn_eng_g2p(text: str):
    # Phonemize mixed Chinese/English text segment by segment. Per-segment
    # phoneme strings are joined with "|"; token id lists are concatenated.
    segments = get_segment(text)
    all_phoneme = ""
    all_tokens = []

    for index, seg in enumerate(segments):
        phoneme, token = g2p(seg[0], text, seg[1])
        all_phoneme += phoneme + "|"
        all_tokens += token

        # If the final segment is English, strip a trailing "_" left by the
        # tokenizer (it sits just before the "|" delimiter), plus its token.
        if seg[1] == "en" and index == len(segments) - 1 and all_phoneme[-2] == "_":
            all_phoneme = all_phoneme[:-2]
            all_tokens = all_tokens[:-1]
    return all_phoneme, all_tokens
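
# For example, chn_eng_g2p("你好,hello world") phonemizes "你好," as Chinese
# and "hello world" as English, and returns the two phoneme strings joined by
# "|" together with one flat list of BPE token ids.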


text_tokenizer = PhonemeBpeTokenizer()

# Load the phoneme BPE vocabulary shipped with the g2p module.
with open("./diffrhythm/g2p/g2p/vocab.json", "r") as f:
    data = json.load(f)
vocab = data["vocab"]


if __name__ == "__main__":
    # Smoke test on mixed-script input. Korean Hangul falls into the "other"
    # category, while the Japanese kanji land in the CJK range and are
    # treated as Chinese.
    phone, token = chn_eng_g2p("你好,hello world")
    phone, token = chn_eng_g2p("你好,hello world, Bonjour, 테스트 해 보겠습니다, 五月雨緑")
    print(phone)
    print(token)

    # Tokenize a single character directly, letting the tokenizer detect
    # the language.
    phone, token = text_tokenizer.tokenize("緑", "", "auto")
    print(phone)
    print(token)