# Korean text-to-phoneme front end.
# Adapted from the Japanese module, which converted text to phonemes
# compatible with Julius: https://github.com/julius-speech/segmentation-kit
import re
import unicodedata
from transformers import AutoTokenizer
from . import punctuation, symbols
from num2words import num2words
from melo.text.ko_dictionary import english_dictionary, etc_dictionary
from anyascii import anyascii
from jamo import hangul_to_jamo
def normalize(text):
text = text.strip()
text = re.sub("[โบ-โบโบ-โปณโผ-โฟใ
ใใก-ใฉใธ-ใบใปใ-ไถตไธ-้ฟ่ฑ-้ถดไพฎ-้ ปไธฆ-้พ]", "", text)
text = normalize_with_dictionary(text, etc_dictionary)
text = normalize_english(text)
text = text.lower()
return text
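# Pipeline sketch for normalize() (illustrative, not from the original source):
# strip whitespace -> drop stray CJK ideographs -> apply etc_dictionary
# replacements -> replace known English words -> lowercase.
# With a hypothetical etc_dictionary entry {"5G": "파이브지"}:
#   normalize("5G 시대")  ->  "파이브지 시대"
# The actual replacements depend on the contents of melo.text.ko_dictionary.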
def normalize_with_dictionary(text, dic):
if any(key in text for key in dic.keys()):
pattern = re.compile("|".join(re.escape(key) for key in dic.keys()))
return pattern.sub(lambda x: dic[x.group()], text)
return text
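# Minimal usage sketch with a hypothetical dictionary (not the real etc_dictionary):
#   normalize_with_dictionary("1시 30분", {"1시": "한시", "30분": "삼십분"})
#   -> "한시 삼십분"
# The alternation pattern is built from the dictionary keys (re.escape'd, so keys
# containing punctuation are safe) and each match is replaced by its dic value.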
def normalize_english(text):
def fn(m):
word = m.group()
if word in english_dictionary:
return english_dictionary.get(word)
return word
text = re.sub("([A-Za-z]+)", fn, text)
return text
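# Behaviour sketch: runs of Latin letters are looked up in english_dictionary and
# replaced by their mapped spelling (typically a Hangul transliteration); unknown
# words pass through unchanged. With a hypothetical entry {"TV": "티비"}:
#   normalize_english("TV 시청")  ->  "티비 시청"
#   normalize_english("XYZ 시청") ->  "XYZ 시청"   # no entry, left as-is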
g2p_kr = None
def korean_text_to_phonemes(text, character: str = "hangeul") -> str:
"""
    The input and output look the same, but they differ in Unicode: the input is
    composed Hangul syllables, while the output is the decomposed jamo sequence.
    If character == "english", the result is romanized to ASCII with anyascii instead.

    example:
        input  = '하늘' (Unicode: \ud558\ub298), (하 + 늘)
        output = '하늘' (Unicode: \u1112\u1161\u1102\u1173\u11af), (ᄒ + ᅡ + ᄂ + ᅳ + ᆯ)
"""
global g2p_kr # pylint: disable=global-statement
if g2p_kr is None:
from g2pkk import G2p
g2p_kr = G2p()
if character == "english":
from anyascii import anyascii
text = normalize(text)
text = g2p_kr(text)
text = anyascii(text)
return text
text = normalize(text)
text = g2p_kr(text)
    text = list(hangul_to_jamo(text))  # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ']
return "".join(text)
def text_normalize(text):
# res = unicodedata.normalize("NFKC", text)
# res = japanese_convert_numbers_to_words(res)
# # res = "".join([i for i in res if is_japanese_character(i)])
# res = replace_punctuation(res)
text = normalize(text)
return text
def distribute_phone(n_phone, n_word):
phones_per_word = [0] * n_word
for task in range(n_phone):
min_tasks = min(phones_per_word)
min_index = phones_per_word.index(min_tasks)
phones_per_word[min_index] += 1
return phones_per_word
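# distribute_phone() spreads n_phone phonemes over n_word sub-tokens as evenly as
# possible, always topping up the position with the fewest phonemes so far:
#   distribute_phone(5, 2) -> [3, 2]
#   distribute_phone(3, 3) -> [1, 1, 1]
# The result always sums to n_phone and is used in g2p() below to build word2ph.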
# tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v3')
model_id = 'kykim/bert-kor-base'
tokenizer = AutoTokenizer.from_pretrained(model_id)
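# The Korean BERT tokenizer is a WordPiece model: continuation pieces carry a
# leading "##", which g2p() below uses to regroup sub-tokens into words.
# Illustrative only -- the actual split depends on the kykim/bert-kor-base vocab:
#   tokenizer.tokenize("안녕하세요")  ->  e.g. ['안녕', '##하세요']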
def g2p(norm_text):
tokenized = tokenizer.tokenize(norm_text)
phs = []
ph_groups = []
for t in tokenized:
if not t.startswith("#"):
ph_groups.append([t])
else:
ph_groups[-1].append(t.replace("#", ""))
word2ph = []
for group in ph_groups:
text = ""
for ch in group:
text += ch
if text == '[UNK]':
phs += ['_']
word2ph += [1]
continue
elif text in punctuation:
phs += [text]
word2ph += [1]
continue
# import pdb; pdb.set_trace()
# phonemes = japanese_text_to_phonemes(text)
# text = g2p_kr(text)
phonemes = korean_text_to_phonemes(text)
# import pdb; pdb.set_trace()
# # phonemes = [i for i in phonemes if i in symbols]
# for i in phonemes:
# assert i in symbols, (group, norm_text, tokenized, i)
phone_len = len(phonemes)
word_len = len(group)
aaa = distribute_phone(phone_len, word_len)
assert len(aaa) == word_len
word2ph += aaa
phs += phonemes
phones = ["_"] + phs + ["_"]
tones = [0 for i in phones]
word2ph = [1] + word2ph + [1]
assert len(word2ph) == len(tokenized) + 2
return phones, tones, word2ph
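# Output contract of g2p(), as implemented above (with "_" padding at both ends):
#   len(phones) == sum(word2ph)
#   len(tones)  == len(phones)          # all zeros; Korean is treated as tone-less
#   len(word2ph) == len(tokenizer.tokenize(norm_text)) + 2
# Hedged usage sketch:
#   phones, tones, word2ph = g2p(text_normalize("안녕하세요"))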
def get_bert_feature(text, word2ph, device='cuda'):
from . import japanese_bert
return japanese_bert.get_bert_feature(text, word2ph, device=device, model_id=model_id)
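# get_bert_feature() reuses the generic BERT wrapper from the Japanese module but
# loads the Korean model set in model_id above. Hedged usage sketch (the exact
# shape of the returned features is defined by japanese_bert.get_bert_feature):
#   norm = text_normalize("안녕하세요")
#   phones, tones, word2ph = g2p(norm)
#   feats = get_bert_feature(norm, word2ph, device="cpu")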
if __name__ == "__main__":
# tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
from text.symbols import symbols
text = "์ ์ ์ผ์ ๊ฐ์น์ ํฐํ์ธ ๋์ค๋ค์ด ํ ์ผ์ ์๋ฏธ๋ฅผ ์ ์๋๋ค. ์์ผ๋ก๋ ์ ์ ์ผ์ ์๋ถ์ฌ์ ๊ฐ๊ณ ์ด์๊ฐ ๊ฒ๋๋ค"
import json
# genshin_data = json.load(open('/data/zwl/workspace/StarRail_Datasets/Index & Scripts/Index/1.3/Korean.json'))
genshin_data = json.load(open('/data/zwl/workspace/Genshin_Datasets/Index & Script/AI Hobbyist Version/Index/4.1/KR_output.json'))
from tqdm import tqdm
new_symbols = []
for key, item in tqdm(genshin_data.items()):
texts = item.get('voiceContent', '')
if isinstance(texts, list):
texts = ','.join(texts)
if texts is None:
continue
if len(texts) == 0:
continue
        text = text_normalize(texts)
phones, tones, word2ph = g2p(text)
bert = get_bert_feature(text, word2ph)
        # import pdb; pdb.set_trace()
for ph in phones:
if ph not in symbols and ph not in new_symbols:
new_symbols.append(ph)
print('update!, now symbols:')
print(new_symbols)
with open('korean_symbol.txt', 'w') as f:
f.write(f'{new_symbols}')
# if __name__ == '__main__':
# from pykakasi import kakasi
# # Initialize kakasi object
# kakasi = kakasi()
# # Set options for converting kanji and katakana to hiragana
# kakasi.setMode("J", "H") # Kanji to Hiragana
# kakasi.setMode("K", "H") # Katakana to Hiragana
# # Convert the Japanese text to hiragana
# conv = kakasi.getConverter()
# katakana_text = conv.do('ใใใๅใฏใใใชใจ็ณใใพใใใใกใใฎๅฐใใใใในใฏๆๅญใใๆจๆถใ้ใใฆใใพใใใฟใพใใใใใชใใฎๅใฏ?') # Replace with your Japanese text
# print(katakana_text) # Output: ใใผใใชใปใซใค