# -*- coding: utf-8 -*-
# Copyright 2020 TensorFlowTTS Team, Jaehyoung Kim(@crux153) and Taehoon Kim(@carpedm20)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow
"""Korean related helpers."""
import ast
import json
import os
import re
from jamo import h2j, hangul_to_jamo, j2h, j2hcj
etc_dictionary = {
"2 30๋Œ€": "์ด์‚ผ์‹ญ๋Œ€",
"20~30๋Œ€": "์ด์‚ผ์‹ญ๋Œ€",
"20, 30๋Œ€": "์ด์‹ญ๋Œ€ ์‚ผ์‹ญ๋Œ€",
"1+1": "์›ํ”Œ๋Ÿฌ์Šค์›",
"3์—์„œ 6๊ฐœ์›”์ธ": "3๊ฐœ์›”์—์„œ ์œก๊ฐœ์›”์ธ",
}
english_dictionary = {
"Devsisters": "๋ฐ๋ธŒ์‹œ์Šคํ„ฐ์ฆˆ",
"track": "ํŠธ๋ž™",
# krbook
"LA": "์—˜์—์ด",
"LG": "์—˜์ง€",
"KOREA": "์ฝ”๋ฆฌ์•„",
"JSA": "์ œ์ด์—์Šค์—์ด",
"PGA": "ํ”ผ์ง€์—์ด",
"GA": "์ง€์—์ด",
"idol": "์•„์ด๋Œ",
"KTX": "์ผ€์ดํ‹ฐ์—‘์Šค",
"AC": "์—์ด์”จ",
"DVD": "๋””๋น„๋””",
"US": "์œ ์—์Šค",
"CNN": "์”จ์—”์—”",
"LPGA": "์—˜ํ”ผ์ง€์—์ด",
"P": "ํ”ผ",
"L": "์—˜",
"T": "ํ‹ฐ",
"B": "๋น„",
"C": "์”จ",
"BIFF": "๋น„์•„์ด์—ํ”„์—ํ”„",
"GV": "์ง€๋น„",
# JTBC
"IT": "์•„์ดํ‹ฐ",
"IQ": "์•„์ดํ",
"JTBC": "์ œ์ดํ‹ฐ๋น„์”จ",
"trickle down effect": "ํŠธ๋ฆฌํด ๋‹ค์šด ์ดํŽ™ํŠธ",
"trickle up effect": "ํŠธ๋ฆฌํด ์—… ์ดํŽ™ํŠธ",
"down": "๋‹ค์šด",
"up": "์—…",
"FCK": "์—ํ”„์”จ์ผ€์ด",
"AP": "์—์ดํ”ผ",
"WHERETHEWILDTHINGSARE": "",
"Rashomon Effect": "",
"O": "์˜ค",
"OO": "์˜ค์˜ค",
"B": "๋น„",
"GDP": "์ง€๋””ํ”ผ",
"CIPA": "์”จ์•„์ดํ”ผ์—์ด",
"YS": "์™€์ด์—์Šค",
"Y": "์™€์ด",
"S": "์—์Šค",
"JTBC": "์ œ์ดํ‹ฐ๋น„์”จ",
"PC": "ํ”ผ์”จ",
"bill": "๋นŒ",
"Halmuny": "ํ•˜๋ชจ๋‹ˆ", #####
"X": "์—‘์Šค",
"SNS": "์—์Šค์—”์—์Šค",
"ability": "์–ด๋นŒ๋ฆฌํ‹ฐ",
"shy": "",
"CCTV": "์”จ์”จํ‹ฐ๋น„",
"IT": "์•„์ดํ‹ฐ",
"the tenth man": "๋” ํ…์“ฐ ๋งจ", ####
"L": "์—˜",
"PC": "ํ”ผ์”จ",
"YSDJJPMB": "", ########
"Content Attitude Timing": "์ปจํ…ํŠธ ์• ํ‹ฐํŠœ๋“œ ํƒ€์ด๋ฐ",
"CAT": "์บฃ",
"IS": "์•„์ด์—์Šค",
"K": "์ผ€์ด",
"Y": "์™€์ด",
"KDI": "์ผ€์ด๋””์•„์ด",
"DOC": "๋””์˜ค์”จ",
"CIA": "์”จ์•„์ด์—์ด",
"PBS": "ํ”ผ๋น„์—์Šค",
"D": "๋””",
"PPropertyPositionPowerPrisonP" "S": "์—์Šค",
"francisco": "ํ”„๋ž€์‹œ์Šค์ฝ”",
"I": "์•„์ด",
"III": "์•„์ด์•„์ด", ######
"No joke": "๋…ธ ์กฐํฌ",
"BBK": "๋น„๋น„์ผ€์ด",
"LA": "์—˜์—์ด",
"Don": "",
"t worry be happy": " ์›Œ๋ฆฌ ๋น„ ํ•ดํ”ผ",
"NO": "์—”์˜ค", #####
"it was our sky": "์ž‡ ์›Œ์ฆˆ ์•„์›Œ ์Šค์นด์ด",
"it is our sky": "์ž‡ ์ด์ฆˆ ์•„์›Œ ์Šค์นด์ด", ####
"NEIS": "์—”์ด์•„์ด์—์Šค", #####
"IMF": "์•„์ด์— ์—ํ”„",
"apology": "์–ดํด๋กœ์ง€",
"humble": "ํ—˜๋ธ”",
"M": "์— ",
"Nowhere Man": "๋…ธ์›จ์–ด ๋งจ",
"The Tenth Man": "๋” ํ…์“ฐ ๋งจ",
"PBS": "ํ”ผ๋น„์—์Šค",
"BBC": "๋น„๋น„์”จ",
"MRJ": "์— ์•Œ์ œ์ด",
"CCTV": "์”จ์”จํ‹ฐ๋น„",
"Pick me up": "ํ”ฝ ๋ฏธ ์—…",
"DNA": "๋””์—”์—์ด",
"UN": "์œ ์—”",
"STOP": "์Šคํƒ‘", #####
"PRESS": "ํ”„๋ ˆ์Šค", #####
"not to be": "๋‚ซ ํˆฌ๋น„",
"Denial": "๋””๋‚˜์ด์–ผ",
"G": "์ง€",
"IMF": "์•„์ด์— ์—ํ”„",
"GDP": "์ง€๋””ํ”ผ",
"JTBC": "์ œ์ดํ‹ฐ๋น„์”จ",
"Time flies like an arrow": "ํƒ€์ž„ ํ”Œ๋ผ์ด์ฆˆ ๋ผ์ดํฌ ์–ธ ์• ๋กœ์šฐ",
"DDT": "๋””๋””ํ‹ฐ",
"AI": "์—์ด์•„์ด",
"Z": "์ œํŠธ",
"OECD": "์˜ค์ด์”จ๋””",
"N": "์•ค",
"A": "์—์ด",
"MB": "์— ๋น„",
"EH": "์ด์—์ด์น˜",
"IS": "์•„์ด์—์Šค",
"TV": "ํ‹ฐ๋น„",
"MIT": "์— ์•„์ดํ‹ฐ",
"KBO": "์ผ€์ด๋น„์˜ค",
"I love America": "์•„์ด ๋Ÿฌ๋ธŒ ์•„๋ฉ”๋ฆฌ์นด",
"SF": "์—์Šค์—ํ”„",
"Q": "ํ",
"KFX": "์ผ€์ด์—ํ”„์—‘์Šค",
"PM": "ํ”ผ์— ",
"Prime Minister": "ํ”„๋ผ์ž„ ๋ฏธ๋‹ˆ์Šคํ„ฐ",
"Swordline": "์Šค์›Œ๋“œ๋ผ์ธ",
"TBS": "ํ‹ฐ๋น„์—์Šค",
"DDT": "๋””๋””ํ‹ฐ",
"CS": "์”จ์—์Šค",
"Reflecting Absence": "๋ฆฌํ”Œ๋ ‰ํŒ… ์•ฑ์„ผ์Šค",
"PBS": "ํ”ผ๋น„์—์Šค",
"Drum being beaten by everyone": "๋“œ๋Ÿผ ๋น™ ๋น„ํŠผ ๋ฐ”์ด ์—๋ธŒ๋ฆฌ์›",
"negative pressure": "๋„ค๊ฑฐํ‹ฐ๋ธŒ ํ”„๋ ˆ์…”",
"F": "์—ํ”„",
"KIA": "๊ธฐ์•„",
"FTA": "์—ํ”„ํ‹ฐ์—์ด",
"Que sais-je": "",
"UFC": "์œ ์—ํ”„์”จ",
"P": "ํ”ผ",
"DJ": "๋””์ œ์ด",
"Chaebol": "์ฑ„๋ฒŒ",
"BBC": "๋น„๋น„์”จ",
"OECD": "์˜ค์ด์”จ๋””",
"BC": "์‚์”จ",
"C": "์”จ",
"B": "์”จ",
"KY": "์ผ€์ด์™€์ด",
"K": "์ผ€์ด",
"CEO": "์”จ์ด์˜ค",
"YH": "์™€์ด์—์น˜",
"IS": "์•„์ด์—์Šค",
"who are you": "ํ›„ ์–ผ ์œ ",
"Y": "์™€์ด",
"The Devils Advocate": "๋” ๋ฐ๋นŒ์ฆˆ ์–ด๋“œ๋ณด์นดํŠธ",
"YS": "์™€์ด์—์Šค",
"so sorry": "์˜ ์˜๋ฆฌ",
"Santa": "์‚ฐํƒ€",
"Big Endian": "๋น… ์—”๋””์•ˆ",
"Small Endian": "์Šค๋ชฐ ์—”๋””์•ˆ",
"Oh Captain My Captain": "์˜ค ์บกํ‹ด ๋งˆ์ด ์บกํ‹ด",
"AIB": "์—์ด์•„์ด๋น„",
"K": "์ผ€์ด",
"PBS": "ํ”ผ๋น„์—์Šค",
# IU
"ASMR": "์—์ด์—์Šค์— ์•Œ",
"V": "๋ธŒ์ด",
"PD": "ํ”ผ๋””",
"CD": "์”จ๋””",
"ANR": "์—์ด์—”์•Œ",
"Twenty Three": "ํˆฌ์—”ํ‹ฐ ์“ฐ๋ฆฌ",
"Through The Night": "์“ฐ๋ฃจ ๋” ๋‚˜์ž‡",
"MD": "์— ๋””",
}
num_to_kor = {
"0": "์˜",
"1": "์ผ",
"2": "์ด",
"3": "์‚ผ",
"4": "์‚ฌ",
"5": "์˜ค",
"6": "์œก",
"7": "์น ",
"8": "ํŒ”",
"9": "๊ตฌ",
}
unit_to_kor1 = {"%": "ํผ์„ผํŠธ", "cm": "์„ผ์น˜๋ฏธํ„ฐ", "mm": "๋ฐ€๋ฆฌ๋ฏธํ„ฐ", "km": "ํ‚ฌ๋กœ๋ฏธํ„ฐ", "kg": "ํ‚ฌ๋กœ๊ทธ๋žŒ"}
unit_to_kor2 = {"m": "๋ฏธํ„ฐ"}
upper_to_kor = {
"A": "์—์ด",
"B": "๋น„",
"C": "์”จ",
"D": "๋””",
"E": "์ด",
"F": "์—ํ”„",
"G": "์ง€",
"H": "์—์ด์น˜",
"I": "์•„์ด",
"J": "์ œ์ด",
"K": "์ผ€์ด",
"L": "์—˜",
"M": "์— ",
"N": "์—”",
"O": "์˜ค",
"P": "ํ”ผ",
"Q": "ํ",
"R": "์•Œ",
"S": "์—์Šค",
"T": "ํ‹ฐ",
"U": "์œ ",
"V": "๋ธŒ์ด",
"W": "๋”๋ธ”์œ ",
"X": "์—‘์Šค",
"Y": "์™€์ด",
"Z": "์ง€",
}
"""
์ดˆ์„ฑ๊ณผ ์ข…์„ฑ์€ ๊ฐ™์•„๋ณด์ด์ง€๋งŒ, ๋‹ค๋ฅธ character์ด๋‹ค.
'_-!'(),-.:;? แ„€แ„แ„‚แ„ƒแ„„แ„…แ„†แ„‡แ„ˆแ„‰แ„Šแ„‹แ„Œแ„แ„Žแ„แ„แ„‘แ„’แ…กแ…ขแ…ฃแ…คแ…ฅแ…ฆแ…งแ…จแ…ฉแ…ชแ…ซแ…ฌแ…ญแ…ฎแ…ฏแ…ฐแ…ฑแ…ฒแ…ณแ…ดแ…ตแ†จแ†ฉแ†ชแ†ซแ†ฌแ†ญแ†ฎแ†ฏแ†ฐแ†ฑแ†ฒแ†ณแ†ดแ†ตแ†ถแ†ทแ†ธแ†นแ†บแ†ปแ†ผแ†ฝแ†พแ†ฟแ‡€แ‡แ‡‚~'
'_': 0, '-': 7, '!': 2, "'": 3, '(': 4, ')': 5, ',': 6, '.': 8, ':': 9, ';': 10,
'?': 11, ' ': 12, 'แ„€': 13, 'แ„': 14, 'แ„‚': 15, 'แ„ƒ': 16, 'แ„„': 17, 'แ„…': 18, 'แ„†': 19, 'แ„‡': 20,
'แ„ˆ': 21, 'แ„‰': 22, 'แ„Š': 23, 'แ„‹': 24, 'แ„Œ': 25, 'แ„': 26, 'แ„Ž': 27, 'แ„': 28, 'แ„': 29, 'แ„‘': 30,
'แ„’': 31, 'แ…ก': 32, 'แ…ข': 33, 'แ…ฃ': 34, 'แ…ค': 35, 'แ…ฅ': 36, 'แ…ฆ': 37, 'แ…ง': 38, 'แ…จ': 39, 'แ…ฉ': 40,
'แ…ช': 41, 'แ…ซ': 42, 'แ…ฌ': 43, 'แ…ญ': 44, 'แ…ฎ': 45, 'แ…ฏ': 46, 'แ…ฐ': 47, 'แ…ฑ': 48, 'แ…ฒ': 49, 'แ…ณ': 50,
'แ…ด': 51, 'แ…ต': 52, 'แ†จ': 53, 'แ†ฉ': 54, 'แ†ช': 55, 'แ†ซ': 56, 'แ†ฌ': 57, 'แ†ญ': 58, 'แ†ฎ': 59, 'แ†ฏ': 60,
'แ†ฐ': 61, 'แ†ฑ': 62, 'แ†ฒ': 63, 'แ†ณ': 64, 'แ†ด': 65, 'แ†ต': 66, 'แ†ถ': 67, 'แ†ท': 68, 'แ†ธ': 69, 'แ†น': 70,
'แ†บ': 71, 'แ†ป': 72, 'แ†ผ': 73, 'แ†ฝ': 74, 'แ†พ': 75, 'แ†ฟ': 76, 'แ‡€': 77, 'แ‡': 78, 'แ‡‚': 79, '~': 80
"""
_pad = "pad"
_eos = "eos"
_punctuation = "!'(),-.:;? "
_special = "-"
_jamo_leads = [chr(_) for _ in range(0x1100, 0x1113)]
_jamo_vowels = [chr(_) for _ in range(0x1161, 0x1176)]
_jamo_tails = [chr(_) for _ in range(0x11A8, 0x11C3)]
_letters = _jamo_leads + _jamo_vowels + _jamo_tails
symbols = [_pad] + list(_special) + list(_punctuation) + _letters + [_eos]
_symbol_to_id = {c: i for i, c in enumerate(symbols)}
_id_to_symbol = {i: c for i, c in enumerate(symbols)}
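# Note: "-" is in both _special and _punctuation, so it appears twice in
# `symbols`; the comprehension above keeps the later index in _symbol_to_id,
# which matches the id table in the docstring ('-': 7).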
quote_checker = """([`"'๏ผ‚โ€œโ€˜])(.+?)([`"'๏ผ‚โ€โ€™])"""
def is_lead(char):
return char in _jamo_leads
def is_vowel(char):
return char in _jamo_vowels
def is_tail(char):
return char in _jamo_tails
def get_mode(char):
if is_lead(char):
return 0
elif is_vowel(char):
return 1
elif is_tail(char):
return 2
else:
return -1
def _get_text_from_candidates(candidates):
if len(candidates) == 0:
return ""
elif len(candidates) == 1:
        return j2hcj(candidates[0])  # j2hcj returns a str; jamo_to_hcj yields a generator
else:
return j2h(**dict(zip(["lead", "vowel", "tail"], candidates)))
def jamo_to_korean(text):
text = h2j(text)
idx = 0
new_text = ""
candidates = []
while True:
if idx >= len(text):
new_text += _get_text_from_candidates(candidates)
break
char = text[idx]
mode = get_mode(char)
if mode == 0:
new_text += _get_text_from_candidates(candidates)
candidates = [char]
elif mode == -1:
new_text += _get_text_from_candidates(candidates)
new_text += char
candidates = []
else:
candidates.append(char)
idx += 1
return new_text
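# Usage sketch: jamo_to_korean recomposes decomposed jamo into syllables and
# passes non-jamo characters (spaces, punctuation) through, e.g.
#   jamo_to_korean("แ„’แ…กแ†ซ แ„€แ…ณแ†ฏ!")  # -> "ํ•œ ๊ธ€!"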
def compare_sentence_with_jamo(text1, text2):
    # True if the two sentences differ at the jamo level.
    return h2j(text1) != h2j(text2)
def tokenize(text, as_id=False):
    # Split a Hangul string into lead/vowel/tail jamo with hangul_to_jamo from the jamo package.
text = normalize(text)
    tokens = list(
        hangul_to_jamo(text)
    )  # '์กด๊ฒฝํ•˜๋Š”' --> ['แ„Œ', 'แ…ฉ', 'แ†ซ', 'แ„€', 'แ…ง', 'แ†ผ', 'แ„’', 'แ…ก', 'แ„‚', 'แ…ณ', 'แ†ซ']
    if as_id:
        return [_symbol_to_id[token] for token in tokens]
    else:
        return tokens
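# e.g. tokenize("์•ˆ๋…•") -> ['แ„‹', 'แ…ก', 'แ†ซ', 'แ„‚', 'แ…ง', 'แ†ผ'];
# tokenize("์•ˆ๋…•", as_id=True) returns the matching _symbol_to_id indices.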
def tokenizer_fn(iterator):
return (token for x in iterator for token in tokenize(x, as_id=False))
def normalize(text):
text = text.strip()
text = re.sub("\(\d+์ผ\)", "", text)
text = re.sub("\([โบ€-โบ™โบ›-โปณโผ€-โฟ•ใ€…ใ€‡ใ€ก-ใ€ฉใ€ธ-ใ€บใ€ปใ€-ไถตไธ€-้ฟƒ่ฑˆ-้ถดไพฎ-้ ปไธฆ-้พŽ]+\)", "", text)
text = normalize_with_dictionary(text, etc_dictionary)
text = normalize_english(text)
text = re.sub("[a-zA-Z]+", normalize_upper, text)
text = normalize_quote(text)
text = normalize_number(text)
return text
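# e.g. normalize("GDP 1%") should yield "์ง€๋””ํ”ผ ์ผํผ์„ผํŠธ" and normalize("3๋ช…")
# should yield "์„ธ๋ช…", given the dictionaries defined above.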
def normalize_with_dictionary(text, dic):
if any(key in text for key in dic.keys()):
pattern = re.compile("|".join(re.escape(key) for key in dic.keys()))
return pattern.sub(lambda x: dic[x.group()], text)
else:
return text
def normalize_english(text):
def fn(m):
word = m.group()
if word in english_dictionary:
return english_dictionary.get(word)
else:
return word
text = re.sub("([A-Za-z]+)", fn, text)
return text
def normalize_upper(text):
text = text.group(0)
if all([char.isupper() for char in text]):
return "".join(upper_to_kor[char] for char in text)
else:
return text
def normalize_quote(text):
def fn(found_text):
        from nltk import sent_tokenize  # NLTK doesn't play well with multiprocessing, so import lazily here
found_text = found_text.group()
unquoted_text = found_text[1:-1]
sentences = sent_tokenize(unquoted_text)
return " ".join(["'{}'".format(sent) for sent in sentences])
return re.sub(quote_checker, fn, text)
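# e.g. normalize_quote('๊ทธ๋Š” "์•ˆ๋…•. ๋ฐ˜๊ฐ€์›Œ."๋ผ๊ณ  ๋งํ–ˆ๋‹ค') should yield
# "๊ทธ๋Š” '์•ˆ๋…•.' '๋ฐ˜๊ฐ€์›Œ.'๋ผ๊ณ  ๋งํ–ˆ๋‹ค", assuming NLTK's punkt sentence model
# is available (nltk.download("punkt")).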
number_checker = "([+-]?\d[\d,]*)[\.]?\d*"
count_checker = "(์‹œ|๋ช…|๊ฐ€์ง€|์‚ด|๋งˆ๋ฆฌ|ํฌ๊ธฐ|์†ก์ด|์ˆ˜|ํ†จ|ํ†ต|์ |๊ฐœ|๋ฒŒ|์ฒ™|์ฑ„|๋‹ค๋ฐœ|๊ทธ๋ฃจ|์ž๋ฃจ|์ค„|์ผค๋ ˆ|๊ทธ๋ฆ‡|์ž”|๋งˆ๋””|์ƒ์ž|์‚ฌ๋žŒ|๊ณก|๋ณ‘|ํŒ)"
def normalize_number(text):
text = normalize_with_dictionary(text, unit_to_kor1)
text = normalize_with_dictionary(text, unit_to_kor2)
text = re.sub(
number_checker + count_checker, lambda x: number_to_korean(x, True), text
)
text = re.sub(number_checker, lambda x: number_to_korean(x, False), text)
return text
num_to_kor1 = [""] + list("์ผ์ด์‚ผ์‚ฌ์˜ค์œก์น ํŒ”๊ตฌ")
num_to_kor2 = [""] + list("๋งŒ์–ต์กฐ๊ฒฝํ•ด")
num_to_kor3 = [""] + list("์‹ญ๋ฐฑ์ฒœ")
# count_to_kor1 = [""] + ["ํ•˜๋‚˜","๋‘˜","์…‹","๋„ท","๋‹ค์„ฏ","์—ฌ์„ฏ","์ผ๊ณฑ","์—ฌ๋Ÿ","์•„ํ™‰"]
count_to_kor1 = [""] + ["ํ•œ", "๋‘", "์„ธ", "๋„ค", "๋‹ค์„ฏ", "์—ฌ์„ฏ", "์ผ๊ณฑ", "์—ฌ๋Ÿ", "์•„ํ™‰"]
count_tenth_dict = {
"์‹ญ": "์—ด",
"๋‘์‹ญ": "์Šค๋ฌผ",
"์„ธ์‹ญ": "์„œ๋ฅธ",
"๋„ค์‹ญ": "๋งˆํ”",
"๋‹ค์„ฏ์‹ญ": "์‰ฐ",
"์—ฌ์„ฏ์‹ญ": "์˜ˆ์ˆœ",
"์ผ๊ณฑ์‹ญ": "์ผํ”",
"์—ฌ๋Ÿ์‹ญ": "์—ฌ๋“ ",
"์•„ํ™‰์‹ญ": "์•„ํ”",
}
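# e.g. number_to_korean (below) first renders "20๋งˆ๋ฆฌ" as "๋‘์‹ญ" + "๋งˆ๋ฆฌ", then
# this table rewrites the tens digit to its native reading: "์Šค๋ฌผ๋งˆ๋ฆฌ".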
def number_to_korean(num_str, is_count=False):
if is_count:
num_str, unit_str = num_str.group(1), num_str.group(2)
else:
num_str, unit_str = num_str.group(), ""
num_str = num_str.replace(",", "")
num = ast.literal_eval(num_str)
if num == 0:
return "์˜"
check_float = num_str.split(".")
if len(check_float) == 2:
digit_str, float_str = check_float
elif len(check_float) >= 3:
raise Exception(" [!] Wrong number format")
else:
digit_str, float_str = check_float[0], None
if is_count and float_str is not None:
raise Exception(" [!] `is_count` and float number does not fit each other")
digit = int(digit_str)
if digit_str.startswith("-"):
digit, digit_str = abs(digit), str(abs(digit))
kor = ""
size = len(str(digit))
tmp = []
for i, v in enumerate(digit_str, start=1):
v = int(v)
if v != 0:
if is_count:
tmp += count_to_kor1[v]
else:
tmp += num_to_kor1[v]
tmp += num_to_kor3[(size - i) % 4]
if (size - i) % 4 == 0 and len(tmp) != 0:
kor += "".join(tmp)
tmp = []
kor += num_to_kor2[int((size - i) / 4)]
if is_count:
if kor.startswith("ํ•œ") and len(kor) > 1:
kor = kor[1:]
if any(word in kor for word in count_tenth_dict):
kor = re.sub(
"|".join(count_tenth_dict.keys()),
lambda x: count_tenth_dict[x.group()],
kor,
)
if not is_count and kor.startswith("์ผ") and len(kor) > 1:
kor = kor[1:]
if float_str is not None:
kor += "์ฉœ "
kor += re.sub("\d", lambda x: num_to_kor[x.group()], float_str)
if num_str.startswith("+"):
kor = "ํ”Œ๋Ÿฌ์Šค " + kor
elif num_str.startswith("-"):
kor = "๋งˆ์ด๋„ˆ์Šค " + kor
return kor + unit_str
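if __name__ == "__main__":
    # Minimal smoke test of the normalization pipeline; the sample sentences
    # are illustrative only and avoid quotes so NLTK is not required.
    for sample in ["2 30๋Œ€ ๊ฐ€์ž…์ž๊ฐ€ 100๋ช… ๋Š˜์—ˆ๋‹ค", "GDP๊ฐ€ 1% ์˜ฌ๋ž๋‹ค"]:
        print(sample, "->", normalize(sample))
        print(tokenize(sample))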