|
๏ปฟ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Korean related helpers.""" |
|
|
|
import ast |
|
import json |
|
import os |
|
import re |
|
|
|
from jamo import h2j, hangul_to_jamo, j2h, jamo_to_hcj |
|
|
|
# Literal phrase replacements applied before any other normalisation step.
# The Hangul text was restored from a mis-decoded (UTF-8 read as TIS-620) copy.
etc_dictionary = {
    "2 30대": "이삼십대",
    "20~30대": "이삼십대",
    "20, 30대": "이십대 삼십대",
    "1+1": "원플러스원",
    "3에서 6개월인": "3개월에서 육개월인",
}
|
|
|
# English word / acronym -> Hangul reading, consulted by normalize_english().
# The Hangul values were restored from a mis-decoded (UTF-8 read as TIS-620)
# copy of this table. Repeated keys are kept exactly as they were, so the
# effective value of each key (the last occurrence) is unchanged.
english_dictionary = {
    "Devsisters": "데브시스터즈",
    "track": "트랙",

    "LA": "엘에이",
    "LG": "엘지",
    "KOREA": "코리아",
    "JSA": "제이에스에이",
    "PGA": "피지에이",
    "GA": "지에이",
    "idol": "아이돌",
    "KTX": "케이티엑스",
    "AC": "에이씨",
    "DVD": "디비디",
    "US": "유에스",
    "CNN": "씨엔엔",
    "LPGA": "엘피지에이",
    "P": "피",
    "L": "엘",
    "T": "티",
    "B": "비",
    "C": "씨",
    "BIFF": "비아이에프에프",
    "GV": "지비",

    "IT": "아이티",
    "IQ": "아이큐",
    "JTBC": "제이티비씨",
    "trickle down effect": "트리클 다운 이펙트",
    "trickle up effect": "트리클 업 이펙트",
    "down": "다운",
    "up": "업",
    "FCK": "에프씨케이",
    "AP": "에이피",
    "WHERETHEWILDTHINGSARE": "",
    "Rashomon Effect": "",
    "O": "오",
    "OO": "오오",
    "B": "비",
    "GDP": "지디피",
    "CIPA": "씨아이피에이",
    "YS": "와이에스",
    "Y": "와이",
    "S": "에스",
    "JTBC": "제이티비씨",
    "PC": "피씨",
    "bill": "빌",
    "Halmuny": "하모니",
    "X": "엑스",
    "SNS": "에스엔에스",
    "ability": "어빌리티",
    "shy": "",
    "CCTV": "씨씨티비",
    "IT": "아이티",
    "the tenth man": "더 텐쓰 맨",
    "L": "엘",
    "PC": "피씨",
    "YSDJJPMB": "",
    "Content Attitude Timing": "컨텐트 애티튜드 타이밍",
    "CAT": "캣",
    "IS": "아이에스",
    "K": "케이",
    "Y": "와이",
    "KDI": "케이디아이",
    "DOC": "디오씨",
    "CIA": "씨아이에이",
    "PBS": "피비에스",
    "D": "디",
    # NOTE(review): the two adjacent literals below are implicitly concatenated,
    # so the key really is "PPropertyPositionPowerPrisonPS". This looks like a
    # missing `: ""` after the first literal, but it is kept as-is to avoid
    # changing the table's contents -- confirm the intent.
    "PPropertyPositionPowerPrisonP" "S": "에스",
    "francisco": "프란시스코",
    "I": "아이",
    "III": "아이아이",
    "No joke": "노 조크",
    "BBK": "비비케이",
    "LA": "엘에이",
    "Don": "",
    "t worry be happy": " 워리 비 해피",
    "NO": "엔오",
    "it was our sky": "잇 워즈 아워 스카이",
    "it is our sky": "잇 이즈 아워 스카이",
    "NEIS": "엔이아이에스",
    "IMF": "아이엠에프",
    "apology": "어폴로지",
    "humble": "험블",
    "M": "엠",
    "Nowhere Man": "노웨어 맨",
    "The Tenth Man": "더 텐쓰 맨",
    "PBS": "피비에스",
    "BBC": "비비씨",
    "MRJ": "엠알제이",
    "CCTV": "씨씨티비",
    "Pick me up": "픽 미 업",
    "DNA": "디엔에이",
    "UN": "유엔",
    "STOP": "스탑",
    "PRESS": "프레스",
    "not to be": "낫 투비",
    "Denial": "디나이얼",
    "G": "지",
    "IMF": "아이엠에프",
    "GDP": "지디피",
    "JTBC": "제이티비씨",
    "Time flies like an arrow": "타임 플라이즈 라이크 언 애로우",
    "DDT": "디디티",
    "AI": "에이아이",
    "Z": "제트",
    "OECD": "오이씨디",
    "N": "앤",
    "A": "에이",
    "MB": "엠비",
    "EH": "이에이치",
    "IS": "아이에스",
    "TV": "티비",
    "MIT": "엠아이티",
    "KBO": "케이비오",
    "I love America": "아이 러브 아메리카",
    "SF": "에스에프",
    "Q": "큐",
    "KFX": "케이에프엑스",
    "PM": "피엠",
    "Prime Minister": "프라임 미니스터",
    "Swordline": "스워드라인",
    "TBS": "티비에스",
    "DDT": "디디티",
    "CS": "씨에스",
    "Reflecting Absence": "리플렉팅 앱센스",
    "PBS": "피비에스",
    "Drum being beaten by everyone": "드럼 빙 비튼 바이 에브리원",
    "negative pressure": "네거티브 프레셔",
    "F": "에프",
    "KIA": "기아",
    "FTA": "에프티에이",
    "Que sais-je": "",
    "UFC": "유에프씨",
    "P": "피",
    "DJ": "디제이",
    "Chaebol": "채벌",
    "BBC": "비비씨",
    "OECD": "오이씨디",
    "BC": "삐씨",
    "C": "씨",
    # NOTE(review): this last "B" entry overrides the earlier "비" values, so
    # english_dictionary["B"] is "씨". Kept as found -- confirm it is intended.
    "B": "씨",
    "KY": "케이와이",
    "K": "케이",
    "CEO": "씨이오",
    "YH": "와이에치",
    "IS": "아이에스",
    "who are you": "후 얼 유",
    "Y": "와이",
    "The Devils Advocate": "더 데빌즈 어드보카트",
    "YS": "와이에스",
    "so sorry": "쏘 쏘리",
    "Santa": "산타",
    "Big Endian": "빅 엔디안",
    "Small Endian": "스몰 엔디안",
    "Oh Captain My Captain": "오 캡틴 마이 캡틴",
    "AIB": "에이아이비",
    "K": "케이",
    "PBS": "피비에스",

    "ASMR": "에이에스엠알",
    "V": "브이",
    "PD": "피디",
    "CD": "씨디",
    "ANR": "에이엔알",
    "Twenty Three": "투엔티 쓰리",
    "Through The Night": "쓰루 더 나잇",
    "MD": "엠디",
}
|
|
|
# Digit character -> Sino-Korean reading; used to read out the fractional
# digits of a number one by one. Hangul restored from a mis-decoded copy.
num_to_kor = {
    "0": "영",
    "1": "일",
    "2": "이",
    "3": "삼",
    "4": "사",
    "5": "오",
    "6": "육",
    "7": "칠",
    "8": "팔",
    "9": "구",
}
|
|
|
# Unit symbol -> Hangul reading. The tables are applied in two passes (see
# normalize_number) so that the two-letter units are replaced before the bare
# "m" can match inside them. Hangul restored from a mis-decoded copy.
unit_to_kor1 = {
    "%": "퍼센트",
    "cm": "센치미터",
    "mm": "밀리미터",
    "km": "킬로미터",
    "kg": "킬로그람",
}
unit_to_kor2 = {"m": "미터"}
|
|
|
# Upper-case Latin letter -> Hangul spelling of the letter name; used by
# normalize_upper() to read acronyms letter by letter.
# Hangul restored from a mis-decoded copy.
upper_to_kor = {
    "A": "에이",
    "B": "비",
    "C": "씨",
    "D": "디",
    "E": "이",
    "F": "에프",
    "G": "지",
    "H": "에이치",
    "I": "아이",
    "J": "제이",
    "K": "케이",
    "L": "엘",
    "M": "엠",
    "N": "엔",
    "O": "오",
    "P": "피",
    "Q": "큐",
    "R": "알",
    "S": "에스",
    "T": "티",
    "U": "유",
    "V": "브이",
    "W": "더블유",
    "X": "엑스",
    "Y": "와이",
    "Z": "지",
}
|
|
|
|
|
""" |
|
The initial consonants (choseong) and the final consonants (jongseong) look
the same, but they are different characters.

'_-!'(),-.:;? ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵᆨᆩᆪᆫᆬᆭᆮᆯᆰᆱᆲᆳᆴᆵᆶᆷᆸᆹᆺᆻᆼᆽᆾᆿᇀᇁᇂ~'

'_': 0, '-': 7, '!': 2, "'": 3, '(': 4, ')': 5, ',': 6, '.': 8, ':': 9, ';': 10,
'?': 11, ' ': 12, 'ᄀ': 13, 'ᄁ': 14, 'ᄂ': 15, 'ᄃ': 16, 'ᄄ': 17, 'ᄅ': 18, 'ᄆ': 19, 'ᄇ': 20,
'ᄈ': 21, 'ᄉ': 22, 'ᄊ': 23, 'ᄋ': 24, 'ᄌ': 25, 'ᄍ': 26, 'ᄎ': 27, 'ᄏ': 28, 'ᄐ': 29, 'ᄑ': 30,
'ᄒ': 31, 'ᅡ': 32, 'ᅢ': 33, 'ᅣ': 34, 'ᅤ': 35, 'ᅥ': 36, 'ᅦ': 37, 'ᅧ': 38, 'ᅨ': 39, 'ᅩ': 40,
'ᅪ': 41, 'ᅫ': 42, 'ᅬ': 43, 'ᅭ': 44, 'ᅮ': 45, 'ᅯ': 46, 'ᅰ': 47, 'ᅱ': 48, 'ᅲ': 49, 'ᅳ': 50,
'ᅴ': 51, 'ᅵ': 52, 'ᆨ': 53, 'ᆩ': 54, 'ᆪ': 55, 'ᆫ': 56, 'ᆬ': 57, 'ᆭ': 58, 'ᆮ': 59, 'ᆯ': 60,
'ᆰ': 61, 'ᆱ': 62, 'ᆲ': 63, 'ᆳ': 64, 'ᆴ': 65, 'ᆵ': 66, 'ᆶ': 67, 'ᆷ': 68, 'ᆸ': 69, 'ᆹ': 70,
'ᆺ': 71, 'ᆻ': 72, 'ᆼ': 73, 'ᆽ': 74, 'ᆾ': 75, 'ᆿ': 76, 'ᇀ': 77, 'ᇁ': 78, 'ᇂ': 79, '~': 80
|
""" |
|
|
|
# Reserved tokens and punctuation that form the non-jamo part of the vocabulary.
_pad = "pad"
_eos = "eos"
_punctuation = "!'(),-.:;? "
_special = "-"

# Conjoining Hangul jamo, enumerated by code point: U+1100..U+1112 (leads),
# U+1161..U+1175 (vowels) and U+11A8..U+11C2 (tails).
_jamo_leads = [chr(code) for code in range(0x1100, 0x1113)]
_jamo_vowels = [chr(code) for code in range(0x1161, 0x1176)]
_jamo_tails = [chr(code) for code in range(0x11A8, 0x11C3)]

_letters = _jamo_leads + _jamo_vowels + _jamo_tails

# Full vocabulary in ID order: pad, special, punctuation, jamo, eos.
symbols = [_pad] + list(_special) + list(_punctuation) + _letters + [_eos]

# NOTE: "-" occurs twice in `symbols` (once from _special, once from
# _punctuation). The later index wins in _symbol_to_id, so "-" encodes to 7,
# while IDs 1 and 7 both decode to "-". The order is left untouched because
# changing it would renumber every symbol ID.
_symbol_to_id = {c: i for i, c in enumerate(symbols)}
_id_to_symbol = {i: c for i, c in enumerate(symbols)}
|
|
|
# Regex for a quoted span: opening quote, shortest possible body, closing quote.
# Besides ASCII quotes/backtick it accepts the full-width quotation mark and the
# curly double/single quotes (restored here from a mis-decoded copy).
quote_checker = """([`"'＂“‘])(.+?)([`"'＂”’])"""
|
|
|
|
|
def is_lead(char):
    """Return True if ``char`` is one of the initial-consonant jamo in ``_jamo_leads``."""
    found = char in _jamo_leads
    return found
|
|
|
|
|
def is_vowel(char):
    """Return True if ``char`` is one of the medial-vowel jamo in ``_jamo_vowels``."""
    found = char in _jamo_vowels
    return found
|
|
|
|
|
def is_tail(char):
    """Return True if ``char`` is one of the final-consonant jamo in ``_jamo_tails``."""
    found = char in _jamo_tails
    return found
|
|
|
|
|
def get_mode(char):
    """Classify ``char`` by its position inside a Hangul syllable.

    Returns:
        0 for a lead consonant, 1 for a vowel, 2 for a tail consonant and
        -1 for anything that is not a conjoining jamo.
    """
    # The predicates are tried in lead -> vowel -> tail order; the position of
    # the first one that accepts the character is the mode.
    for mode, belongs in enumerate((is_lead, is_vowel, is_tail)):
        if belongs(char):
            return mode
    return -1
|
|
|
|
|
def _get_text_from_candidates(candidates):
    """Turn the jamo collected for one syllable into text.

    Args:
        candidates: List of jamo characters gathered by ``jamo_to_korean``,
            expected in lead, vowel, tail order.

    Returns:
        ``""`` for an empty list, the compatibility-jamo form of a lone jamo,
        or the composed syllable when two or more jamo are present.
    """
    if not candidates:
        return ""

    if len(candidates) == 1:
        # jamo_to_hcj yields characters lazily; join them so the caller can
        # concatenate the result onto a str (the bare generator cannot be).
        return "".join(jamo_to_hcj(candidates[0]))

    # Pair the jamo positionally with j2h's keyword names; zip stops at the
    # shorter sequence, so a missing tail is simply omitted.
    return j2h(**dict(zip(["lead", "vowel", "tail"], candidates)))
|
|
|
|
|
def jamo_to_korean(text):
    """Re-compose a jamo sequence into Hangul syllables.

    The input is first passed through ``h2j`` so that any pre-composed
    syllables are decomposed too, then scanned one character at a time:
    a lead consonant starts a new syllable, a non-jamo character is copied
    through verbatim, and vowels/tails are attached to the open syllable.
    """
    decomposed = h2j(text)

    pieces = []
    pending = []  # jamo of the syllable currently being assembled

    for char in decomposed:
        mode = get_mode(char)

        if mode == 0:
            # New lead consonant: close the previous syllable first.
            pieces.append(_get_text_from_candidates(pending))
            pending = [char]
        elif mode == -1:
            # Not a jamo: flush what we have and keep the character as-is.
            pieces.append(_get_text_from_candidates(pending))
            pieces.append(char)
            pending = []
        else:
            pending.append(char)

    # Flush whatever is still open at the end of the text.
    pieces.append(_get_text_from_candidates(pending))
    return "".join(pieces)
|
|
|
|
|
def compare_sentence_with_jamo(text1, text2):
    """Return True when the two texts decompose to *different* jamo strings.

    Note the polarity: equal decompositions give False.
    """
    first_jamo = h2j(text1)
    second_jamo = h2j(text2)
    return first_jamo != second_jamo
|
|
|
|
|
def tokenize(text, as_id=False):
    """Normalise ``text`` and split it into jamo tokens.

    Args:
        text: Raw input sentence.
        as_id: When true, map every token through ``_symbol_to_id`` and return
            integer IDs; a token missing from that table raises ``KeyError``.

    Returns:
        A list of jamo/punctuation characters, or of their integer IDs.
    """
    jamo_tokens = list(hangul_to_jamo(normalize(text)))

    if not as_id:
        return jamo_tokens
    return [_symbol_to_id[tok] for tok in jamo_tokens]
|
|
|
|
|
def tokenizer_fn(iterator):
    """Return a generator over the jamo tokens of every text in ``iterator``.

    Each item is tokenised with ``tokenize(..., as_id=False)`` and the
    resulting tokens are yielded one after another, in input order.
    """
    return (
        tok
        for sentence in iterator
        for tok in tokenize(sentence, as_id=False)
    )
|
|
|
|
|
def normalize(text):
    """Rewrite ``text`` so that it contains only readable Korean.

    Steps, in order: strip surrounding whitespace, remove parenthesised day
    markers and parenthesised CJK glosses, apply ``etc_dictionary``, replace
    known English words, spell out all-upper-case acronyms, normalise quotes
    and finally read out numbers.
    """
    text = text.strip()

    # Remove day markers such as "(12일)".
    text = re.sub(r"\(\d+일\)", "", text)

    # Remove parenthesised runs of CJK radicals / ideographs (Hanja glosses).
    # The ranges are written as code-point escapes: U+2E80-2E99, U+2E9B-2EF3,
    # U+2F00-2FD5, U+3005, U+3007, U+3021-3029, U+3038-303A, U+303B,
    # U+3400-4DB5, U+4E00-9FC3, U+8C48-9DB4, U+4FAE-983B, U+4E26-9F8E.
    text = re.sub(
        r"\([\u2e80-\u2e99\u2e9b-\u2ef3\u2f00-\u2fd5\u3005\u3007"
        r"\u3021-\u3029\u3038-\u303a\u303b\u3400-\u4db5\u4e00-\u9fc3"
        r"\u8c48-\u9db4\u4fae-\u983b\u4e26-\u9f8e]+\)",
        "",
        text,
    )

    text = normalize_with_dictionary(text, etc_dictionary)
    text = normalize_english(text)
    # Whatever Latin runs are left: spell all-caps ones letter by letter.
    text = re.sub(r"[a-zA-Z]+", normalize_upper, text)

    text = normalize_quote(text)
    text = normalize_number(text)

    return text
|
|
|
|
|
def normalize_with_dictionary(text, dic):
    """Replace every occurrence of a key of ``dic`` in ``text`` by its value.

    Keys are matched literally (they are regex-escaped). When one key is a
    prefix of another, the longer key wins, so ``{"k": ..., "km": ...}``
    rewrites ``"km"`` as a whole instead of stopping at ``"k"``.

    Args:
        text: Input string.
        dic: Mapping from literal substring to replacement string.

    Returns:
        The rewritten string, or ``text`` unchanged when no key occurs in it.
    """
    if not any(key in text for key in dic):
        return text

    # Regex alternation takes the first alternative that matches, so order the
    # keys longest-first; sorted() is stable, keeping insertion order on ties.
    ordered_keys = sorted(dic, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(key) for key in ordered_keys))
    return pattern.sub(lambda match: dic[match.group()], text)
|
|
|
|
|
def normalize_english(text):
    """Replace Latin-letter words found in ``english_dictionary`` by their reading.

    Only single runs of ASCII letters are looked up; words that are not in
    the dictionary are left unchanged.
    """

    def _lookup(match):
        word = match.group()
        # Fall back to the word itself when there is no dictionary entry.
        return english_dictionary.get(word, word)

    return re.sub("([A-Za-z]+)", _lookup, text)
|
|
|
|
|
def normalize_upper(text):
    """``re.sub`` callback: spell an all-upper-case word letter by letter.

    Args:
        text: The match object for one run of Latin letters.

    Returns:
        The concatenated ``upper_to_kor`` readings when every character is
        upper case, otherwise the matched word unchanged.
    """
    word = text.group(0)

    if not all(letter.isupper() for letter in word):
        return word
    return "".join(upper_to_kor[letter] for letter in word)
|
|
|
|
|
def normalize_quote(text):
    """Re-quote every quoted span sentence by sentence with ASCII single quotes.

    Each span matched by ``quote_checker`` loses its original quote characters,
    is split into sentences with NLTK, and every sentence is wrapped in
    ``'...'``; the pieces are joined with single spaces.
    """

    def _requote(match):
        # Imported lazily, so nltk is only required once a quote is found.
        from nltk import sent_tokenize

        quoted = match.group()
        inner = quoted[1:-1]  # drop the opening and closing quote characters
        return " ".join("'{}'".format(sentence) for sentence in sent_tokenize(inner))

    return re.sub(quote_checker, _requote, text)
|
|
|
|
|
# A number: optional sign, integer part with optional thousands separators
# (group 1), then an optional dot and fractional digits.
number_checker = r"([+-]?\d[\d,]*)[\.]?\d*"

# Counter words (classifiers); a number directly followed by one of these is
# read with native-Korean numerals. Hangul restored from a mis-decoded copy.
count_checker = "(시|명|가지|살|마리|포기|송이|수|톨|통|점|개|벌|척|채|다발|그루|자루|줄|켤레|그릇|잔|마디|상자|사람|곡|병|판)"
|
|
|
|
|
def normalize_number(text):
    """Spell out unit symbols and numbers in ``text``.

    Units are replaced first (two-letter units before the bare ``m``), then
    numbers followed by a counter word are read as counts, and finally every
    remaining number is read as a plain number.
    """
    for unit_table in (unit_to_kor1, unit_to_kor2):
        text = normalize_with_dictionary(text, unit_table)

    counted_number = number_checker + count_checker
    text = re.sub(counted_number, lambda match: number_to_korean(match, True), text)

    return re.sub(number_checker, lambda match: number_to_korean(match, False), text)
|
|
|
|
|
# Building blocks for reading integers; index 0 is "" so a zero digit or the
# lowest place contributes nothing. Hangul restored from a mis-decoded copy.
num_to_kor1 = [""] + list("일이삼사오육칠팔구")  # digits 1-9 (Sino-Korean)
num_to_kor2 = [""] + list("만억조경해")  # one name per group of four digits
num_to_kor3 = [""] + list("십백천")  # tens, hundreds, thousands in a group

# Native-Korean digits 1-9, used in front of counter words.
count_to_kor1 = [""] + ["한", "두", "세", "네", "다섯", "여섯", "일곱", "여덟", "아홉"]

# Native-Korean tens: rewrites "<native digit>십" as produced by
# number_to_korean() into the proper word for that multiple of ten.
count_tenth_dict = {
    "십": "열",
    "두십": "스물",
    "세십": "서른",
    "네십": "마흔",
    "다섯십": "쉰",
    "여섯십": "예순",
    "일곱십": "일흔",
    "여덟십": "여든",
    "아홉십": "아흔",
}
|
|
|
|
|
def number_to_korean(num_str, is_count=False):
    """Read one regex-matched number out in Hangul.

    Args:
        num_str: The ``re.Match`` for the number. With ``is_count`` true,
            group 1 is the number and group 2 the counter word; otherwise
            the whole match is the number.
        is_count: Read the digits with native-Korean numerals (used in front
            of counter words) instead of Sino-Korean ones.

    Returns:
        The spelled-out number followed by the counter word, if any. A sign
        is rendered as a leading "플러스 " / "마이너스 ", fractional digits are
        read one by one after "쩜 ", and zero is read as "영".

    Raises:
        Exception: If the number contains more than one dot, or a count has
            a fractional part.
        ValueError: If the integer part is not a valid decimal number.
    """
    if is_count:
        num_str, unit_str = num_str.group(1), num_str.group(2)
    else:
        num_str, unit_str = num_str.group(), ""

    num_str = num_str.replace(",", "")

    # Split off the sign so the digit loop below only ever sees digits
    # (a leading "+" previously reached int() on its own and crashed).
    sign = num_str[0] if num_str[:1] in ("+", "-") else ""
    unsigned = num_str[len(sign):]

    check_float = unsigned.split(".")
    if len(check_float) >= 3:
        raise Exception(" [!] Wrong number format")
    digit_str = check_float[0]
    float_str = check_float[1] if len(check_float) == 2 else None

    # "5." is an integer followed by sentence punctuation, not a fraction:
    # read the integer and hand the dot back instead of emitting "쩜 ".
    trailing = ""
    if float_str == "":
        float_str, trailing = None, "."

    if is_count and float_str is not None:
        raise Exception(" [!] `is_count` and float number does not fit each other")

    # Parse with int() rather than evaluating the text as a Python literal;
    # this also accepts leading zeros, which are dropped so that the place
    # values computed from `size` line up with the digits.
    digit = int(digit_str)
    digit_str = str(digit)

    if digit == 0 and (float_str is None or not float_str.strip("0")):
        # Keep the counter word: "0개" reads "영개", not just "영".
        return "영" + unit_str + trailing

    kor = ""
    size = len(digit_str)
    tmp = []

    for i, v in enumerate(digit_str, start=1):
        v = int(v)
        place = size - i  # 0 for the ones digit

        if v != 0:
            tmp += count_to_kor1[v] if is_count else num_to_kor1[v]
            tmp += num_to_kor3[place % 4]

        # At the end of each four-digit group, emit it with its group name.
        if place % 4 == 0 and tmp:
            kor += "".join(tmp)
            tmp = []
            kor += num_to_kor2[place // 4]

    if is_count:
        # A leading "one" is silent in front of a place name (한십 -> 십).
        if kor.startswith("한") and len(kor) > 1:
            kor = kor[1:]

        # Turn "<digit>십" into the native-Korean word for that ten.
        if any(word in kor for word in count_tenth_dict):
            kor = re.sub(
                "|".join(count_tenth_dict.keys()),
                lambda x: count_tenth_dict[x.group()],
                kor,
            )

    if not is_count and kor.startswith("일") and len(kor) > 1:
        kor = kor[1:]

    if float_str is not None:
        if not kor:
            # Integer part is zero ("0.5"): say it explicitly.
            kor = "영"
        kor += "쩜 "
        kor += re.sub(r"\d", lambda x: num_to_kor[x.group()], float_str)

    if sign == "+":
        kor = "플러스 " + kor
    elif sign == "-":
        kor = "마이너스 " + kor

    return kor + unit_str + trailing
|
|