# Spaces:
# Runtime error
# Runtime error
# coding: utf-8 | |
# Code based on | |
import re | |
import os | |
import ast | |
import json | |
from jamo import hangul_to_jamo, h2j, j2h | |
from .ko_dictionary import english_dictionary, etc_dictionary | |
# Special symbols: padding and end-of-sentence markers (ids 0 and 1).
PAD = '_'
EOS = '~'
# Punctuation accepted in normalized text, plus the space character.
PUNC = '!\'(),-.:;?'
SPACE = ' '
# Conjoining-jamo Unicode ranges: leading consonants (choseong) U+1100-1112,
# vowels (jungseong) U+1161-1175, trailing consonants (jongseong) U+11A8-11C2.
JAMO_LEADS = "".join([chr(_) for _ in range(0x1100, 0x1113)])
JAMO_VOWELS = "".join([chr(_) for _ in range(0x1161, 0x1176)])
JAMO_TAILS = "".join([chr(_) for _ in range(0x11A8, 0x11C3)])
VALID_CHARS = JAMO_LEADS + JAMO_VOWELS + JAMO_TAILS + PUNC + SPACE
# Full symbol inventory; a symbol's index in this string is its id.
ALL_SYMBOLS = PAD + EOS + VALID_CHARS
char_to_id = {c: i for i, c in enumerate(ALL_SYMBOLS)}
id_to_char = {i: c for i, c in enumerate(ALL_SYMBOLS)}
# Matches one quoted span: opening quote, shortest content, closing quote.
quote_checker = """([`"'๏ผโโ])(.+?)([`"'๏ผโโ])"""
def is_lead(char):
    """Return True if *char* is a leading-consonant (choseong) jamo."""
    position = JAMO_LEADS.find(char)
    return position >= 0
def is_vowel(char):
    """Return True if *char* is a vowel (jungseong) jamo."""
    return JAMO_VOWELS.count(char) > 0
def is_tail(char):
    """Return True if *char* is a trailing-consonant (jongseong) jamo."""
    position = JAMO_TAILS.find(char)
    return position != -1
def get_mode(char):
    """Classify a character: 0 = lead jamo, 1 = vowel, 2 = tail, -1 = other."""
    for mode, predicate in enumerate((is_lead, is_vowel, is_tail)):
        if predicate(char):
            return mode
    return -1
def _get_text_from_candidates(candidates):
    """Compose collected jamo back into text.

    *candidates* holds 0-3 jamo in (lead, vowel, tail) order, as gathered by
    jamo_to_korean.
    """
    if len(candidates) == 0:
        return ""
    elif len(candidates) == 1:
        # NOTE(review): _jamo_char_to_hcj is not defined in this chunk —
        # confirm it exists elsewhere in the module (the jamo package's
        # j2hcj is the usual helper for this).
        return _jamo_char_to_hcj(candidates[0])
    else:
        # j2h composes lead/vowel(/tail) jamo into one Hangul syllable.
        return j2h(**dict(zip(["lead", "vowel", "tail"], candidates)))
def jamo_to_korean(text):
    """Re-assemble a jamo sequence into composed Hangul text.

    Non-jamo characters are copied through unchanged; any incomplete
    syllable is flushed via _get_text_from_candidates.
    """
    pieces = []
    pending = []  # jamo collected for the syllable currently being built
    for char in h2j(text):
        mode = get_mode(char)
        if mode == 0:
            # A new lead consonant starts a new syllable: flush the old one.
            pieces.append(_get_text_from_candidates(pending))
            pending = [char]
        elif mode == -1:
            # Not a jamo: flush what we have and copy the char through.
            pieces.append(_get_text_from_candidates(pending))
            pieces.append(char)
            pending = []
        else:
            # Vowel or tail extends the current syllable.
            pending.append(char)
    pieces.append(_get_text_from_candidates(pending))
    return "".join(pieces)
# Sino-Korean readings of the single digits 0-9; used to read the
# fractional part of a number digit-by-digit in number_to_korean.
num_to_kor = {
    '0': '์',
    '1': '์ผ',
    '2': '์ด',
    '3': '์ผ',
    '4': '์ฌ',
    '5': '์ค',
    '6': '์ก',
    '7': '์น ',
    '8': 'ํ',
    '9': '๊ตฌ',
}
# Multi-character measurement units -> Korean readings.  Applied BEFORE
# unit_to_kor2 so that 'cm'/'mm'/'km' are replaced before the bare 'm' key
# of unit_to_kor2 could match inside them.
unit_to_kor1 = {
    '%': 'ํผ์ผํธ',
    'cm': '์ผ์น๋ฏธํฐ',
    'mm': '๋ฐ๋ฆฌ๋ฏธํฐ',
    'km': 'ํฌ๋ก๋ฏธํฐ',
    'kg': 'ํฌ๋ก๊ทธ๋',
}
# Single-character units, applied after unit_to_kor1 (see note there).
unit_to_kor2 = {
    'm': '๋ฏธํฐ',
}
# Korean spellings of the English letter names A-Z, used by normalize_upper
# to spell out all-uppercase acronyms letter by letter.
upper_to_kor = {
    'A': '์์ด',
    'B': '๋น',
    'C': '์จ',
    'D': '๋',
    'E': '์ด',
    'F': '์ํ',
    'G': '์ง',
    'H': '์์ด์น',
    'I': '์์ด',
    'J': '์ ์ด',
    'K': '์ผ์ด',
    'L': '์',
    'M': '์ ',
    'N': '์',
    'O': '์ค',
    'P': 'ํผ',
    'Q': 'ํ',
    'R': '์',
    'S': '์์ค',
    'T': 'ํฐ',
    'U': '์ ',
    'V': '๋ธ์ด',
    'W': '๋๋ธ์ ',
    'X': '์์ค',
    'Y': '์์ด',
    'Z': '์ง',
}
def compare_sentence_with_jamo(text1, text2):
    # NOTE(review): despite the name, this returns True when the two texts
    # DIFFER after jamo decomposition (h2j), not when they match.
    return h2j(text1) != h2j(text2)
def tokenize(text, as_id=False):
    """Normalize *text* and split it into jamo-level tokens.

    Uses hangul_to_jamo (jamo package) to decompose each Hangul syllable
    into lead/vowel/tail jamo.

    Args:
        text: Hangul string to tokenize.
        as_id: when True, return symbol ids (via char_to_id) instead of
            the jamo characters themselves.

    Returns:
        List of jamo tokens (or ids) with the EOS marker appended.
    """
    text = normalize(text)
    tokens = list(hangul_to_jamo(text))
    if as_id:
        return [char_to_id[token] for token in tokens] + [char_to_id[EOS]]
    # `tokens` is already a fresh list; no need to copy it element-wise.
    return tokens + [EOS]
def tokenizer_fn(iterator):
    """Lazily tokenize every string in *iterator*, yielding a flat token stream."""
    for item in iterator:
        yield from tokenize(item, as_id=False)
def normalize(text):
    """Run the full text-normalization pipeline over *text*.

    Steps, in order: strip whitespace, drop parenthesized annotations,
    dictionary replacements, English words, uppercase acronyms, quoted
    spans, and finally numbers.
    """
    text = text.strip()
    # Raw-string prefixes keep the exact same pattern bytes but avoid the
    # invalid-escape SyntaxWarning raised by modern Python for '\(' / '\d'.
    text = re.sub(r'\(\d+์ผ\)', '', text)
    # Drop parenthesized runs of CJK ideographs (hanja glosses).
    text = re.sub(r'\([โบ-โบโบ-โปณโผ-โฟใ ใใก-ใฉใธ-ใบใปใ-ไถตไธ-้ฟ่ฑ-้ถดไพฎ-้ ปไธฆ-้พ]+\)', '', text)
    text = normalize_with_dictionary(text, etc_dictionary)
    text = normalize_english(text)
    # Spell out all-uppercase alphabetic tokens letter by letter.
    text = re.sub('[a-zA-Z]+', normalize_upper, text)
    text = normalize_quote(text)
    text = normalize_number(text)
    return text
def normalize_with_dictionary(text, dic):
    """Replace every occurrence of a key of *dic* in *text* with its value.

    Keys are tried longest-first so that a key that is a prefix of another
    key (e.g. 'a' vs 'ab') can never shadow the longer match — regex
    alternation otherwise picks the first alternative in insertion order.

    Args:
        text: input string.
        dic: mapping of literal substrings to replacements.

    Returns:
        *text* with all keys replaced; unchanged if no key occurs.
    """
    if not any(key in text for key in dic):
        return text
    # Longest-first ordering makes the alternation deterministic.
    keys = sorted(dic, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(key) for key in keys))
    return pattern.sub(lambda m: dic[m.group()], text)
def normalize_english(text):
    """Replace English words found in english_dictionary with their Korean
    reading; words not in the dictionary are left untouched."""
    def to_korean(m):
        word = m.group()
        # dict.get with a default avoids the `in`-check + lookup double pass.
        return english_dictionary.get(word, word)
    return re.sub(r"([A-Za-z]+)", to_korean, text)
def normalize_upper(text):
    """re.sub callback: spell out an all-uppercase token letter by letter
    using upper_to_kor; tokens with any lowercase letter pass through
    unchanged.

    Args:
        text: a regex Match whose group(0) is an alphabetic token.
    """
    token = text.group(0)
    # Generator expression instead of a throwaway list inside all().
    if all(char.isupper() for char in token):
        return "".join(upper_to_kor[char] for char in token)
    return token
def normalize_quote(text):
    """Rewrite each quoted span as sentence-split pieces, each re-wrapped
    in ASCII single quotes."""
    def replace_quoted(match):
        # NLTK doesn't play along with multiprocessing, so import lazily here.
        from nltk import sent_tokenize
        inner = match.group()[1:-1]  # strip the surrounding quote chars
        pieces = sent_tokenize(inner)
        return " ".join("'{}'".format(piece) for piece in pieces)
    return re.sub(quote_checker, replace_quoted, text)
# Raw strings: '\d' and '\.' are regex escapes, not Python string escapes;
# without the r-prefix they raise SyntaxWarning on modern Python.  The
# pattern bytes are unchanged.
# Matches an optionally signed, comma-grouped number with optional decimals.
number_checker = r"([+-]?\d[\d,]*)[\.]?\d*"
# Native-Korean counter words that may directly follow a number.
count_checker = r"(์|๋ช |๊ฐ์ง|์ด|๋ง๋ฆฌ|ํฌ๊ธฐ|์ก์ด|์|ํจ|ํต|์ |๊ฐ|๋ฒ|์ฒ|์ฑ|๋ค๋ฐ|๊ทธ๋ฃจ|์๋ฃจ|์ค|์ผค๋ |๊ทธ๋ฆ|์|๋ง๋|์์|์ฌ๋|๊ณก|๋ณ|ํ)"
def normalize_number(text):
    """Spell out all numbers (and their units/counters) in Korean."""
    # Unit replacement must run first, and unit_to_kor1 before unit_to_kor2,
    # so multi-char units ('cm', 'mm', 'km') win over the bare 'm' key.
    text = normalize_with_dictionary(text, unit_to_kor1)
    text = normalize_with_dictionary(text, unit_to_kor2)
    # Numbers followed by a native-Korean counter word -> counting numerals.
    text = re.sub(number_checker + count_checker,
                  lambda x: number_to_korean(x, True), text)
    # Remaining numbers -> Sino-Korean numerals.
    text = re.sub(number_checker,
                  lambda x: number_to_korean(x, False), text)
    return text
# Sino-Korean digits 1-9; index 0 is empty so a zero digit emits nothing.
num_to_kor1 = [""] + list("์ผ์ด์ผ์ฌ์ค์ก์น ํ๊ตฌ")
# Myriad-group units (every 4 digits): 10^4, 10^8, 10^12, 10^16, 10^20.
num_to_kor2 = [""] + list("๋ง์ต์กฐ๊ฒฝํด")
# Place units inside a 4-digit group: tens, hundreds, thousands.
num_to_kor3 = [""] + list("์ญ๋ฐฑ์ฒ")
# count_to_kor1 = [""] + ["ํ๋","๋","์ ","๋ท","๋ค์ฏ","์ฌ์ฏ","์ผ๊ณฑ","์ฌ๋","์ํ"]
# Native-Korean counting forms of 1-9 (determiner forms used before counters).
count_to_kor1 = [""] + ["ํ", "๋", "์ธ", "๋ค", "๋ค์ฏ", "์ฌ์ฏ", "์ผ๊ณฑ", "์ฌ๋", "์ํ"]
# Tens in counting contexts: "<digit>+ten" compounds -> native-Korean tens
# words (applied by regex substitution in number_to_korean).
count_tenth_dict = {
    "์ญ": "์ด",
    "๋์ญ": "์ค๋ฌผ",
    "์ธ์ญ": "์๋ฅธ",
    "๋ค์ญ": "๋งํ",
    "๋ค์ฏ์ญ": "์ฐ",
    "์ฌ์ฏ์ญ": "์์",
    "์ผ๊ณฑ์ญ": "์ผํ",
    "์ฌ๋์ญ": "์ฌ๋ ",
    "์ํ์ญ": "์ํ",
}
def number_to_korean(num_str, is_count=False):
    """re.sub callback: spell a matched number out in Korean.

    Args:
        num_str: a regex Match object.  When ``is_count`` it carries
            (digit string, counter word) in groups 1 and 2; otherwise the
            whole match is the digit string and the unit is empty.
        is_count: use native-Korean counting numerals (count_to_kor1)
            instead of Sino-Korean numerals (num_to_kor1).

    Returns:
        The Korean spelling with the counter word (if any) appended.

    Raises:
        Exception: on a malformed number (more than one dot) or a float
            combined with ``is_count``.
    """
    if is_count:
        num_str, unit_str = num_str.group(1), num_str.group(2)
    else:
        num_str, unit_str = num_str.group(), ""
    # From here on num_str is re-bound to the plain digit string.
    num_str = num_str.replace(',', '')
    try:
        num = ast.literal_eval(num_str)
    except Exception:
        # literal_eval rejects leading zeros (e.g. "012"); strip and retry.
        num_str = re.sub('^0+', '', num_str)
        num = ast.literal_eval(num_str)
    if num == 0:
        return "์"
    # Split integer / fractional parts on the decimal point.
    check_float = num_str.split('.')
    if len(check_float) == 2:
        digit_str, float_str = check_float
    elif len(check_float) >= 3:
        raise Exception(" [!] Wrong number format")
    else:
        digit_str, float_str = check_float[0], None
    if is_count and float_str is not None:
        raise Exception(" [!] `is_count` and float number does not fit each other")
    digit = int(digit_str)
    if digit_str.startswith("-"):
        # Spell the magnitude only; the sign word is re-attached at the end.
        digit, digit_str = abs(digit), str(abs(digit))
    kor = ""
    size = len(str(digit))
    tmp = []  # fragments accumulated for the current 4-digit group
    for i, v in enumerate(digit_str, start=1):
        v = int(v)
        if v != 0:
            if is_count:
                tmp += count_to_kor1[v]
            else:
                tmp += num_to_kor1[v]
            # In-group place unit: tens / hundreds / thousands.
            tmp += num_to_kor3[(size - i) % 4]
        if (size - i) % 4 == 0 and len(tmp) != 0:
            # Group boundary: flush the group, then append the myriad unit.
            kor += "".join(tmp)
            tmp = []
            kor += num_to_kor2[int((size - i) / 4)]
    if is_count:
        # Drop a redundant leading "one" (count_to_kor1[1]) before a counter,
        # then swap "<digit>+ten" compounds for native-Korean tens words.
        if kor.startswith("ํ") and len(kor) > 1:
            kor = kor[1:]
        if any(word in kor for word in count_tenth_dict):
            kor = re.sub(
                '|'.join(count_tenth_dict.keys()),
                lambda x: count_tenth_dict[x.group()], kor)
    if not is_count and kor.startswith("์ผ") and len(kor) > 1:
        # Drop a redundant leading "one" (num_to_kor1[1]).
        kor = kor[1:]
    if float_str is not None:
        # Fractional part: the decimal-point word, then digit-by-digit.
        # NOTE(review): '\d' is not a raw string — works, but emits a
        # SyntaxWarning on Python 3.12+.
        kor += "์ฉ "
        kor += re.sub('\d', lambda x: num_to_kor[x.group()], float_str)
    if num_str.startswith("+"):
        kor = "ํ๋ฌ์ค " + kor
    elif num_str.startswith("-"):
        kor = "๋ง์ด๋์ค " + kor
    return kor + unit_str
if __name__ == "__main__":
    def _show(sample):
        # Print the raw sample, its normalized form, then a separator rule.
        print(sample)
        print(normalize(sample))
        print("=" * 30)

    samples = (
        "JTBC๋ JTBCs๋ฅผ DY๋ A๊ฐ Absolute",
        "์ค๋(13์ผ) 3,600๋ง๋ฆฌ ๊ฐ์์ง๊ฐ",
        "60.3%",
        '"์ ๋"(็ช็ช) ์ ๋๋ค.',
        '๋น๋์์์ฅ์ด ์ง๋ 1์ ์ด๋ฐ ๋ง์ ํ์ต๋๋ค. โ๋ ๊ทธ๋ฅ ์ฐ๋ผ์ง์ฒ๋ผ ๋ํํ๋ ์คํ์ผ์ด๋คโ',
        "์ง๊ธ์ -12.35%์๊ณ ์ข ๋ฅ๋ 5๊ฐ์ง์ 19๊ฐ์ง, ๊ทธ๋ฆฌ๊ณ 55๊ฐ์ง์๋ค",
        "JTBC๋ TH์ K ์์ด 2017๋ 9์ 12์ผ ์คํ 12์์ 24์ด์ด ๋๋ค",
    )
    for sample in samples:
        _show(sample)

    # Double decomposition: already-decomposed jamo pass through unchanged.
    print(list(hangul_to_jamo(list(hangul_to_jamo(
        '๋น๋์์์ฅ์ด ์ง๋ 1์ ์ด๋ฐ ๋ง์ ํ์ต๋๋ค? โ๋ ๊ทธ๋ฅ ์ฐ๋ผ์ง์ฒ๋ผ ๋ํํ๋ ์คํ์ผ์ด๋คโ')))))