import os

import numpy as np
import gradio as gr
from konlpy.tag import Okt  # KoNLPy's Twitter tagger, renamed to Okt
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Hugging Face checkpoints: NLLB fine-tunes for each translation direction
# and a BART model for speech-style conversion.
en2ko = "KoJLabs/nllb-finetuned-en2ko"
ko2en = "KoJLabs/nllb-finetuned-ko2en"
style = "KoJLabs/bart-speech-style-converter"

en2ko_model = AutoModelForSeq2SeqLM.from_pretrained(en2ko)
ko2en_model = AutoModelForSeq2SeqLM.from_pretrained(ko2en)
style_model = AutoModelForSeq2SeqLM.from_pretrained(style)

en2ko_tokenizer = AutoTokenizer.from_pretrained(en2ko)
ko2en_tokenizer = AutoTokenizer.from_pretrained(ko2en)
style_tokenizer = AutoTokenizer.from_pretrained(style)


def _post_process(text):
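    """Split `text` around Korean particle tokens (e.g. "ㅋㅋ", "ㅎㅎ").

    These emoticon-like fragments would be mangled by translation, so they
    are pulled out here and re-attached in augmentation() after the
    back-translation round trip. Returns (emojiList, textList).
    """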
    textList = []
    emojiList = []
    twit = Okt()

    posText = twit.pos(text)
    posArray = np.array(posText)

    # Collect every token the tagger labels as a KoreanParticle.
    for i in range(len(posArray)):
        if posArray[i][1] == 'KoreanParticle':
            emojiList.append(posArray[i][0])

    # Split the remaining text on each particle, keeping segments in order.
    for i in range(len(emojiList)):
        splitText = text.split(emojiList[i], maxsplit=1)

        if splitText[0] == '':
            textList.append('')
        else:
            textList.append(splitText[0])

        try:
            if len(splitText[1:]) > 1:
                text = ''.join(splitText[1:]).strip()
            else:
                text = splitText[1:][0].strip()
        except IndexError:
            break

        try:
            if text in emojiList[i + 1]:
                pass
        except IndexError:
            # No particle follows: keep the trailing segment as-is.
            textList.append(splitText[-1])
            emojiList.append('')
            break

    # No particles were found: treat the whole input as a single segment.
    if len(emojiList) < 1:
        emojiList.append('')
        textList.append(text)

    return emojiList, textList


def translation(source, target, text):
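    """Translate `text` between English and Korean with the NLLB fine-tunes."""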
    formats = {"English": "eng_Latn", "Korean": "kor_Hang"}
    src = formats[source]
    tgt = formats[target]

    # Empty or punctuation-only input does not need the model.
    if text in ('', '!', '?', '.', ','):
        return text

    if src == "eng_Latn":
        translator = pipeline(
            'translation',
            model=en2ko_model,
            tokenizer=en2ko_tokenizer,
            src_lang=src,
            tgt_lang=tgt,
        )
    else:
        translator = pipeline(
            'translation',
            model=ko2en_model,
            tokenizer=ko2en_tokenizer,
            src_lang=src,
            tgt_lang=tgt,
        )

    output = translator(text)
    return output[0]['translation_text']


def augmentation(text):
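    """Augment Korean text by back-translation (ko -> en -> ko).

    Particle tokens extracted by _post_process() are re-attached to the
    translated segments so they survive the round trip.
    """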
    emojiList, textList = _post_process(text)

    ko2en_translator = pipeline(
        'translation',
        model=ko2en_model,
        tokenizer=ko2en_tokenizer,
        src_lang="kor_Hang",
        tgt_lang="eng_Latn",
    )

    # Korean -> English
    output = ko2en_translator(textList)
    ko2en_text = [out['translation_text'] for out in output]

    en2ko_translator = pipeline(
        'translation',
        model=en2ko_model,
        tokenizer=en2ko_tokenizer,
        src_lang="eng_Latn",
        tgt_lang="kor_Hang",
    )

    # English -> Korean (iterate over the pipeline output, not the empty list)
    output = en2ko_translator(ko2en_text)
    en2ko_text = [out['translation_text'] for out in output]

    # Re-attach the extracted particles to their segments.
    outList = []
    for emo, txt in zip(emojiList, en2ko_text):
        outList.append(txt + emo)

    return ''.join(outList).strip()


def conversion(source, text):
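    """Rewrite `text` in the requested speech style with the BART converter."""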
    # Korean style tags expected by the model prompt.
    formats = {
        "formal": "문어체",
        "informal": "구어체",
        "android": "안드로이드",
        "azae": "아재",
        "chat": "채팅",
        "choding": "초등학생",
        "emoticon": "이모티콘",
        "enfp": "enfp",
        "gentle": "신사",
        "halbae": "할아버지",
        "halmae": "할머니",
        "joongding": "중학생",
        "king": "왕",
        "naruto": "나루토",
        "seonbi": "선비",
        "sosim": "소심한",
        "translator": "번역기",
    }
    style = formats[source]

    # Prompt format: "<style> 형식으로 변환:<text>" ("convert to <style> form").
    input_text = f"{style} 형식으로 변환:" + text

    converter = pipeline(
        'text2text-generation',
        model=style_model,
        tokenizer=style_tokenizer,
    )

    output = converter(input_text)
    return output[0]['generated_text']


lang = ['English', 'Korean']
styles = ['formal', 'informal', 'android', 'azae', 'chat', 'choding', 'emoticon', 'enfp',
          'gentle', 'halbae', 'halmae', 'joongding', 'king', 'naruto', 'seonbi', 'sosim', 'translator']
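# Gradio UI: one tab per feature (translation, augmentation, style conversion).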
translation_app = gr.Interface(
    fn=translation,
    inputs=[
        gr.inputs.Dropdown(choices=lang, label='Source Language'),
        gr.inputs.Dropdown(choices=lang, label='Target Language'),
        gr.inputs.Textbox(lines=5, label='Text to Translate'),
    ],
    outputs=[gr.outputs.Textbox(label='Translated Text')],
    title="Translation",
    enable_queue=True,
)

augmentation_app = gr.Interface(
    fn=augmentation,
    inputs=[gr.inputs.Textbox(lines=5, label='Korean Text to Augment')],
    outputs=[gr.outputs.Textbox(label='Augmented Text')],
    title="Korean Data Augmentation (w/ back-translation)",
    enable_queue=True,
)

conversion_app = gr.Interface(
    fn=conversion,
    inputs=[
        gr.inputs.Dropdown(choices=styles, label='Speech Style'),
        gr.inputs.Textbox(lines=5, label='Text to Convert'),
    ],
    outputs=[gr.outputs.Textbox(label='Converted Text')],
    title="Speech Style Conversion",
    enable_queue=True,
)

demo = gr.TabbedInterface(
    [translation_app, augmentation_app, conversion_app],
    ["Translation", "Augmentation", "Speech Style Conversion"],
    title='🔥 If you want to download this as a pip package, please visit our GitHub: https://github.com/KoJLabs/KoTAN 🔥',
)

demo.launch()