# KoTAN / app.py
# Uploaded by jisukim8873
# Last commit: "minor update" (131e75e)
import os
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# Checkpoint identifiers handed to `from_pretrained` below: one per
# translation direction plus the speech-style converter.
# NOTE: the module-level name `style` is rebound further down to the list of
# style names offered in the UI; it is only used as a checkpoint id here.
en2ko = "KoJLabs/nllb-finetuned-en2ko"
ko2en = "KoJLabs/nllb-finetuned-ko2en"
style = "KoJLabs/bart-speech-style-converter"

# Everything is loaded once at import time and shared by the request
# handlers defined below.

# English -> Korean
en2ko_model = AutoModelForSeq2SeqLM.from_pretrained(en2ko)
en2ko_tokenizer = AutoTokenizer.from_pretrained(en2ko)

# Korean -> English
ko2en_model = AutoModelForSeq2SeqLM.from_pretrained(ko2en)
ko2en_tokenizer = AutoTokenizer.from_pretrained(ko2en)

# Speech-style conversion
style_model = AutoModelForSeq2SeqLM.from_pretrained(style)
style_tokenizer = AutoTokenizer.from_pretrained(style)
def _post_process(self, text=None):
    """Split *text* around the tokens tagged ``KoreanParticle``.

    The tagger's ``KoreanParticle`` tokens (presumably emoticon-like runs
    such as laughing/crying jamo -- confirm against the tagger's tag set)
    are pulled out of the text so that only the plain segments between them
    need to be processed by the caller.

    Args:
        self: Unused placeholder kept for backward compatibility; this is a
            plain module-level function. When ``text`` is omitted, this
            positional value is taken to be the text, so both
            ``_post_process(text)`` and ``_post_process(None, text)`` work.
        text: The string to split.

    Returns:
        A ``(emoji_list, text_list)`` tuple of parallel lists:
        ``text_list[k]`` is the segment that precedes ``emoji_list[k]``.
        The text after the last particle is paired with a trailing ``''``
        entry in ``emoji_list``. With no particle at all the result is
        ``([''], [text])``.
    """
    # Backward-compatible shim for callers that pass only the text.
    if text is None:
        text = self

    # NOTE(review): `Twitter` is not imported in the visible part of this
    # file; it presumably comes from `konlpy.tag` -- confirm and import it.
    tagger = Twitter()
    particles = [
        token for token, tag in tagger.pos(text) if tag == 'KoreanParticle'
    ]

    # No particle: the whole text is one segment with an empty suffix.
    if not particles:
        return [''], [text]

    segments = []
    remaining = text
    trailing = ''
    for particle in particles:
        head, found, tail = remaining.partition(particle)
        segments.append(head)
        if not found:
            # Best effort: the tagger reported a token that is no longer
            # present in the remaining text; stop splitting here.
            return particles, segments
        # Inner segments are stripped before the next split; the final
        # trailing segment is kept exactly as it appears in the input.
        trailing = tail
        remaining = tail.strip()

    # Whatever follows the last particle is a segment with no suffix.
    segments.append(trailing)
    return particles + [''], segments
def translation(source, target, text):
    """Translate *text* between English and Korean.

    Args:
        source: Source language name, ``"English"`` or ``"Korean"``.
        target: Target language name, ``"English"`` or ``"Korean"``.
        text: The text to translate.

    Returns:
        The translated text. An empty string or a lone ``!``, ``?``, ``.``
        or ``,`` is returned unchanged without running the model.

    Raises:
        KeyError: If ``source`` or ``target`` is not a supported language
            name.
    """
    formats = {"English": "eng_Latn", "Korean": "kor_Hang"}
    src = formats[source]
    tgt = formats[target]

    # Nothing meaningful to translate: skip pipeline construction and
    # inference entirely instead of computing a result that is discarded.
    if text in ('', '!', '?', '.', ','):
        return text

    # The checkpoint is chosen by the source language only; the target code
    # is simply forwarded to the pipeline.
    if src == "eng_Latn":
        model, tokenizer = en2ko_model, en2ko_tokenizer
    else:
        model, tokenizer = ko2en_model, ko2en_tokenizer

    translator = pipeline(
        'translation',
        model=model,
        tokenizer=tokenizer,
        src_lang=src,
        tgt_lang=tgt,
    )
    return translator(text)[0]['translation_text']
def augmentation(text):
    """Augment Korean *text* by back-translation (Korean -> English -> Korean).

    The text is first split by ``_post_process`` into plain segments and the
    particle tokens that follow them. Only the segments are translated; each
    particle is re-attached unchanged after its back-translated segment.

    Args:
        text: Korean input text.

    Returns:
        The back-translated segments, each followed by its original
        particle, concatenated and stripped of surrounding whitespace.
    """
    # `_post_process` declares a leading `self` parameter even though it is
    # a plain function, so a placeholder is passed for it.
    emoji_list, text_list = _post_process(None, text)

    ko2en_translator = pipeline(
        'translation',
        model=ko2en_model,
        tokenizer=ko2en_tokenizer,
        src_lang="kor_Hang",
        tgt_lang="eng_Latn",
    )
    english_segments = [
        item['translation_text'] for item in ko2en_translator(text_list)
    ]

    en2ko_translator = pipeline(
        'translation',
        model=en2ko_model,
        tokenizer=en2ko_tokenizer,
        src_lang="eng_Latn",
        tgt_lang="kor_Hang",
    )
    # Collect from the translator's output. The previous code iterated the
    # freshly created (empty) result list, so nothing was ever collected and
    # the function always returned an empty string.
    korean_segments = [
        item['translation_text'] for item in en2ko_translator(english_segments)
    ]

    return ''.join(
        segment + emoji for emoji, segment in zip(emoji_list, korean_segments)
    ).strip()
def conversion(source, text):
    """Rewrite *text* in the requested Korean speech style.

    Args:
        source: Style key, one of the keys of ``formats`` below
            (``"formal"``, ``"informal"``, ``"android"``, ...).
        text: The text to convert.

    Returns:
        The text generated by the style-conversion model.

    Raises:
        KeyError: If ``source`` is not a known style key.
    """
    # Style key -> Korean style name used inside the model prompt.
    # The Korean literals had been corrupted into mojibake (UTF-8 bytes
    # decoded with a single-byte codec); they are restored to Hangul here.
    formats = {
        "formal": "문어체",
        "informal": "구어체",
        "android": "안드로이드",
        "azae": "아재",
        "chat": "채팅",
        "choding": "초등학생",
        "emoticon": "이모티콘",
        "enfp": "enfp",
        "gentle": "신사",
        "halbae": "할아버지",
        "halmae": "할머니",
        "joongding": "중학생",
        "king": "왕",
        "naruto": "나루토",
        "seonbi": "선비",
        "sosim": "소심한",
        "translator": "번역기",
    }
    style = formats[source]

    # Prompt layout: "<style name> 형식으로 변환:<text>", i.e. the style name,
    # the Korean phrase for "convert to ... format:", then the raw text.
    input_text = f"{style} 형식으로 변환:" + text

    converter = pipeline(
        'text2text-generation',
        model=style_model,
        tokenizer=style_tokenizer,
    )
    output = converter(input_text)
    return output[0]['generated_text']
# Choices offered by the language dropdowns of the translation tab.
lang = ['English', 'Korean']

# Choices offered by the speech-style dropdown of the conversion tab.
style = [
    'formal',
    'informal',
    'android',
    'azae',
    'chat',
    'choding',
    'emoticon',
    'enfp',
    'gentle',
    'halbae',
    'halmae',
    'joongding',
    'king',
    'naruto',
    'seonbi',
    'sosim',
    'translator',
]
# Tab 1: translation between the two supported languages.
translation_app = gr.Interface(
    title="Translation",
    fn=translation,
    inputs=[
        gr.inputs.Dropdown(choices=lang, label='Source Language'),
        gr.inputs.Dropdown(choices=lang, label='Target Language'),
        gr.inputs.Textbox(lines=5, label='Text to Translate'),
    ],
    outputs=[
        gr.outputs.Textbox(label='Translated Text'),
    ],
    enable_queue=True,
)

# Tab 2: augmentation of Korean text via back-translation.
augmentation_app = gr.Interface(
    title="Korean Data Augmentation (w.backtranslation)",
    fn=augmentation,
    inputs=[
        gr.inputs.Textbox(lines=5, label='Korean Text to Augmentation'),
    ],
    outputs=[
        gr.outputs.Textbox(label='Augmented Text'),
    ],
    enable_queue=True,
)

# Tab 3: speech-style conversion.
conversion_app = gr.Interface(
    title="Speech Style Conversion",
    fn=conversion,
    inputs=[
        gr.inputs.Dropdown(choices=style, label='Speech Style'),
        gr.inputs.Textbox(lines=5, label='Text to style conversion'),
    ],
    outputs=[
        gr.outputs.Textbox(label='Converted Text'),
    ],
    enable_queue=True,
)
# Combine the three interfaces into one tabbed demo and start serving it.
# The title is assembled by implicit string concatenation rather than a
# backslash continuation inside the literal, so source indentation can never
# leak into the displayed text. The fire emoji had been corrupted into
# mojibake and is restored here.
demo = gr.TabbedInterface(
    [translation_app, augmentation_app, conversion_app],
    ["Translation", "Augmentation", "Speech Style conversion"],
    title=(
        '🔥If you want to download as pip package, '
        'please visit our github. (https://github.com/KoJLabs/KoTAN) 🔥'
    ),
)
demo.launch()