"""Gradio demo: EN<->KO translation, back-translation augmentation, and
Korean speech-style conversion, backed by three KoJLabs seq2seq models."""

import os

import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Hugging Face model identifiers.
en2ko = "KoJLabs/nllb-finetuned-en2ko"
ko2en = "KoJLabs/nllb-finetuned-ko2en"
style = "KoJLabs/bart-speech-style-converter"

# Models and tokenizers are loaded once at import time and shared by every
# request handler below.
en2ko_model = AutoModelForSeq2SeqLM.from_pretrained(en2ko)
ko2en_model = AutoModelForSeq2SeqLM.from_pretrained(ko2en)
style_model = AutoModelForSeq2SeqLM.from_pretrained(style)

en2ko_tokenizer = AutoTokenizer.from_pretrained(en2ko)
ko2en_tokenizer = AutoTokenizer.from_pretrained(ko2en)
style_tokenizer = AutoTokenizer.from_pretrained(style)

# UI language label -> language code handed to the translation pipeline.
_LANG_CODES = {"English": "eng_Latn", "Korean": "kor_Hang"}

# Inputs that `translation` returns unchanged instead of translating.
_PASSTHROUGH_TEXTS = ('', '!', '?', '.', ',')

# UI style key -> Korean style name inserted into the conversion prompt.
_STYLE_NAMES = {
    "formal": "문어체",
    "informal": "구어체",
    "android": "안드로이드",
    "azae": "아재",
    "chat": "채팅",
    "choding": "초등학생",
    "emoticon": "이모티콘",
    "enfp": "enfp",
    "gentle": "신사",
    "halbae": "할아버지",
    "halmae": "할머니",
    "joongding": "중학생",
    "king": "왕",
    "naruto": "나루토",
    "seonbi": "선비",
    "sosim": "소심한",
    "translator": "번역기",
}


def _build_translator(src, tgt):
    """Return a translation pipeline for the given source/target codes.

    English sources use the en2ko model; any other source uses the ko2en
    model, mirroring the model choice of the original inline code.
    """
    if src == "eng_Latn":
        model, tokenizer = en2ko_model, en2ko_tokenizer
    else:
        model, tokenizer = ko2en_model, ko2en_tokenizer
    return pipeline(
        'translation',
        model=model,
        tokenizer=tokenizer,
        src_lang=src,
        tgt_lang=tgt,
    )


def _post_process(text):
    """Split *text* around the tokens the tagger labels 'KoreanParticle'.

    The surrounding code treats those tokens as emoji/emoticons that must
    not be sent through the translation models.

    Args:
        text: The input sentence.

    Returns:
        A tuple ``(emojiList, textList)``: the particle tokens in order of
        appearance, and the text segments found before each of them. When
        every particle is located, a final ``''`` particle is paired with
        the trailing text so the two lists can be zipped back together.
        When no particle exists the result is ``([''], [text])``.
    """
    # NOTE(review): `Twitter` is not imported anywhere in this file, so this
    # line raises NameError as written. It is presumably KoNLPy's tagger
    # (`from konlpy.tag import Twitter`) - confirm and add the import.
    tagger = Twitter()

    # Each tagged entry is indexed as (token, tag).
    particles = [
        entry[0] for entry in tagger.pos(text) if entry[1] == 'KoreanParticle'
    ]
    emojiList = list(particles)
    textList = []

    final_index = len(particles) - 1
    for index, particle in enumerate(particles):
        head, found, tail = text.partition(particle)
        textList.append(head)

        if not found:
            # The particle is no longer present in the remaining text
            # (e.g. consumed by an earlier split); stop splitting here.
            break

        text = tail.strip()

        if index == final_index:
            # Last particle: keep whatever follows it (unstripped) and pair
            # it with an empty particle.
            textList.append(tail)
            emojiList.append('')

    # No particle/emoji in the input at all.
    if not emojiList:
        emojiList.append('')
        textList.append(text)

    return emojiList, textList


def translation(source, target, text):
    """Translate *text* between English and Korean.

    Args:
        source: Source language label, ``"English"`` or ``"Korean"``.
        target: Target language label, ``"English"`` or ``"Korean"``.
        text: Text to translate.

    Returns:
        The translated text, or *text* itself when it is empty or a single
        ``!``, ``?``, ``.`` or ``,``.

    Raises:
        KeyError: If *source* or *target* is not a known language label.
    """
    src = _LANG_CODES[source]
    tgt = _LANG_CODES[target]

    # Decide before running the model: these inputs are returned verbatim,
    # so inference on them would be wasted work.
    if text in _PASSTHROUGH_TEXTS:
        return text

    translator = _build_translator(src, tgt)
    return translator(text)[0]['translation_text']


def augmentation(text):
    """Paraphrase Korean *text* by back-translation (KO -> EN -> KO).

    The text is split around 'KoreanParticle' tokens first; only the text
    segments are translated, and each particle is re-attached after its
    segment afterwards.

    Args:
        text: Korean text to augment.

    Returns:
        The back-translated text with the particles restored.
    """
    emojiList, textList = _post_process(text)

    ko2en_translator = _build_translator("kor_Hang", "eng_Latn")
    ko2en_text = [out['translation_text'] for out in ko2en_translator(textList)]

    en2ko_translator = _build_translator("eng_Latn", "kor_Hang")
    # Collect from the pipeline output. The previous code iterated over the
    # (empty) result list itself, so the function always returned ''.
    en2ko_text = [out['translation_text'] for out in en2ko_translator(ko2en_text)]

    pieces = [txt + emo for emo, txt in zip(emojiList, en2ko_text)]
    return ''.join(pieces).strip()


def conversion(source, text):
    """Rewrite *text* in the requested speech style.

    Args:
        source: Style key, one of the entries of the module-level ``style``
            list (e.g. ``"formal"``, ``"king"``).
        text: Text to convert.

    Returns:
        The text generated by the style-conversion model.

    Raises:
        KeyError: If *source* is not a known style key.
    """
    style_name = _STYLE_NAMES[source]

    # Prompt: "<style name> 형식으로 변환:" followed directly by the text.
    input_text = f"{style_name} 형식으로 변환:" + text

    converter = pipeline(
        'text2text-generation',
        model=style_model,
        tokenizer=style_tokenizer,
    )
    return converter(input_text)[0]['generated_text']


lang = ['English', 'Korean']
# Dropdown choices; derived from the style table so the two cannot drift.
style = list(_STYLE_NAMES)

# NOTE(review): `gr.inputs.*`, `gr.outputs.*` and `enable_queue=` are the
# legacy Gradio API - confirm the pinned Gradio version still provides them.
translation_app = gr.Interface(
    fn=translation,
    inputs=[gr.inputs.Dropdown(choices=lang, label='Source Language'),
            gr.inputs.Dropdown(choices=lang, label='Target Language'),
            gr.inputs.Textbox(lines=5, label='Text to Translate')],
    outputs=[gr.outputs.Textbox(label='Translated Text')],
    title="Translation",
    enable_queue=True,
)

augmentation_app = gr.Interface(
    fn=augmentation,
    inputs=[gr.inputs.Textbox(lines=5, label='Korean Text to Augmentation')],
    outputs=[gr.outputs.Textbox(label='Augmented Text')],
    title="Korean Data Augmentation (w.backtranslation)",
    enable_queue=True,
)

conversion_app = gr.Interface(
    fn=conversion,
    inputs=[gr.inputs.Dropdown(choices=style, label='Speech Style'),
            gr.inputs.Textbox(lines=5, label='Text to style conversion')],
    outputs=[gr.outputs.Textbox(label='Converted Text')],
    title="Speech Style Conversion",
    enable_queue=True,
)

demo = gr.TabbedInterface(
    [translation_app, augmentation_app, conversion_app],
    ["Translation", "Augmentation", "Speech Style conversion"],
    title='🔥If you want to download as pip package, '
          'please visit our github. (https://github.com/KoJLabs/KoTAN) 🔥',
)

demo.launch()