Spaces:

NHNDQ
/

KoTAN

Runtime error

App Files Files Community

jisukim8873 committed on Jun 16, 2023

Commit

131e75e

1 Parent(s): c7f4c0d

minor update

Browse files

Files changed (1) hide show

app.py +67 -5

app.py CHANGED Viewed

@@ -14,6 +14,50 @@ en2ko_tokenizer = AutoTokenizer.from_pretrained(en2ko)
 ko2en_tokenizer = AutoTokenizer.from_pretrained(ko2en)
 style_tokenizer = AutoTokenizer.from_pretrained(style)
 def translation(source, target, text):
     formats = {"English":"eng_Latn", "Korean":"kor_Hang"}
     src = formats[source]
@@ -40,9 +84,14 @@ def translation(source, target, text):
     output = translator(text)
     translated_text = output[0]['translation_text']
-    return translated_text
 def augmentation(text):
     ko2en_translator = pipeline(
             'translation',
             model=ko2en_model,
@@ -51,8 +100,12 @@ def augmentation(text):
             tgt_lang="eng_Latn"
         )
-    output = ko2en_translator(text)
-    ko2en_text = output[0]['translation_text']
     en2ko_translator = pipeline(
             'translation',
@@ -63,9 +116,18 @@ def augmentation(text):
         )
     output = en2ko_translator(ko2en_text)
-    en2ko_text = output[0]['translation_text']
-    return en2ko_text
 def conversion(source, text):

 ko2en_tokenizer = AutoTokenizer.from_pretrained(ko2en)
 style_tokenizer = AutoTokenizer.from_pretrained(style)
def _post_process(self, text=None):
    """Split ``text`` around the tokens the tagger labels ``KoreanParticle``.

    The original variable names and comment call these tokens "emoji". The
    text is cut at the first occurrence of each such token, in the order the
    tagger reported them, producing parallel lists of markers and of the text
    segments that precede them.

    Args:
        self: Unused placeholder kept so existing two-argument calls keep
            working. When the function is called with a single positional
            argument, that argument arrives here and is used as the text.
        text: The string to split. May be omitted (see ``self``).

    Returns:
        A ``(markers, segments)`` tuple of lists. ``segments[i]`` is the text
        found before ``markers[i]``. When the last marker is reached, the
        remaining text is added as a final segment and paired with an empty
        marker ``''``. When the tagger reports no markers at all, the result
        is ``([''], [text])``.
    """
    # This is a module-level function, yet it was declared with ``self`` and
    # is called elsewhere in the file with the text as its only argument.
    # Accept both call shapes instead of failing with a TypeError.
    if text is None:
        text = self

    markers = [
        token for token, tag in Twitter().pos(text) if tag == 'KoreanParticle'
    ]

    # No markers: the whole text is a single segment with an empty marker.
    if not markers:
        return [''], [text]

    segments = []
    last_index = len(markers) - 1
    # Iterate by index over the initial marker count; the list itself is
    # extended with a trailing '' below and must not lengthen the loop.
    for index in range(last_index + 1):
        head, found, tail = text.partition(markers[index])
        segments.append(head)
        if not found:
            # The marker does not occur in the remaining text; stop here.
            break
        if index == last_index:
            # Keep whatever follows the final marker (not stripped) and give
            # it an empty marker so both lists stay aligned.
            segments.append(tail)
            markers.append('')
            break
        # Continue searching in the remainder, without surrounding spaces.
        text = tail.strip()
    return markers, segments
 def translation(source, target, text):
     formats = {"English":"eng_Latn", "Korean":"kor_Hang"}
     src = formats[source]
     output = translator(text)
     translated_text = output[0]['translation_text']
+    if (text == '') or (text == '!') or (text == '?') or (text == '.') or (text == ','):
+        return text
+    else:
+        return translated_text
 def augmentation(text):
+    emojiList, textList = _post_process(text)
     ko2en_translator = pipeline(
             'translation',
             model=ko2en_model,
             tgt_lang="eng_Latn"
         )
+    output = ko2en_translator(textList)
+    outputs = []
+    for out in output:
+        outputs.append(out['translation_text'])
+    ko2en_text = outputs
     en2ko_translator = pipeline(
             'translation',
         )
     output = en2ko_translator(ko2en_text)
+    en2ko_text = []
+    for txt in en2ko_text:
+        en2ko_text.append(txt['translation_text'])
+    outList = []
+    for emo, txt in zip(emojiList, en2ko_text):
+        output = txt + emo
+        outList.append(output)
+        output = ''.join(outList).strip()
+    return output
 def conversion(source, text):