jisukim8873 committed on
Commit
131e75e
·
1 Parent(s): c7f4c0d

minor update

Browse files
Files changed (1) hide show
  1. app.py +67 -5
app.py CHANGED
@@ -14,6 +14,50 @@ en2ko_tokenizer = AutoTokenizer.from_pretrained(en2ko)
14
  ko2en_tokenizer = AutoTokenizer.from_pretrained(ko2en)
15
  style_tokenizer = AutoTokenizer.from_pretrained(style)
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def translation(source, target, text):
18
  formats = {"English":"eng_Latn", "Korean":"kor_Hang"}
19
  src = formats[source]
@@ -40,9 +84,14 @@ def translation(source, target, text):
40
  output = translator(text)
41
  translated_text = output[0]['translation_text']
42
 
43
- return translated_text
 
 
 
44
 
45
  def augmentation(text):
 
 
46
  ko2en_translator = pipeline(
47
  'translation',
48
  model=ko2en_model,
@@ -51,8 +100,12 @@ def augmentation(text):
51
  tgt_lang="eng_Latn"
52
  )
53
 
54
- output = ko2en_translator(text)
55
- ko2en_text = output[0]['translation_text']
 
 
 
 
56
 
57
  en2ko_translator = pipeline(
58
  'translation',
@@ -63,9 +116,18 @@ def augmentation(text):
63
  )
64
 
65
  output = en2ko_translator(ko2en_text)
66
- en2ko_text = output[0]['translation_text']
 
 
 
 
 
 
 
 
 
67
 
68
- return en2ko_text
69
 
70
 
71
  def conversion(source, text):
 
14
  ko2en_tokenizer = AutoTokenizer.from_pretrained(ko2en)
15
  style_tokenizer = AutoTokenizer.from_pretrained(style)
16
 
17
def _post_process(self, text=None):
    """Split ``text`` around the tokens tagged ``KoreanParticle``.

    The text is cut at each ``KoreanParticle`` token reported by
    ``Twitter().pos`` (the file treats these tokens as "emoji"), so the
    plain-text segments can be processed separately and the particles
    re-attached afterwards.

    Args:
        self: Kept for backward compatibility. The function is called as
            ``_post_process(text)`` elsewhere in the file, so when ``text``
            is omitted this first positional argument is used as the text.
        text: The string to split.

    Returns:
        A tuple ``(emojiList, textList)`` of equal-length lists.
        ``textList[i]`` is the segment that precedes ``emojiList[i]``; the
        final segment is paired with an empty string. When no particle is
        found the result is ``([''], [text])``.
    """
    # Accept the one-argument call form used by the caller; with the
    # original two-required-parameter signature that call raised TypeError.
    if text is None:
        text = self

    tagger = Twitter()
    particles = [
        token for token, tag in tagger.pos(text) if tag == 'KoreanParticle'
    ]

    emojiList = []
    textList = []
    remaining = text  # text the next particle is searched in
    trailing = text   # segment emitted after the last particle
    for particle in particles:
        head, found, tail = remaining.partition(particle)
        if not found:
            # The particle is not present verbatim in what is left: stop
            # here and emit the rest as the final segment so that both
            # lists stay aligned.
            trailing = remaining
            break
        textList.append(head)
        emojiList.append(particle)
        # The final segment keeps its surrounding whitespace, while the
        # text searched by the next iteration is stripped.
        trailing = tail
        remaining = tail.strip()

    # Final segment (or the whole text when there are no particles) is
    # paired with an empty separator.
    textList.append(trailing)
    emojiList.append('')

    return emojiList, textList
60
+
61
  def translation(source, target, text):
62
  formats = {"English":"eng_Latn", "Korean":"kor_Hang"}
63
  src = formats[source]
 
84
  output = translator(text)
85
  translated_text = output[0]['translation_text']
86
 
87
+ if (text == '') or (text == '!') or (text == '?') or (text == '.') or (text == ','):
88
+ return text
89
+ else:
90
+ return translated_text
91
 
92
  def augmentation(text):
93
+ emojiList, textList = _post_process(text)
94
+
95
  ko2en_translator = pipeline(
96
  'translation',
97
  model=ko2en_model,
 
100
  tgt_lang="eng_Latn"
101
  )
102
 
103
+ output = ko2en_translator(textList)
104
+ outputs = []
105
+
106
+ for out in output:
107
+ outputs.append(out['translation_text'])
108
+ ko2en_text = outputs
109
 
110
  en2ko_translator = pipeline(
111
  'translation',
 
116
  )
117
 
118
  output = en2ko_translator(ko2en_text)
119
+
120
+ en2ko_text = []
121
+ for txt in en2ko_text:
122
+ en2ko_text.append(txt['translation_text'])
123
+
124
+ outList = []
125
+ for emo, txt in zip(emojiList, en2ko_text):
126
+ output = txt + emo
127
+ outList.append(output)
128
+ output = ''.join(outList).strip()
129
 
130
+ return output
131
 
132
 
133
  def conversion(source, text):