muhammadayman committed on
Commit
c47d401
·
1 Parent(s): a947b14

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -58
app.py CHANGED
@@ -2,71 +2,20 @@ import sys
2
  import gradio as gr
3
  from transformers import AutoTokenizer
4
  import torch
5
- import json, re
6
-
7
# Tokenizer for the Helsinki-NLP English->Arabic Marian checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
# Fine-tuned model unpickled onto CPU.
# NOTE(review): torch.load unpickles arbitrary objects — only load trusted files.
model = torch.load("helsinki_fineTuned.pt", map_location=torch.device('cpu'))
# Inference mode: disables dropout / freezes batch-norm statistics.
model.eval()
10
-
11
-
12
# Load the English->Arabic keyword glossary from merged.json.
# Keys are lower-cased English terms so lookups can be case-insensitive.
with open('merged.json', encoding='utf8') as merged:
    data = json.load(merged)
# The `with` statement already closes the file, so the original's explicit
# merged.close() was redundant; the manual loop is now a dict comprehension.
keyword_map = {en.lower(): translation for en, translation in data.items()}
19
-
20
# Getting keywords from the glossary
def getKeywords(word):
    """Look up *word* in the module-level ``keyword_map`` glossary.

    The key is the lower-cased, right-stripped form of *word*.

    Returns:
        (translation, True) when the key has a truthy glossary entry;
        (original word, False) otherwise.
    """
    key = word.lower().rstrip()
    # Single dict lookup (the original looked the key up twice: get then []).
    translation = keyword_map.get(key)
    if translation:
        return translation, True
    return word, False
26
-
27
-
28
# A run of Latin letters (optionally containing spaces, hyphens, parentheses)
# embedded in otherwise-Arabic text. Compiled once at import; the raw string
# avoids the invalid '\-' escape warning the original non-raw literal triggers
# on modern Python.
_ENGLISH_RUN = re.compile(r'[a-zA-Z][ \-()a-zA-Z]+')


# Replace keywords with the translation
def final_output(text):
    """Re-insert glossary translations for English runs found in *text*.

    Each English run is passed through ``getKeywords``; on a glossary hit the
    translation is inserted with the original English kept in parentheses,
    otherwise the run is kept as-is. The pieces are re-joined with spaces.
    """
    keywords = _ENGLISH_RUN.findall(text)
    # split() on the same pattern yields len(keywords) + 1 pieces;
    # piece i is the text that precedes keyword i.
    pieces = _ENGLISH_RUN.split(text)
    for i, keyword in enumerate(keywords):
        word, found = getKeywords(keyword)
        if found:
            pieces[i] += f"{word} ({keyword.rstrip()})"
        else:
            pieces[i] += word
    return ' '.join(pieces)
43
-
44
-
45
-
46
def translate(input):
    """Translate English text to Arabic, sentence by sentence.

    The text is split on '.', each piece is run through the seq2seq model,
    post-processed with ``final_output`` (keyword re-insertion), and the
    translated pieces are re-joined with '.'.
    """
    translation = []
    # Iterate the sentences directly instead of indexing via range(len(...)).
    for sentence in input.split('.'):
        # NOTE(review): prepare_seq2seq_batch is deprecated in recent
        # transformers releases (call tokenizer(...) directly instead) —
        # confirm against the pinned transformers version.
        batch = tokenizer.prepare_seq2seq_batch(sentence, return_tensors='pt')
        encode = model.generate(**batch)
        text_ar = tokenizer.batch_decode(encode, skip_special_tokens=True)[0]
        translation.append(final_output(text_ar))
    return ".".join(translation)
58
-
59
-
60
# Gradio UI: a 7-line English textbox fed through `translate` to a
# plain-text Arabic output; flagged inputs are stored under flagged/logs.
translate_interface = gr.Interface(
    fn=translate,
    allow_flagging=True,
    flagging_dir='flagged/logs',
    title='Translating "English Data Science" content into Arabic',
    inputs=gr.inputs.Textbox(lines=7, label='English content'),
    outputs="text",
    examples=[
        ['In the last few years the RNN-based architectures have shown the best performance in machine translation problems, but still they have some problems that had to be solved. First, they have a difficulty to cope with long-range dependencies (also LSTM when it has to deal with really long sentences). Secondly, each hidden state depends on the previous one which impossible to parallelize and makes it inefficient on GPUs.'],
        ['Data science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from noisy, structured and unstructured data,[1][2] and apply knowledge and actionable insights from data across a broad range of application domains. Data science is related to data mining, machine learning and big data.'],
    ],
)
# inline=False: serve the app rather than embedding it inline.
translate_interface.launch(inline=False)
 
2
  import gradio as gr
3
  from transformers import AutoTokenizer
4
  import torch
 
 
5
# Tokenizer for the Helsinki-NLP English->Arabic Marian checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
# Fine-tuned model unpickled onto CPU.
# NOTE(review): torch.load unpickles arbitrary objects — only load trusted files.
model = torch.load("helsinki_fineTuned.pt", map_location=torch.device('cpu'))
# Inference mode: disables dropout / freezes batch-norm statistics.
model.eval()
8
def translate_gradio(input):
    """Translate one English string to Arabic with the fine-tuned model.

    Gradio calls this with the textbox contents; the first (and only)
    decoded sequence is returned as plain text.
    """
    batch = tokenizer.prepare_seq2seq_batch([input], return_tensors='pt')
    generated = model.generate(**batch)
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]
13
# Gradio UI: a 7-line English textbox fed through `translate_gradio` to a
# plain-text Arabic output; flagged inputs are stored under flagged/logs.
translate_interface = gr.Interface(
    fn=translate_gradio,
    allow_flagging=True,
    flagging_dir='flagged/logs',
    title='Translating "English Data Science" content into Arabic',
    inputs=gr.inputs.Textbox(lines=7, label='english content'),
    outputs="text",
    examples=[
        ['In the last few years the RNN-based architectures have shown the best performance in machine translation problems, but still they have some problems that had to be solved. First, they have a difficulty to cope with long-range dependencies (also LSTM when it has to deal with really long sentences). Secondly, each hidden state depends on the previous one which impossible to parallelize and makes it inefficient on GPUs.'],
    ],
)
# inline=False: serve the app rather than embedding it inline.
translate_interface.launch(inline=False)