Paulie-Aditya committed on
Commit
d1f4023
·
verified ·
1 Parent(s): f00dd8a

Setting up

Browse files
Files changed (3) hide show
  1. README.md +16 -12
  2. app.py +22 -0
  3. main.py +77 -0
README.md CHANGED
@@ -1,12 +1,16 @@
1
- ---
2
- title: Text To Text Translator
3
- emoji: 📚
4
- colorFrom: red
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 4.32.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
1
+ ---
2
+ title: Text_to_Text_Translator
3
+ app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 4.32.0
6
+ ---
7
+ ## Text to Text Translator
8
+
9
+ A text-to-text translator built with NLTK and Hugging Face Transformers.
10
+ - Supports translation from English to Bengali, Tamil, Telugu, Gujarati, Marathi, and Hindi.
11
+ - Uses BanglaT5, which achieved an exceptional score of <b>25.2</b> on the SacreBLEU metric, while mT5 (the industry standard) scored much lower at <b>22.5</b>.
12
+
13
+
14
+ Future Work:
15
+ - Add support for uploading images and files
16
+ - Run OCR on the uploaded files and translate the extracted text automatically
app.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #User Interface
2
+
3
+ import gradio as gr
4
+ import main
5
+
6
def test(text, src, dest):
    """Gradio callback: translate ``text`` from ``src`` to ``dest``.

    Args:
        text: The text typed into the input textbox.
        src: Language code chosen in the "Source" dropdown (e.g. ``"en_XX"``).
        dest: Language code chosen in the "Destination" dropdown.

    Returns:
        The translated text, or a short notice when a language is missing.
    """
    # A dropdown left without a selection would otherwise reach the language
    # code lookup in main.main_translation and fail there with a KeyError.
    if not src or not dest:
        return "Please select both a source and a destination language."

    # main_translation takes the destination code before the source code.
    ans = main.main_translation(text, dest, src)
    return ans['output']
9
# (display name, language code) pairs offered by both dropdowns. Kept in one
# place so the source and destination lists cannot drift apart.
LANGUAGE_CHOICES = (
    ("English", "en_XX"),
    ("Hindi", "hi_IN"),
    ("Bengali", "bn_IN"),
    ("Gujarati", "gu_IN"),
    ("Tamil", "ta_IN"),
    ("Telugu", "te_IN"),
    ("Marathi", "mr_IN"),
)

# Inputs are passed positionally to ``test`` as (text, src, dest).
demo = gr.Interface(
    test,
    [
        "textbox",
        gr.Dropdown(
            list(LANGUAGE_CHOICES),
            label="Source",
            info="Select the Source Language!",
        ),
        gr.Dropdown(
            list(LANGUAGE_CHOICES),
            label="Destination",
            info="Select the Destination Language!",
        ),
    ],
    outputs=["textbox"],
)

# Only start the web server when this file is executed as a script, so that
# importing the module (for example to reuse ``demo``) has no side effects.
if __name__ == "__main__":
    demo.launch(share=True)
main.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from transformers import pipeline
3
+ import nltk
4
+ from nltk import sent_tokenize
5
+ from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
6
+ from transformers import pipeline
7
+
8
+ # nltk.download('punkt') # Run only once
9
+
10
# Module-level tokenizer with English as the source language.
tokenizer = MBart50TokenizerFast.from_pretrained("SnypzZz/Llama2-13b-Language-translate", src_lang="en_XX")
#pipe = pipeline("text2text-generation", model="SnypzZz/Llama2-13b-Language-translate", tokenizer=tokenizer)

# The translation model is loaded lazily by load_model(); None means
# "not loaded yet".
model = None
model_loaded = False

# API token sent as a Bearer credential by the Bengali translation path.
# The whole content of ./secret.py is used as the token, so surrounding
# whitespace (notably the trailing newline most editors add) must be removed
# or it ends up inside the HTTP Authorization header.
api_token_header = ""
try:
    with open('./secret.py', 'r', encoding='utf-8') as f:
        api_token_header = f.read().strip()
except FileNotFoundError:
    # Only the English -> Bengali path needs the token; keep the module
    # importable so the locally-run model still works without it.
    print("Warning: ./secret.py not found; continuing without an API token.")
18
+
19
def load_model():
    """Load the translation model once and return it.

    The model is stored in the module-level ``model`` variable and
    ``model_loaded`` is set to ``True``. Later calls return the
    already-loaded instance instead of reading the checkpoint again.

    Returns:
        The loaded ``MBartForConditionalGeneration`` model.
    """
    global model, model_loaded
    if model is None:
        model = MBartForConditionalGeneration.from_pretrained("SnypzZz/Llama2-13b-Language-translate")
    model_loaded = True
    return model
24
+
25
_MBART_CHECKPOINT = "SnypzZz/Llama2-13b-Language-translate"
_BANGLAT5_API_URL = "https://api-inference.huggingface.co/models/csebuetnlp/banglat5_nmt_en_bn"
_API_TIMEOUT_SECONDS = 60

# Tokenizers keyed by source-language code, so a multi-sentence request does
# not re-create the tokenizer for every sentence.
_tokenizers_by_src_lang = {}


def _tokenizer_for(src_lang_code):
    """Return a cached tokenizer configured for ``src_lang_code``."""
    cached = _tokenizers_by_src_lang.get(src_lang_code)
    if cached is None:
        cached = MBart50TokenizerFast.from_pretrained(_MBART_CHECKPOINT, src_lang=src_lang_code)
        _tokenizers_by_src_lang[src_lang_code] = cached
    return cached


def _translate_with_banglat5(text):
    """Translate English ``text`` to Bengali through the hosted inference API."""
    # Strip the token: a stray newline read from the secret file would make
    # the Authorization header invalid.
    token = api_token_header.strip()
    headers = {"Authorization": f"Bearer {token}"} if token else {}

    response = requests.post(
        _BANGLAT5_API_URL,
        headers=headers,
        json={"inputs": text},
        timeout=_API_TIMEOUT_SECONDS,
    )
    output = response.json()
    print(output)

    # A successful call yields a list of {"translation_text": ...} items;
    # anything else (e.g. an {"error": ...} object) is reported explicitly
    # instead of failing later with an opaque KeyError/TypeError.
    if (
        not isinstance(output, list)
        or not output
        or not isinstance(output[0], dict)
        or "translation_text" not in output[0]
    ):
        raise RuntimeError(f"Unexpected response from the translation API: {output!r}")
    return output[0]['translation_text']


def translation(text,dest_lang,dest_lang_code, src_lang_code):
    """Translate a single piece of text between two languages.

    English -> Bengali is sent to the hosted BanglaT5 endpoint; every other
    pair is handled by the locally loaded model.

    Args:
        text: Text to translate (callers pass one sentence at a time).
        dest_lang: Human-readable destination language name, e.g. "Bengali".
        dest_lang_code: Destination language code, e.g. "bn_IN".
        src_lang_code: Source language code, e.g. "en_XX".

    Returns:
        The translated text, or a notice when both codes are identical.

    Raises:
        RuntimeError: If the hosted API returns an unexpected payload.
    """
    if dest_lang_code == src_lang_code:
        return "Please select different languages to translate between."

    if dest_lang == "Bengali" and src_lang_code == "en_XX":
        return _translate_with_banglat5(text)

    # Load the local model on first use only.
    global model
    if model is None:
        model = load_model()

    tokenizer = _tokenizer_for(src_lang_code)
    model_inputs = tokenizer(text, return_tensors="pt")

    # Force the first generated token to be the destination language code so
    # the model decodes into that language.
    generated_tokens = model.generate(
        **model_inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id[dest_lang_code]
    )
    output = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    print(output)
    return output[0]
63
+
64
def main_translation(text,dest_lang_code,src_lang_code):
    """Translate ``text`` sentence by sentence and return the joined result.

    Args:
        text: The text to translate; may contain several sentences.
        dest_lang_code: Destination language code, e.g. "hi_IN".
        src_lang_code: Source language code, e.g. "en_XX".

    Returns:
        A dict ``{"output": <translated text>}``. The output is empty for
        empty input, and a notice when both language codes are the same.

    Raises:
        KeyError: If either language code is not supported.
    """
    codes = {"en_XX":"English","bn_IN":"Bengali", "en_GB":"English","gu_IN":"Gujarati","hi_IN":"Hindi","ta_IN":"Tamil","te_IN":"Telugu","mr_IN":"Marathi"}
    dest_lang = codes[dest_lang_code]
    # Validate the source code too, so an unsupported value still fails with
    # KeyError up front rather than deep inside the model call.
    codes[src_lang_code]

    if not text or not text.strip():
        return {"output": ""}

    # Report identical languages once, instead of once per sentence.
    if dest_lang_code == src_lang_code:
        return {"output": "Please select different languages to translate between."}

    translated = [
        translation(sentence, dest_lang, dest_lang_code, src_lang_code)
        for sentence in sent_tokenize(text)
    ]
    # Separate sentences with a space so they do not run together.
    return {"output": " ".join(translated)}
75
+
76
+
77
# Manual smoke test. Guarded so that importing this module (as the UI does)
# does not run a translation and print to stdout on startup.
if __name__ == "__main__":
    print(main_translation("hello world", "hi_IN", "en_XX"))