Commit 12f6a20
Parent(s): a872010

Upload languages.py

languages.py ADDED (+85 -0)
@@ -0,0 +1,85 @@
from docx import Document  # retained from the original upload; unused in this module
import os
import sys
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from mosestokenizer import MosesSentenceSplitter
from indicnlp.tokenize import sentence_tokenize


# import zipfile
# with zipfile.ZipFile(r"C:\Users\Prince Raj\Desktop\BOT\cuad-training\data.zip") as zip_ref:
#     zip_ref.extractall(r"C:\Users\Prince Raj\Desktop\BOT\cuad-training")
os.chdir(r"C:\Users\Prince Raj\Desktop\BOT\transformers")

# Load the NLLB-200 distilled 600M checkpoint once at import time.
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Map human-readable language names to NLLB-200 (FLORES-200) language codes.
lang_dict = {
    'english': 'eng_Latn',
    'assamese': 'asm_Beng',
    'awadhi': 'awa_Deva',
    'bengali': 'ben_Beng',
    'bhojpuri': 'bho_Deva',
    'gujarati': 'guj_Gujr',
    'hindi': 'hin_Deva',
    'kannada': 'kan_Knda',
    'kashmiri': 'kas_Deva',
    'maithili': 'mai_Deva',
    'malayalam': 'mal_Mlym',
    'marathi': 'mar_Deva',
    'odia': 'ory_Orya',
    'punjabi': 'pan_Guru',
    'sanskrit': 'san_Deva',
    'sindhi': 'snd_Arab',
    'tamil': 'tam_Taml',
    'telugu': 'tel_Telu',
    'urdu': 'urd_Arab',
}


def translate_sentence(article, target):
    """Translate a single sentence into the target language with NLLB-200."""
    inputs = tokenizer(article, return_tensors="pt").to(device)

    # Force the decoder to start with the target-language token.
    translated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id[lang_dict[target]],
        max_length=100,
    )

    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]


# Languages with a dedicated Indic NLP sentence splitter (ISO 639-1 codes).
INDIC_DICT = {
    'assamese': 'as',
    'bengali': 'bn',
    'gujarati': 'gu',
    'hindi': 'hi',
    'kannada': 'kn',
    'malayalam': 'ml',
    'marathi': 'mr',
    'odia': 'or',
    'punjabi': 'pa',
    'tamil': 'ta',
    'telugu': 'te',
}


def split_sentences(paragraph, language):
    """Split a paragraph into sentences with a language-appropriate splitter."""
    if language in INDIC_DICT:
        return sentence_tokenize.sentence_split(paragraph, lang=INDIC_DICT[language])
    elif language == 'english':  # was 'en', which never matched the full names passed in
        with MosesSentenceSplitter('en') as splitter:
            return splitter([paragraph])
    else:
        # Fallback: naive split on full stops.
        return paragraph.split(".")


def languages(paragraph, source, target):
    """Translate a paragraph, splitting long inputs sentence by sentence."""
    if len(paragraph.split()) < 100:
        return translate_sentence(paragraph, target)
    else:
        sentences = split_sentences(paragraph, source)
        outputs = []
        for each_sentence in sentences:
            outputs.append(translate_sentence(each_sentence, target))
        return " ".join(outputs)


# Replace this module in sys.modules so `import languages` yields the function itself.
sys.modules[__name__] = languages

# sent = "I am hungry now"
# src = "english"
# trg = "hindi"
# print(languages(sent, src, trg))
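Because the last line swaps the module object in sys.modules for the `languages` function, importing this file binds a callable rather than a module. A minimal usage sketch, assuming languages.py is on sys.path, the hard-coded os.chdir path exists, and the NLLB weights download on first run:

import languages  # the import statement yields the `languages` function, not a module

# Translate a short English sentence into Hindi.
print(languages("I am hungry now", "english", "hindi"))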