|
from nltk.tokenize import sent_tokenize |
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
|
import torch |
|
import src.exception.Exception.Exception as ExceptionCustom |
|
|
|
|
|
# Method tag passed to the shared exception/validation checker.
METHOD = "TRANSLATE"



# Romanian -> English translation model (MarianMT fine-tune).
tokenizerROMENG = AutoTokenizer.from_pretrained("BlackKakapo/opus-mt-ro-en")

modelROMENG = AutoModelForSeq2SeqLM.from_pretrained("BlackKakapo/opus-mt-ro-en")



# English -> Romanian translation model used for the back-translation step.
tokenizerENGROM = AutoTokenizer.from_pretrained("BlackKakapo/opus-mt-en-ro")

modelENGROM = AutoModelForSeq2SeqLM.from_pretrained("BlackKakapo/opus-mt-en-ro")



# Run both models on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

modelROMENG.to(device)

modelENGROM.to(device)
|
|
|
|
|
def paraphraseTranslateMethod(requestValue : str):
    """Paraphrase Romanian text via round-trip machine translation.

    Each sentence of *requestValue* is translated Romanian -> English and
    then back English -> Romanian with sampling enabled (do_sample=True),
    so the reconstructed sentence is a stochastic paraphrase of the input.

    :param requestValue: Romanian input text to paraphrase.
    :return: ``(paraphrased_text, error)`` — on validation failure the
             error string from the custom checker is returned with an
             empty result; otherwise the error slot is "".
    """
    # Validate the request first and bail out early with the error message.
    exception = ExceptionCustom.checkForException(requestValue, METHOD)
    if exception != "":
        return "", exception

    paraphrased_sentences = []

    # Pure inference: disable autograd tracking to save memory/time.
    with torch.no_grad():
        for sentence in sent_tokenize(requestValue):
            # Step 1: Romanian -> English.
            ro_inputs = tokenizerROMENG(sentence, return_tensors='pt').to(device)
            en_output = modelROMENG.generate(
                input_ids=ro_inputs.input_ids,
                do_sample=True,
                max_length=256,
                top_k=90,
                top_p=0.97,
                early_stopping=False
            )
            english = tokenizerROMENG.batch_decode(en_output, skip_special_tokens=True)[0]

            # Step 2: English -> Romanian (back-translation).
            en_inputs = tokenizerENGROM(english, return_tensors='pt').to(device)
            ro_output = modelENGROM.generate(
                input_ids=en_inputs.input_ids,
                do_sample=True,
                max_length=256,
                top_k=90,
                top_p=0.97,
                early_stopping=False
            )
            paraphrased_sentences.append(
                tokenizerENGROM.batch_decode(ro_output, skip_special_tokens=True)[0]
            )

    # Join with a trailing space after each sentence to preserve the exact
    # output format of the original concatenation loop (O(n) instead of
    # quadratic repeated string +=).
    result_value = "".join(s + " " for s in paraphrased_sentences)
    return result_value, ""