# Proba_Nos / translate.py
# JJFrancisco's picture
# Create translate.py
# 85387ea verified
import ctranslate2
from subword_nmt.apply_bpe import BPE
import codecs
import re
def apply_subwording(sample_text, model_code_path):
    """Segment ``sample_text`` into BPE subword units using subword-nmt.

    Args:
        sample_text: Text to segment; it is processed one line at a time.
        model_code_path: Path to the BPE codes file, opened as UTF-8.

    Returns:
        The segmented lines, each terminated by a newline. Text with no
        lines yields an empty string.
    """
    # BPE loads the merge codes in its constructor, so the file can be
    # closed as soon as the object exists instead of leaking the handle.
    with codecs.open(model_code_path, encoding="utf-8") as codes_file:
        bpe = BPE(codes_file)

    return "".join(
        bpe.process_line(line) + "\n" for line in sample_text.splitlines()
    )
def remove_subwording_marks(translated_text):
    """Undo BPE segmentation by deleting the ``@@`` continuation markers.

    Removes every ``"@@ "`` joiner, and also a marker left dangling at the
    end of a line (``"@@"`` immediately before a newline or the end of the
    text), which has no following space and would otherwise leak into the
    final output.

    Args:
        translated_text: Text whose tokens may carry ``@@`` subword marks.

    Returns:
        The text with the subword marks removed.
    """
    # MULTILINE makes ``$`` match before every newline, not only at the end
    # of the whole string.
    return re.sub(r"@@ |@@$", "", translated_text, flags=re.MULTILINE)
def translate_nos(sample_text, model):
    """Translate ``sample_text`` line by line with a CTranslate2 model.

    Args:
        sample_text: Source text; each line is translated independently.
        model: Indexable pair where ``model[0]`` is the BPE codes path handed
            to ``apply_subwording`` and ``model[1]`` is the model path handed
            to ``ctranslate2.Translator``.

    Returns:
        The translated lines, each terminated by a newline, after being
        passed through ``remove_subwording_marks``.
    """
    tokenizer_model = model[0]
    translator_model = model[1]

    # Apply subwording
    subworded_text = apply_subwording(sample_text, tokenizer_model)

    # Translate entry; the translator is built once per call and runs on CPU.
    translator = ctranslate2.Translator(translator_model, device="cpu")
    translated_lines = []
    for line in subworded_text.splitlines():
        # split() with no arguments already discards surrounding whitespace,
        # so no separate strip() is needed before tokenizing.
        batch = translator.translate_batch(
            [line.split()],
            replace_unknowns=True,
            beam_size=5,
            batch_type="examples",
        )
        # Keep only the first hypothesis of the single example in the batch.
        translated_lines.append(" ".join(batch[0].hypotheses[0]) + "\n")

    # Remove subwording; joining once avoids repeated string reallocation.
    return remove_subwording_marks("".join(translated_lines))