"""Translate text with a CTranslate2 model after subword-nmt BPE segmentation."""

import codecs
import re

import ctranslate2
from subword_nmt.apply_bpe import BPE

# Matches the BPE continuation marker either followed by a space (a word that
# was split into pieces) or dangling at the end of a line (a hypothesis that
# ended on a continuation piece). Mirrors the subword-nmt recipe
# ``sed -r 's/(@@ )|(@@ ?$)//g'``.
_SUBWORD_MARK_RE = re.compile(r"@@ |@@$", flags=re.MULTILINE)


def apply_subwording(sample_text: str, model_code_path: str) -> str:
    """Segment *sample_text* into BPE subword units, line by line.

    Args:
        sample_text: Text to segment; it is split with ``str.splitlines``.
        model_code_path: Path to the UTF-8 encoded BPE codes file that is
            handed to ``subword_nmt.apply_bpe.BPE``.

    Returns:
        The segmented lines, each one terminated by ``"\\n"``. An empty
        input yields an empty string.
    """
    # The codes are only needed while the BPE object is being built, so the
    # file is closed as soon as construction finishes instead of being leaked.
    with codecs.open(model_code_path, encoding="utf-8") as codes_file:
        bpe = BPE(codes_file)

    return "".join(
        bpe.process_line(line) + "\n" for line in sample_text.splitlines()
    )


def remove_subwording_marks(translated_text: str) -> str:
    """Merge BPE pieces back into words by deleting the ``@@`` markers.

    Args:
        translated_text: Text whose subword pieces are joined by ``"@@ "``.

    Returns:
        *translated_text* with every ``"@@ "`` removed, as well as any
        ``"@@"`` left dangling at the end of a line.
    """
    return _SUBWORD_MARK_RE.sub("", translated_text)


def translate_nos(sample_text, model):
    """Translate *sample_text* line by line on the CPU.

    Args:
        sample_text: Text to translate; every line is translated on its own.
        model: Indexable pair where ``model[0]`` is the path of the BPE codes
            file and ``model[1]`` is the model path passed to
            ``ctranslate2.Translator``.

    Returns:
        The translated text with subword markers removed. Every translated
        line is terminated by ``"\\n"``.
    """
    tokenizer_model = model[0]
    translator_model = model[1]

    # Apply subwording
    subwording_text = apply_subwording(sample_text, tokenizer_model)

    # Translate entry
    translator = ctranslate2.Translator(translator_model, device="cpu")
    translated_lines = []
    for line in subwording_text.splitlines():
        # translate_batch expects pre-tokenized input: one token list per
        # example. A single example is sent per call, as before.
        results = translator.translate_batch(
            [line.split()],
            replace_unknowns=True,
            beam_size=5,
            batch_type="examples",
        )
        # Keep only the best-scoring hypothesis of the single example.
        translated_lines.append(" ".join(results[0].hypotheses[0]) + "\n")

    # Remove subwording
    return remove_subwording_marks("".join(translated_lines))