JJFrancisco committed on
Commit
85387ea
1 Parent(s): 51b2112

Create translate.py

Browse files
Files changed (1) hide show
  1. translate.py +36 -0
translate.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctranslate2
2
+ from subword_nmt.apply_bpe import BPE
3
+ import codecs
4
+ import re
5
+
6
def apply_subwording(sample_text, model_code_path):
    """Segment text into BPE subword units using subword-nmt.

    Args:
        sample_text: Text to segment. It is split with ``str.splitlines`` and
            every line is segmented independently.
        model_code_path: Path to the BPE codes file, read as UTF-8.

    Returns:
        The segmented lines concatenated into one string, each line followed
        by a newline. An input with no lines yields an empty string.
    """
    # APPLY BPE WITH SUBWORD-NMT
    # The codes file is only needed while BPE is being constructed (its
    # constructor reads the merge table), so close the handle right after
    # instead of leaking it on every call.
    with codecs.open(model_code_path, encoding='utf-8') as codes_file:
        bpe = BPE(codes_file)
    return "".join(
        bpe.process_line(line) + "\n" for line in sample_text.splitlines()
    )
15
+
16
def remove_subwording_marks(translated_text):
    """Undo BPE segmentation by removing the ``@@`` continuation markers.

    Args:
        translated_text: Text whose subword tokens are joined by ``"@@ "``;
            it may contain several newline-separated lines.

    Returns:
        The text with every ``"@@ "`` removed, and with any ``"@@"`` left
        dangling at the end of a line removed as well.
    """
    # "@@ " joins a subword to the next one. A line that *ends* in a
    # continuation token has no trailing space, so "@@" directly before a
    # line end is matched too (re.MULTILINE makes "$" match at each newline).
    return re.sub(r"@@(?: |$)", "", translated_text, flags=re.MULTILINE)
18
+
19
def translate_nos(sample_text, model):
    """Translate text line by line with a CTranslate2 model on CPU.

    Args:
        sample_text: Text to translate.
        model: Indexable pair. Item 0 is handed to ``apply_subwording`` as the
            BPE codes path; item 1 is handed to ``ctranslate2.Translator``.

    Returns:
        The first hypothesis for every subworded line, tokens joined by
        spaces and each line followed by a newline, after the result has
        been passed through ``remove_subwording_marks``.
    """
    bpe_codes_path = model[0]
    ct2_model_path = model[1]

    # Segment the input into subword units before translation.
    segmented = apply_subwording(sample_text, bpe_codes_path)

    translator = ctranslate2.Translator(ct2_model_path, device="cpu")

    def _translate_line(raw_line):
        # One translate_batch call per line, holding a single tokenized example.
        tokens = raw_line.strip().split()
        batch_result = translator.translate_batch(
            [tokens], replace_unknowns=True, beam_size=5, batch_type='examples'
        )
        return ' '.join(batch_result[0].hypotheses[0]) + "\n"

    translated = "".join(
        _translate_line(raw_line) for raw_line in segmented.splitlines()
    )

    # Strip the subword markers from the translated text.
    return remove_subwording_marks(translated)