File size: 1,206 Bytes
158b61b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import ctranslate2
from mosestokenizer import MosesSentenceSplitter, MosesTokenizer

from indicnlp.tokenize import sentence_tokenize, indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

import codecs
from subword_nmt.apply_bpe import BPE

# Batch translation script: lowercases, tokenizes and BPE-encodes every line
# of INP, translates the batch with a CTranslate2 model, strips the BPE
# markers and writes one hypothesis per line to OUT.
# (Presumably English -> Hindi, judging by the `en-hi` model directory and the
# `.hi` output name -- confirm against the deployed model.)

## Tokenize
tokenize = MosesTokenizer('en')
## BPE
# BPE loads the merge table inside its constructor, so the codes file can be
# closed as soon as the object has been built.
with codecs.open("en-hi/bpe-codes/codes.en", encoding='utf-8') as codes:
    bpe = BPE(codes)
## Translate
translator = ctranslate2.Translator("en-hi/model_deploy/",
    # compute_type="int8"
    )

INP="input-files/flores/eng.devtest"
OUT="output-translation/flores/test.hi"

# Read the source sentences. The encoding is pinned to UTF-8 (matching the BPE
# codes file) instead of relying on the platform default, and the handle is
# released as soon as the lines are in memory.
with open(INP, 'r', encoding='utf-8') as inp_file:
    inp_lines = [line.strip("\n") for line in inp_file]

# Lowercase
inp_lines = [line.lower() for line in inp_lines]

# Tokenize
inp_lines = [' '.join(tokenize(line)) for line in inp_lines]

# Apply BPE
# translate_batch expects each sentence as a list of sub-word tokens.
inp_lines = [bpe.process_line(line).split(" ") for line in inp_lines]

# Translate
out_lines = translator.translate_batch(inp_lines, beam_size=5, max_batch_size=16)

# Remove BPE
# Only the best hypothesis (index 0) of each result is kept. A space is
# appended before the replace so that a "@@" marker on the very last token is
# removed as well; the resulting trailing space is kept in the output line.
out_lines = [(' '.join(line.hypotheses[0]) + " ").replace("@@ ", "") for line in out_lines]

# The output file is opened only after translation has succeeded, so a failed
# run no longer truncates an existing result file. It is written as UTF-8 so
# that non-ASCII output does not depend on the locale's default encoding, and
# the `with` block guarantees the handle is flushed and closed.
with open(OUT, 'w', encoding='utf-8') as out_file:
    out_file.writelines(line + "\n" for line in out_lines)