File size: 616 Bytes
158b61b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# Converts a SentencePiece vocabulary to the format expected by dynamic data
# (essentially exponentiates the float log-domain scores and converts them to
# "fixed precision" int pseudo counts).
import sys
import math
from onmt.constants import DefaultTokens

# Tokens that convert() drops from its output: the UNK, BOS and EOS entries of
# DefaultTokens.
OMIT = (DefaultTokens.UNK, DefaultTokens.BOS, DefaultTokens.EOS)


def convert(lines):
    """Turn SentencePiece vocabulary lines into ``(token, pseudo_count)`` pairs.

    Each non-blank line must contain a token followed by a float score,
    separated by whitespace. The score is passed through ``math.exp``, scaled
    by 1,000,000, truncated to an int and incremented by one, so every
    yielded count is an integer >= 1.

    Blank or whitespace-only lines are skipped, as are tokens listed in
    ``OMIT``.

    Args:
        lines: iterable of text lines (e.g. an open file or ``sys.stdin``).

    Yields:
        ``(token, count)`` tuples, in input order.

    Raises:
        ValueError: if a non-blank line has only one field, or its score
            cannot be parsed as a float.
        OverflowError: if the score is too large for ``math.exp``.
    """
    for line in lines:
        fields = line.rstrip('\n').split(None, 1)
        if not fields:
            # Blank or whitespace-only line (e.g. a trailing empty line at
            # the end of the file): nothing to convert, so skip it instead
            # of failing on the unpack below.
            continue
        token, score = fields
        if token in OMIT:
            continue
        # exp() maps the log-domain score back to a linear value; scaling by
        # 1e6 before truncation keeps six decimal digits, and the +1 ensures
        # no kept token ends up with a zero count.
        yield token, int(math.exp(float(score)) * 1000000) + 1


if __name__ == '__main__':
    # Stream the vocabulary from stdin and write one "token<TAB>count" row
    # to stdout for every entry that convert() keeps.
    for token, count in convert(sys.stdin):
        row = '{}\t{}'.format(token, count)
        print(row)