# Converts a SentencePiece vocabulary to the format expected by dynamic data
# (essentially converts float expected counts to "fixed precision" int
# pseudo-counts). Reads the vocabulary from stdin and writes to stdout.
import sys | |
import math | |
from onmt.constants import DefaultTokens | |
# Special tokens that are left out of the converted vocabulary.
OMIT = (
    DefaultTokens.UNK,
    DefaultTokens.BOS,
    DefaultTokens.EOS,
)
def convert(lines, *, scale=1000000, omit=None):
    """Turn vocabulary lines into ``(token, pseudo-count)`` pairs.

    Each non-blank input line must hold a token and a float score separated
    by whitespace. The score is exponentiated, multiplied by ``scale`` and
    truncated to an int; 1 is then added, so with a non-negative ``scale``
    every emitted count is at least 1.

    Args:
        lines: Iterable of text lines (e.g. an open file or ``sys.stdin``).
        scale: Multiplier applied to ``exp(score)`` before truncation.
        omit: Collection of tokens to drop from the output. ``None`` (the
            default) uses the module-level ``OMIT``.

    Yields:
        ``(token, count)`` tuples, where ``count`` is an int.

    Raises:
        ValueError: If a non-blank line does not contain both a token and a
            score, or if the score cannot be parsed as a float.
        OverflowError: If ``exp(score)`` is too large to represent.
    """
    # Resolve the default lazily so an explicit ``omit`` never needs OMIT.
    skipped = OMIT if omit is None else omit
    for line in lines:
        fields = line.rstrip('\n').split(None, 1)
        if not fields:
            # Blank or whitespace-only line (e.g. a trailing empty line):
            # nothing to convert, and not worth aborting the whole run.
            continue
        if len(fields) != 2:
            raise ValueError(
                'expected "<token> <score>", got: {!r}'.format(line))
        token, score = fields
        if token in skipped:
            continue
        # Truncate toward zero, then add 1 so no kept token ends up at 0.
        yield token, int(math.exp(float(score)) * scale) + 1
if __name__ == '__main__':
    # Stream stdin to stdout: one tab-separated row per pair that
    # convert() yields, in the order the pair's elements are produced.
    for entry in convert(sys.stdin):
        print('{}\t{}'.format(*entry))