File size: 5,761 Bytes
158b61b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 |
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich <sennrich [AT] cl.uzh.ch>
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
# This script creates tables that store phrase pair frequencies rather than
# probabilities.
#
# These count tables can be used for a delayed, online computation of the
# original phrase translation features.
#
# The benefit is that models can be combined quickly, with the same results
# as if we trained a model on the concatenation of all data (excepting
# differences in word alignment).
#
# Also, each model can be given a weight, which is applied to all frequencies
# of the model for the combination.
# Note: the input phrase table must have alignment information;
# it must be unsmoothed;
# additionally, the phrase table type PhraseDictionaryMultiModelCounts
# requires the lexical counts files lex.counts.e2f and lex.counts.f2e
# (obtained by using the option --write-lexical-counts in
# train-model.perl)
# The results may differ from training on the concatenation of all data due
# to differences in word alignment, and rounding errors.
from __future__ import unicode_literals
import sys
import os
import gzip
from tempfile import NamedTemporaryFile
from subprocess import Popen, PIPE
def usage_message(program):
    """Return the usage text printed when the argument count is wrong.

    Args:
        program: the name the script was invoked as (normally sys.argv[0]).
    """
    return (
        'Usage: ' +
        program + " in_file out_path [prune_count]\n"
        "This script will create the files out_path/count-table.gz and "
        "out_path/count-table-target.gz\n")


# Validate the command line only when executed as a script, so that importing
# this module never terminates the importing interpreter.
if __name__ == '__main__' and not 3 <= len(sys.argv) <= 4:
    sys.stderr.write(usage_message(sys.argv[0]))
    # A usage error is a failure: report it with a non-zero status.  sys.exit
    # is used because the bare exit() helper is only installed by the site
    # module and is meant for interactive sessions.
    sys.exit(1)
def handle_file(filename, action, fileobj=None, mode='r'):
    """Open or close a stream that may be stdin/stdout, a plain or a gzip file.

    Args:
        filename: path of the file, or '-' for the standard stream.
        action: 'open' to obtain a file object, 'close' to release one.
            Any other value is ignored.
        fileobj: the object to close (only used with action='close').
        mode: open mode; the default 'r' is promoted to 'rb' so that every
            input source yields bytes.

    Returns:
        The opened file object for action='open', otherwise None.

    Raises:
        SystemExit: with status 1 when a file requested for reading exists
            neither under `filename` nor under `filename + '.gz'`.
    """
    if action == 'open':
        if mode == 'r':
            mode = 'rb'

        # Fall back to the gzipped variant of a missing input file.
        if mode == 'rb' and filename != '-' and not os.path.exists(filename):
            if os.path.exists(filename + '.gz'):
                filename = filename + '.gz'
            else:
                sys.stderr.write(
                    "Error: unable to open file. " +
                    filename + " - aborting.\n")
                # Non-zero status: this is an error, not a clean shutdown.
                sys.exit(1)

        if filename.endswith('.gz'):
            return gzip.open(filename, mode)

        if filename == '-':
            # Read modes map to stdin, everything else to stdout.
            stream = sys.stdin if mode.startswith('r') else sys.stdout
            if 'b' in mode:
                # Python 3 wraps the standard streams in text layers; the
                # bytes interface lives on `.buffer`.  Python 2 streams are
                # already byte oriented and have no such attribute.
                return getattr(stream, 'buffer', stream)
            return stream

        return open(filename, mode)

    # The standard streams are never closed; a missing fileobj is a no-op.
    if action == 'close' and filename != '-' and fileobj is not None:
        fileobj.close()
    return None
def sort_and_uniq(infile, outfile):
    """Sort `infile`, drop repeated lines and gzip the result into `outfile`.

    Runs the external pipeline ``sort infile | uniq | gzip -c`` in the C
    locale and stores its output in `outfile`.

    Args:
        infile: path of the file to sort.
        outfile: path of the compressed file to create (overwritten).

    Raises:
        RuntimeError: if any stage of the pipeline exits with a non-zero
            status.
    """
    cmd = ['sort', infile]
    sys.stderr.write(
        "Executing: LC_ALL=C " +
        ' '.join(cmd) +
        ' | uniq | gzip -c > ' + outfile + '\n')

    # Start from the parent's environment so PATH and TMPDIR stay available
    # to the child processes, and only force the locale.
    env = dict(os.environ, LC_ALL='C')

    # The gzip process produces the compressed bytes itself, so it must write
    # into a plain binary file; a GzipFile wrapper would add its own header
    # and trailer around the child's output.
    with open(outfile, 'wb') as fobj:
        p_sort = Popen(cmd, env=env, stdout=PIPE)
        # uniq runs in the C locale as well, so that lines are compared
        # byte by byte, consistently with the sort order.
        p_uniq = Popen(['uniq'], env=env, stdin=p_sort.stdout, stdout=PIPE)
        # Drop the parent's copy of each pipe end so that an upstream stage
        # receives SIGPIPE if its consumer exits early.
        p_sort.stdout.close()
        p_compress = Popen(['gzip', '-c'], stdin=p_uniq.stdout, stdout=fobj)
        p_uniq.stdout.close()
        # Reap every child, last stage first.
        statuses = [p.wait() for p in (p_compress, p_uniq, p_sort)]

    if any(statuses):
        raise RuntimeError(
            'sort | uniq | gzip pipeline failed '
            '(gzip, uniq, sort exit statuses: %r)' % (statuses,))
def create_count_lines(fobj, countobj, countobj_target, prune=0):
    """Rewrite phrase-table lines so the score column holds three counts.

    Each input line is split on b' ||| '.  The third column is replaced by
    "<fst> <ft> <fs>", where ft and fs are the first and second token of the
    fifth column and fst is its third token.  When that third token is
    missing, fst is computed as round(first score * ft).  The names suggest
    target, source and joint phrase-pair counts -- presumably; confirm
    against the phrase-table format.

    Args:
        fobj: iterable of bytes lines.
        countobj: binary output receiving the rewritten lines; closed on
            return.
        countobj_target: binary output receiving one "<target> ||| X ||| <ft>"
            line per input line; closed on return.
        prune: if non-zero, lines are buffered per consecutive run of equal
            first column and handed to write_batch with this limit.
    """
    delimiter = b' ||| '
    previous_source = b""
    pending = set()
    position = 0

    for line_number, raw_line in enumerate(fobj):
        # Progress indicator on stderr, one dot per 100000 lines.
        if line_number % 100000 == 0:
            sys.stderr.write('.')

        fields = raw_line.split(delimiter)
        source_phrase = fields[0]
        scores = fields[2].split()
        counts = fields[4].split()

        fs = counts[1]
        ft = counts[0]
        if len(counts) > 2:
            fst = counts[2]
        else:
            # No third count available: derive it from the first score.
            fst = str(int(round(float(scores[0]) * float(ft)))).encode()

        fields[2] = b' '.join((fst, ft, fs))
        rewritten = delimiter.join(fields)

        if not prune:
            countobj.write(rewritten)
        else:
            if source_phrase != previous_source:
                # A new source phrase starts: flush the finished group.
                write_batch(pending, countobj, prune)
                previous_source = source_phrase
                pending = set()
                position = 0
            pending.add((float(fst), position, rewritten))
            position += 1

        # Target count file.
        # Plain bytes concatenation on purpose: string formatting here may
        # break Python 3 compatibility.
        countobj_target.write(
            delimiter.join((fields[1], b'X', ft)) + b' ||| |||\n')

    if prune:
        # Flush the group belonging to the last source phrase.
        write_batch(pending, countobj, prune)

    countobj.close()
    countobj_target.close()
def write_batch(store_lines, outfile, prune):
    """Write the `prune` highest-ranked buffered lines in their input order.

    Args:
        store_lines: collection of (score, original_pos, line) tuples.
        outfile: object whose write() receives each kept line.
        prune: number of entries to keep.  Entries are ranked by descending
            tuple order, so ties on score favour the larger original_pos.
    """
    ranked = list(store_lines)
    ranked.sort(reverse=True)
    survivors = ranked[:prune]
    # Restore the order in which the lines were read.
    survivors.sort(key=lambda entry: entry[1])
    for entry in survivors:
        outfile.write(entry[2])
if __name__ == '__main__':
    # Script entry point: read the phrase table given as first argument and
    # write count-table.gz and count-table-target.gz into the directory given
    # as second argument.  The optional third argument is the prune limit
    # passed to create_count_lines (0 disables pruning).
    prune = int(sys.argv[3]) if len(sys.argv) == 4 else 0

    fileobj = handle_file(sys.argv[1], 'open')
    out_path = sys.argv[2]

    count_table_file = gzip.open(
        os.path.join(out_path, 'count-table.gz'), 'w')
    count_table_target_file = os.path.join(out_path, 'count-table-target.gz')
    # Target counts are first written unsorted to a temporary file and then
    # sorted, de-duplicated and compressed into their final location.
    count_table_target_file_temp = NamedTemporaryFile(delete=False)

    try:
        sys.stderr.write(
            "Creating temporary file for unsorted target counts file: " +
            count_table_target_file_temp.name + '\n')

        create_count_lines(
            fileobj, count_table_file, count_table_target_file_temp, prune)
        count_table_target_file_temp.close()

        sys.stderr.write(
            "Finished writing, "
            "now re-sorting and compressing target count file.\n")

        sort_and_uniq(
            count_table_target_file_temp.name, count_table_target_file)
    finally:
        # One cleanup path for success and failure alike.  The close() calls
        # are harmless on handles that are already closed.
        try:
            count_table_target_file_temp.close()
            count_table_file.close()
            handle_file(sys.argv[1], 'close', fileobj)
        finally:
            # delete=False above means the temporary file must be removed
            # explicitly, exactly once.
            os.remove(count_table_target_file_temp.name)

    sys.stderr.write('Done\n')
|