|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Simulate post-editing of MT output. |
|
|
|
Incrementally updates the dynamic phrase tables in the moses server. |
|
""" |
|
|
|
import argparse |
|
import os |
|
import sys |
|
import time |
|
import xmlrpclib |
|
import moses |
|
from subprocess import ( |
|
PIPE, |
|
Popen, |
|
) |
|
|
|
|
|
mserver = moses.MosesServer() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def split_args(all_args): |
|
""" |
|
Split argument list all_args into arguments specific to this script and |
|
arguments relating to the moses server. An isolated double dash acts as |
|
the separator between the two types of arguments. |
|
""" |
|
my_args = [] |
|
mo_args = [] |
|
arglist = mo_args |
|
i = 0 |
|
|
|
|
|
|
|
while i < len(all_args): |
|
|
|
|
|
if all_args[i] == "--[": |
|
arglist = my_args |
|
elif all_args[i] == "--]": |
|
arglist = mo_args |
|
elif all_args[i] == "-i" or all_args[i] == "-input-file": |
|
my_args.extend(["-i", all_args[i + 1]]) |
|
i += 1 |
|
elif all_args[i] == "-inputtype": |
|
if all_args[i + 1] != "0": |
|
|
|
errmsg = ( |
|
"FATAL ERROR: " |
|
"%s only supports plain text input at this point." |
|
% sys.argv[0]) |
|
raise Exception(errmsg) |
|
|
|
i += 1 |
|
elif all_args[i] == "-lattice-samples": |
|
|
|
|
|
|
|
|
|
|
|
errmsg = ( |
|
"FATAL ERROR: %s does not yet support lattice sampling." |
|
% sys.argv[0]) |
|
raise Exception(errmsg) |
|
|
|
elif all_args[i] == "-n-best-list": |
|
my_args.extend(["--nbest", all_args[i + 2]]) |
|
my_args.extend(["--nbest-file", all_args[i + 1]]) |
|
i += 2 |
|
|
|
elif all_args[i] == "-n-best-distinct": |
|
my_args.extend(["-u"]) |
|
|
|
else: |
|
arglist.append(all_args[i]) |
|
pass |
|
|
|
i += 1 |
|
pass |
|
return my_args, mo_args |
|
|
|
|
|
def interpret_args(my_args): |
|
""" |
|
Parse script-specific argument list. |
|
""" |
|
aparser = argparse.ArgumentParser() |
|
|
|
aparser.add_argument( |
|
"-s", "--server-cmd", default="mosesserver", dest="servercmd", |
|
help="Path to moses server command.") |
|
aparser.add_argument( |
|
"--url", help="URL of external moses server.") |
|
aparser.add_argument( |
|
"-p", "--port", type=int, default=7447, |
|
help="Port number to be used for server.") |
|
|
|
|
|
aparser.add_argument( |
|
"-i", "--input", default='-', help="source file") |
|
aparser.add_argument( |
|
"-r", "--ref", default=None, help="Reference translation.") |
|
aparser.add_argument( |
|
"-a", "--aln", default=None, help="Alignment.") |
|
aparser.add_argument( |
|
"-o", "--output", default="-", help="Output file.") |
|
aparser.add_argument( |
|
"-d", "--debug", action='store_true', help="Debug mode.") |
|
|
|
|
|
aparser.add_argument( |
|
"-A", "--with-alignment", dest="A", action='store_true', |
|
help="Include alignment in output.") |
|
aparser.add_argument( |
|
"-G", "--with-graph", type=bool, default=False, dest="G", |
|
help="Include search graph info in output.") |
|
aparser.add_argument( |
|
"-T", "--with-transopt", type=bool, default=False, dest="T", |
|
help="Include translation options info in output.") |
|
aparser.add_argument( |
|
"-F", "--report-all-factors", action="store_true", dest="F", |
|
help="Report all factors.") |
|
aparser.add_argument( |
|
"-n", "--nbest", type=int, dest="nbest", default=0, |
|
help="Size of nbest list.") |
|
aparser.add_argument( |
|
"-N", "--nbest-file", dest="nbestFile", default=0, |
|
help="Output file for nbest list.") |
|
aparser.add_argument( |
|
"-u", "--nbest-distinct", type=bool, dest="U", default=False, |
|
help="Report all factors.") |
|
|
|
return aparser.parse_args(my_args) |
|
|
|
|
|
def translate(proxy, args, line): |
|
if type(line) is unicode: |
|
param = {'text': line.strip().encode('utf8')} |
|
elif type(line) is str: |
|
param = {'text': line.strip()} |
|
else: |
|
raise Exception("Can't handle input") |
|
if args.A: |
|
param['align'] = True |
|
if args.T: |
|
param['topt'] = True |
|
if args.F: |
|
param['report-all-factors'] = True |
|
if args.nbest: |
|
param['nbest'] = int(args.nbest) |
|
param['add-score-breakdown'] = True |
|
pass |
|
if args.U: |
|
param['nbest-distinct'] = True |
|
pass |
|
attempts = 0 |
|
while attempts < 20: |
|
t1 = time.time() |
|
try: |
|
return proxy.translate(param) |
|
|
|
|
|
|
|
|
|
except xmlrpclib.Error as e: |
|
sys.stderr.flush() |
|
print >>sys.stderr, " XMLRPC error:", e |
|
print >>sys.stderr, "Input was" |
|
print >>sys.stderr, param |
|
sys.exit(1) |
|
|
|
except IOError as e: |
|
print >>sys.stderr, ( |
|
"I/O error({0}): {1}".format(e.errno, e.strerror)) |
|
time.sleep(5) |
|
|
|
except: |
|
serverstatus = mserver.process.poll() |
|
if serverstatus is None: |
|
print >>sys.stderr, ( |
|
"Connection failed after %f seconds" % (time.time() - t1)) |
|
attempts += 1 |
|
if attempts > 10: |
|
time.sleep(10) |
|
else: |
|
time.sleep(5) |
|
else: |
|
print >>sys.stderr, ( |
|
"Oopsidaisy, server exited with code %d (signal %d)" |
|
% (serverstatus / 256, serverstatus % 256)) |
|
pass |
|
pass |
|
pass |
|
raise Exception("Exception: could not reach translation server.") |
|
|
|
|
|
def read_data(fname): |
|
""" |
|
Read and return data (source, target or alignment) from file fname. |
|
""" |
|
if fname[-3:] == ".gz": |
|
process = Popen(["zcat", fname], stdout=PIPE) |
|
stdout, _ = process.communicate() |
|
foo = stdout.strip().split('\n') |
|
else: |
|
foo = [x.strip() for x in open(fname).readlines()] |
|
return foo |
|
|
|
|
|
def repack_result(idx, result): |
|
global args |
|
if args.nbest: |
|
for h in result['nbest']: |
|
fields = [idx, h['hyp'], h['fvals'], h['totalScore']] |
|
for i in xrange(len(fields)): |
|
if type(fields[i]) is unicode: |
|
fields[i] = fields[i].encode('utf-8') |
|
pass |
|
pass |
|
|
|
print >>NBestFile, "%d ||| %s ||| %s ||| %f" % tuple(fields) |
|
pass |
|
if 'align' in result: |
|
t = result['text'].split() |
|
span = '' |
|
i = 0 |
|
k = 0 |
|
for a in result['align']: |
|
k = a['tgt-start'] |
|
if k: |
|
print " ".join(t[i:k]).encode('utf8'), span, |
|
i = k |
|
span = "|%d %d|" % (a['src-start'], a['src-end']) |
|
print " ".join(t[k:]).encode('utf8'), span |
|
else: |
|
print result['text'].encode('utf8') |
|
|
|
|
|
if __name__ == "__main__": |
|
my_args, mo_args = split_args(sys.argv[1:]) |
|
|
|
|
|
|
|
|
|
global args |
|
args = interpret_args(my_args) |
|
|
|
if "-show-weights" in mo_args: |
|
|
|
|
|
devnull = open(os.devnull, "w") |
|
mo = Popen(mserver.cmd + mo_args, stdout=PIPE, stderr=devnull) |
|
print mo.communicate()[0].strip() |
|
sys.exit(0) |
|
pass |
|
|
|
if args.nbest: |
|
if args.nbestFile: |
|
NBestFile = open(args.nbestFile, "w") |
|
else: |
|
NBestFile = sys.stdout |
|
pass |
|
pass |
|
|
|
ref = None |
|
aln = None |
|
if args.ref: |
|
ref = read_data(args.ref) |
|
if args.aln: |
|
aln = read_data(args.aln) |
|
|
|
if ref and aln: |
|
try: |
|
mo_args.index("--serial") |
|
except: |
|
mo_args.append("--serial") |
|
pass |
|
pass |
|
|
|
if args.url: |
|
mserver.connect(args.url) |
|
else: |
|
mserver.start(args=mo_args, port=args.port, debug=args.debug) |
|
pass |
|
|
|
if (args.input == "-"): |
|
line = sys.stdin.readline() |
|
idx = 0 |
|
while line: |
|
result = translate(mserver.proxy, args, line) |
|
repack_result(idx, result) |
|
line = sys.stdin.readline() |
|
idx += 1 |
|
else: |
|
src = read_data(args.input) |
|
for i in xrange(len(src)): |
|
result = translate(mserver.proxy, args, src[i]) |
|
repack_result(i, result) |
|
if args.debug: |
|
print >>sys.stderr, result['text'].encode('utf-8') |
|
pass |
|
if ref and aln: |
|
result = mserver.proxy.updater({ |
|
'source': src[i], |
|
'target': ref[i], |
|
'alignment': aln[i], |
|
}) |
|
|